Bladeren bron

Unified naming scheme & version bump 1.0.1

The Zeit servers no longer send the Content-Disposition header, which was
previously used to save the files under human-readable names. The new
version instead builds the filename from the release date.
Christoph Stelz 2 jaren geleden
bovenliggende
commit
c211ad8241
2 gewijzigde bestanden met toevoegingen van 10 en 9 verwijderingen
  1. 1 1
      setup.py
  2. 9 8
      zeitdownload.py

+ 1 - 1
setup.py

@@ -3,7 +3,7 @@ from setuptools import setup
 
 setup(
         name = "zeitdownload",
-        version = "1.0.0",
+        version = "1.0.1",
         author = "Christoph Stelz",
         author_email = "mail+python@ch-st.de",
         description = "Download the digital version of the newspaper \"Die Zeit\"",

+ 9 - 8
zeitdownload.py

@@ -3,6 +3,7 @@ import requests
 import lxml.html
 import cgi
 import sys
+import re
 from argparse import ArgumentParser
 
 parser = ArgumentParser(description='Download "Die Zeit" in multiple formats from the premium subscription service')
@@ -32,6 +33,7 @@ if formats == None:
 
 RELEASE_XPATH = '//p[@class="epaper-info-release-date"]'
 DOWNLOAD_XPATH = "//a[contains(text(), '{}')]"
+DATE_REGEX = r"^\d{2}\.\d{2}\.\d{4}$"
 
 s = requests.Session()
 response = s.post('https://meine.zeit.de/anmelden', {
@@ -57,8 +59,11 @@ response = s.get('https://epaper.zeit.de/abo/diezeit')
 document = lxml.html.fromstring(response.text)
 release_dates = list(map(lambda el: el.text,
         document.xpath(RELEASE_XPATH)))
-
 latest_release = release_dates[0]
+
+if not re.match(DATE_REGEX, latest_release):
+    print(f"Scraping broken, {latest_release} not valid date.")
+
 response = s.get(f"https://epaper.zeit.de/abo/diezeit/{latest_release}")
 document = lxml.html.fromstring(response.text)
 
@@ -72,13 +77,9 @@ for fmt in formats:
             if not link.startswith('https') else link)
 
     # Get filename from Content-Disposition header
-    filename = ''
-    if 'Content-Disposition' in response.headers.keys():
-        value, params = cgi.parse_header(response.headers['Content-Disposition'])
-        filename = params['filename']
-    else:
-        filename = link.split('/')[-1]
+    date = "-".join(latest_release.split(".")[::-1])
+    filename = 'die_zeit_' + date + "." + fmt
 
     with open(filename, 'wb') as file:
         file.write(response.content)
-    print(f"Downloaded {fmt}.")
+    print(f"Downloaded {fmt} to {filename}")