há 3 anos atrás · 04736ad3e9
--- a/zeitdownload.py
+++ b/zeitdownload.py
@@ -0,0 +1,83 @@
 
				+import requests
			
 
				+import lxml.html
			
 
				+import cgi
			
 
				+import sys
			
 
				+from argparse import ArgumentParser
			
 
				+
			
 
				+parser = ArgumentParser(description='Download "Die Zeit" in multiple formats from the premium subscription service')
			
 
				+parser.add_argument('--email', type=str, required=True,
			
 
				+        help='Email you used for the digital subscription signup')
			
 
				+parser.add_argument('--password', type=str, required=True,
			
 
				+        help='Corresponding password')
			
 
				+parser.add_argument('--pdf', dest='formats',
			
 
				+        action='append_const', const='pdf',
			
 
				+        help='Download full-page PDF')
			
 
				+parser.add_argument('--epub', dest='formats',
			
 
				+        action='append_const', const='epub',
			
 
				+        help='Download EPUB file for E-Readers')
			
 
				+parser.add_argument('--mobi', dest='formats',
			
 
				+        action='append_const', const='mobi',
			
 
				+        help='Download MOBI file for Kindles')
			
 
				+
			
 
				+args = parser.parse_args()
			
 
				+
			
 
				+email = args.email
			
 
				+password = args.password
			
 
				+formats = args.formats
			
 
				+
			
 
				+if formats == None:
			
 
				+    print("No formats specified, all done.")
			
 
				+    sys.exit(0)
			
 
				+
			
 
				+RELEASE_XPATH = '//p[@class="epaper-info-release-date"]'
			
 
				+DOWNLOAD_XPATH = "//a[contains(text(), '{}')]"
			
 
				+
			
 
				+s = requests.Session()
			
 
				+response = s.post('https://meine.zeit.de/anmelden', {
			
 
				+    'entry_service': 'sonstige',
			
 
				+    'product_id': 'sonstige',
			
 
				+    'return_url': 'https://www.zeit.de/index',
			
 
				+    'email': email,
			
 
				+    'pass': password,
			
 
				+    'permanent': 'on'
			
 
				+})
			
 
				+if not 'zeit_sso_201501' in s.cookies:
			
 
				+    print("Invalid login.")
			
 
				+    sys.exit(-1)
			
 
				+
			
 
				+format_btns = {
			
 
				+    'pdf': 'GESAMT-PDF LADEN',
			
 
				+    'epub': 'EPUB FÜR E-READER LADEN',
			
 
				+    'mobi': 'MOBI FÜR KINDLE LADEN'
			
 
				+}
			
 
				+
			
 
				+response = s.get('https://epaper.zeit.de/abo/diezeit')
			
 
				+
			
 
				+document = lxml.html.fromstring(response.text)
			
 
				+release_dates = list(map(lambda el: el.text,
			
 
				+        document.xpath(RELEASE_XPATH)))
			
 
				+
			
 
				+latest_release = release_dates[0]
			
 
				+response = s.get(f"https://epaper.zeit.de/abo/diezeit/{latest_release}")
			
 
				+document = lxml.html.fromstring(response.text)
			
 
				+
			
 
				+for fmt in formats:
			
 
				+    link_elements = document.xpath(DOWNLOAD_XPATH.format(format_btns[fmt]))
			
 
				+    if len(link_elements) < 1:
			
 
				+        print(f"Skipping {fmt} download, scraping broken")
			
 
				+    link = link_elements[0].attrib['href']
			
 
				+    print(f"Downloading {fmt} from {link}...")
			
 
				+    response = s.get("https://epaper.zeit.de" + link 
			
 
				+            if not link.startswith('https') else link)
			
 
				+
			
 
				+    # Get filename from Content-Disposition header
			
 
				+    filename = ''
			
 
				+    if 'Content-Disposition' in response.headers.keys():
			
 
				+        value, params = cgi.parse_header(response.headers['Content-Disposition'])
			
 
				+        filename = params['filename']
			
 
				+    else:
			
 
				+        filename = link.split('/')[-1]
			
 
				+
			
 
				+    with open(filename, 'wb') as file:
			
 
				+        file.write(response.content)
			
 
				+    print(f"Downloaded {fmt}.")