123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158 |
- #!/usr/bin/env python3
- import requests
- import lxml.html
- import sys
- import re
- import os.path
- import hashlib
- from argparse import ArgumentParser
# XPath selecting the elements whose text is a release date.
RELEASE_XPATH = '//p[@class="epaper-info-release-date"]'
# XPath template for a download link; '{}' is filled with the link's label.
DOWNLOAD_XPATH = "//a[contains(text(), '{}')]"
# Textual shape of a release date: dd.mm.yyyy (format check only, the
# calendar validity of the date is not verified here).
DATE_REGEX = r"^\d{2}\.\d{2}\.\d{4}$"

parser = ArgumentParser(description='Download "Die Zeit" in multiple formats from the premium subscription service')
parser.add_argument('--email', type=str, required=True,
                    help='Email you used for the digital subscription signup')
parser.add_argument('--password', type=str, required=True,
                    help='Corresponding password')
parser.add_argument('--reload', default=False, action='store_true',
                    help='Download file even though it already exists')
# --pdf and --epub both append to the same 'formats' list; when neither is
# given, 'formats' stays None.
parser.add_argument('--pdf', dest='formats',
                    action='append_const', const='pdf',
                    help='Download full-page PDF')
parser.add_argument('--epub', dest='formats',
                    action='append_const', const='epub',
                    help='Download EPUB file for E-Readers')

# A release is picked either by explicit date or by its distance from the
# current release, never both.
group = parser.add_mutually_exclusive_group()
group.add_argument('--date', type=str,
                   help='Download file from specified date (dd.mm.yyyy)')
# range(0, 7) accepts 0..6, so the help text names 6 as the upper bound.
group.add_argument('--num-release', type=int, choices=range(0, 7),
                   help='Download one of the past releases by numbers '
                        'from the current one; '
                        '0 is the current release, 1 the previous one, '
                        'up until 6')

args = parser.parse_args()
email = args.email
password = args.password
forcereload = args.reload
formats = args.formats
release_date = args.date
num_release = args.num_release

# Reject a --date value that does not look like dd.mm.yyyy (exit code 5).
if release_date and not re.match(DATE_REGEX, release_date):
    print(f"{release_date} is not a valid date.")
    sys.exit(5)

# Nothing was requested: leave successfully before any network access.
if formats is None:
    print("No formats specified, all done.")
    sys.exit(0)
# Src: https://stackoverflow.com/questions/22058048/hashing-a-file-in-python#22058673
def md5sum(path):
    """Return the hexadecimal MD5 digest of the file at *path*.

    The file is read in 4 MiB pieces, so it is never loaded into memory
    as a whole.
    """
    chunk_size = 4 * 1024 * 1024  # 4 MiB
    digest = hashlib.md5()
    with open(path, 'rb') as stream:
        # iter() with a sentinel stops as soon as read() yields b'' (EOF).
        for chunk in iter(lambda: stream.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()
def download_file(format, filename, req_session, doc):
    """Fetch one file of a release in the requested format.

    Args:
        format: Key into ``format_btns`` (e.g. 'pdf' or 'epub') naming the
            download link to look for.
        filename: Local path of the target file. It is only inspected here
            (existence check and MD5 for the conditional request); writing
            the file is left to the caller.
        req_session: Session object used to issue the GET request.
        doc: Parsed HTML document of the release page that contains the
            download links.

    Returns:
        The downloaded file as ``bytes`` on HTTP 200. Otherwise an ``int``:
        -1 if no matching download link was found in ``doc``,
        -2 if a PDF already exists locally and reloading was not forced,
        or the HTTP status code of the response (304 when the server
        reports the local file as unchanged).
    """
    link_elements = doc.xpath(DOWNLOAD_XPATH.format(format_btns[format]))
    if not link_elements:
        return -1
    link = link_elements[0].attrib['href']

    request_headers = {}
    if os.path.exists(filename) and not forcereload:
        # Somehow E-Tags do not work for PDF
        if format == 'pdf':
            return -2
        # Send the local file's MD5 as an ETag so the server can answer 304.
        request_headers["If-None-Match"] = '"' + md5sum(filename) + '"'

    # Links that do not start with 'https' are treated as site-relative.
    url = link if link.startswith('https') else "https://epaper.zeit.de" + link
    response = req_session.get(url, headers=request_headers)
    if response.status_code != 200:
        # Hand back the numeric status (including 304), never the response
        # object itself, so the caller can tell failures from file content.
        return response.status_code
    return response.content
# --- Login -----------------------------------------------------------------
s = requests.Session()
headers = {
    'Origin': 'https://meine.zeit.de',
}
# Request the login page first; the CSRF token posted below is read from the
# session cookies afterwards.
login_page = s.get('https://meine.zeit.de/anmelden?url=https%3A%2F%2Fwww.zeit.de%2Findex&entry_service=sonstige')
if 'csrf_token' not in s.cookies:
    # Without the token the login form cannot be submitted; report it
    # instead of failing with a bare KeyError.
    print("Could not obtain CSRF token from login page.", file=sys.stderr)
    sys.exit(-1)
response = s.post('https://meine.zeit.de/anmelden', {
    'entry_service': 'sonstige',
    'product_id': 'sonstige',
    'return_url': 'https://www.zeit.de/index',
    'email': email,
    'pass': password,
    'csrf_token': s.cookies['csrf_token']
}, headers=headers)
# The presence of this cookie is used as the sign of a successful login.
if 'zeit_sso_201501' not in s.cookies:
    print("Invalid login.")
    sys.exit(-1)

# Label text of the download link for each supported format.
format_btns = {
    'pdf': 'GESAMT-PDF LADEN',
    'epub': 'EPUB FÜR E-READER LADEN'
}

# Figure out which date to use if no date was supplied directly
if not release_date:
    num = num_release if num_release else 0
    response = s.get('https://epaper.zeit.de/abo/diezeit')
    document = lxml.html.fromstring(response.text)
    latest_releases = [el.text for el in document.xpath(RELEASE_XPATH)]
    if num >= len(latest_releases):
        print(f"Scraping broken, release number {num} not found.",
              file=sys.stderr)
        sys.exit(7)
    scraped_date = latest_releases[num]
    # el.text may be None; treat that like any other non-date value. Stop
    # here, since continuing would only request a bogus release URL.
    if not scraped_date or not re.match(DATE_REGEX, scraped_date):
        print(f"Scraping broken, {scraped_date} not valid date.")
        sys.exit(7)
    release_date = scraped_date

# Get buttons for format downloads
# This is done separated from the download_file function to
# avoid an overhead through multiple downloads
response = s.get(f"https://epaper.zeit.de/abo/diezeit/{release_date}")
# Ending up on the overview URL is taken to mean there is no such release.
if response.url == 'https://epaper.zeit.de/abo/diezeit':
    print(f"No release published on {release_date}")
    sys.exit(6)
document = lxml.html.fromstring(response.text)

for fmt in formats:
    # Local file name: die_zeit_<dd.mm.yyyy>.<format>
    filename = 'die_zeit_' + release_date + "." + fmt

    print(f"Downloading {fmt}...")
    response = download_file(fmt, filename, s, document)
    if response == -1:
        print(f"Skipping {fmt} download, scraping broken")
        continue
    elif response == -2:
        print(f"File {filename} already exists. If you want to download anyway, use --reload")
        continue
    elif response == 304:
        print(" => Skipped, file did not change")
        continue
    elif not isinstance(response, bytes):
        # Anything that is not file content is a failed request; it may be
        # a plain status code or an object carrying a status_code attribute.
        status = getattr(response, 'status_code', response)
        print(f"Request returned status {status}", file=sys.stderr)
        continue

    # Everything is clear, function returns actual file
    with open(filename, 'wb') as out:
        out.write(response)
    print(f"Downloaded {fmt} to {filename}")
|