zeitdownload.py 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125
  1. #!/usr/bin/env python3
  2. import requests
  3. import lxml.html
  4. import cgi
  5. import sys
  6. import re
  7. import os.path
  8. import hashlib
  9. from argparse import ArgumentParser
  10. parser = ArgumentParser(description='Download "Die Zeit" in multiple formats from the premium subscription service')
  11. parser.add_argument('--email', type=str, required=True,
  12. help='Email you used for the digital subscription signup')
  13. parser.add_argument('--password', type=str, required=True,
  14. help='Corresponding password')
  15. parser.add_argument('--reload', default=False, action='store_true',
  16. help='Download file even though it already exists')
  17. parser.add_argument('--pdf', dest='formats',
  18. action='append_const', const='pdf',
  19. help='Download full-page PDF')
  20. parser.add_argument('--epub', dest='formats',
  21. action='append_const', const='epub',
  22. help='Download EPUB file for E-Readers')
  23. args = parser.parse_args()
  24. email = args.email
  25. password = args.password
  26. forcereload = args.reload
  27. formats = args.formats
  28. if formats == None:
  29. print("No formats specified, all done.")
  30. sys.exit(0)
  31. # Src: https://stackoverflow.com/questions/22058048/hashing-a-file-in-python#22058673
  32. def md5sum(path):
  33. BUF_SIZE = 4 * 1024 * 1024 # 4 MiB
  34. md5 = hashlib.md5()
  35. with open(path, 'rb') as f:
  36. while True:
  37. data = f.read(BUF_SIZE)
  38. if not data:
  39. break
  40. md5.update(data)
  41. return md5.hexdigest()
  42. RELEASE_XPATH = '//p[@class="epaper-info-release-date"]'
  43. DOWNLOAD_XPATH = "//a[contains(text(), '{}')]"
  44. DATE_REGEX = r"^\d{2}\.\d{2}\.\d{4}$"
  45. s = requests.Session()
  46. headers = {
  47. 'Origin': 'https://meine.zeit.de',
  48. }
  49. login_page = s.get('https://meine.zeit.de/anmelden?url=https%3A%2F%2Fwww.zeit.de%2Findex&entry_service=sonstige')
  50. response = s.post('https://meine.zeit.de/anmelden', {
  51. 'entry_service': 'sonstige',
  52. 'product_id': 'sonstige',
  53. 'return_url': 'https://www.zeit.de/index',
  54. 'email': email,
  55. 'pass': password,
  56. 'csrf_token': s.cookies['csrf_token']
  57. }, headers=headers)
  58. if not 'zeit_sso_201501' in s.cookies:
  59. print("Invalid login.")
  60. sys.exit(-1)
  61. format_btns = {
  62. 'pdf': 'GESAMT-PDF LADEN',
  63. 'epub': 'EPUB FÜR E-READER LADEN'
  64. }
  65. response = s.get('https://epaper.zeit.de/abo/diezeit')
  66. document = lxml.html.fromstring(response.text)
  67. release_dates = list(map(lambda el: el.text,
  68. document.xpath(RELEASE_XPATH)))
  69. latest_release = release_dates[0]
  70. if not re.match(DATE_REGEX, latest_release):
  71. print(f"Scraping broken, {latest_release} not valid date.")
  72. response = s.get(f"https://epaper.zeit.de/abo/diezeit/{latest_release}")
  73. document = lxml.html.fromstring(response.text)
  74. for fmt in formats:
  75. link_elements = document.xpath(DOWNLOAD_XPATH.format(format_btns[fmt]))
  76. if len(link_elements) < 1:
  77. print(f"Skipping {fmt} download, scraping broken")
  78. link = link_elements[0].attrib['href']
  79. # Get filename from Content-Disposition header
  80. date = "-".join(latest_release.split(".")[::-1])
  81. filename = 'die_zeit_' + date + "." + fmt
  82. request_headers = {}
  83. if os.path.exists(filename) and not forcereload:
  84. # Somehow E-Tags do not work for PDF
  85. if fmt == 'pdf':
  86. print(f"File {filename} already exits. If you want to download anyway, use --reload")
  87. continue
  88. else:
  89. request_headers["If-None-Match"] = '"' + md5sum(filename) + '"'
  90. url = "https://epaper.zeit.de" + link \
  91. if not link.startswith('https') else link
  92. print(f"Downloading {fmt} from {url}...")
  93. response = s.get(url, headers=request_headers)
  94. if response.status_code == 304:
  95. print(" => Skipped, file did not change")
  96. continue
  97. if response.status_code != 200:
  98. print(f"Request for {url} returned status {response.status_code}", file=sys.stderr)
  99. sys.exit(-1)
  100. with open(filename, 'wb') as file:
  101. file.write(response.content)
  102. print(f"Downloaded {fmt} to {filename}")