zeitdownload.py 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158
  1. #!/usr/bin/env python3
  2. import requests
  3. import lxml.html
  4. import sys
  5. import re
  6. import os.path
  7. import hashlib
  8. from argparse import ArgumentParser
  9. RELEASE_XPATH = '//p[@class="epaper-info-release-date"]'
  10. DOWNLOAD_XPATH = "//a[contains(text(), '{}')]"
  11. DATE_REGEX = r"^\d{2}\.\d{2}\.\d{4}$"
  12. parser = ArgumentParser(description='Download "Die Zeit" in multiple formats from the premium subscription service')
  13. parser.add_argument('--email', type=str, required=True,
  14. help='Email you used for the digital subscription signup')
  15. parser.add_argument('--password', type=str, required=True,
  16. help='Corresponding password')
  17. parser.add_argument('--reload', default=False, action='store_true',
  18. help='Download file even though it already exists')
  19. parser.add_argument('--pdf', dest='formats',
  20. action='append_const', const='pdf',
  21. help='Download full-page PDF')
  22. parser.add_argument('--epub', dest='formats',
  23. action='append_const', const='epub',
  24. help='Download EPUB file for E-Readers')
  25. group = parser.add_mutually_exclusive_group()
  26. group.add_argument('--date', type=str,
  27. help='Download file from specified date (dd.mm.yyyy)')
  28. group.add_argument('--num-release', type=int, choices=range(0, 7),
  29. help='Download one of the past releases by numbers from the current one; \n \
  30. 0 is the current release, 1 the previous one, up until 7')
  31. args = parser.parse_args()
  32. email = args.email
  33. password = args.password
  34. forcereload = args.reload
  35. formats = args.formats
  36. release_date = args.date
  37. num_release = args.num_release
  38. if release_date:
  39. if not re.match(DATE_REGEX, release_date):
  40. print(f"{release_date} is not a valid date.")
  41. sys.exit(5)
  42. if formats == None:
  43. print("No formats specified, all done.")
  44. sys.exit(0)
  45. # Src: https://stackoverflow.com/questions/22058048/hashing-a-file-in-python#22058673
  46. def md5sum(path):
  47. BUF_SIZE = 4 * 1024 * 1024 # 4 MiB
  48. md5 = hashlib.md5()
  49. with open(path, 'rb') as f:
  50. while True:
  51. data = f.read(BUF_SIZE)
  52. if not data:
  53. break
  54. md5.update(data)
  55. return md5.hexdigest()
  56. def download_file(format, filename, req_session, doc):
  57. link_elements = document.xpath(DOWNLOAD_XPATH.format(format_btns[fmt]))
  58. if len(link_elements) < 1:
  59. return -1
  60. link = link_elements[0].attrib['href']
  61. request_headers = {}
  62. if os.path.exists(filename) and not forcereload:
  63. # Somehow E-Tags do not work for PDF
  64. if fmt == 'pdf':
  65. return -2
  66. else:
  67. request_headers["If-None-Match"] = '"' + md5sum(filename) + '"'
  68. url = "https://epaper.zeit.de" + link \
  69. if not link.startswith('https') else link
  70. response = s.get(url, headers=request_headers)
  71. if response.status_code == 304:
  72. return 304
  73. if response.status_code != 200:
  74. return response
  75. return response.content
  76. s = requests.Session()
  77. headers = {
  78. 'Origin': 'https://meine.zeit.de',
  79. }
  80. login_page = s.get('https://meine.zeit.de/anmelden?url=https%3A%2F%2Fwww.zeit.de%2Findex&entry_service=sonstige')
  81. response = s.post('https://meine.zeit.de/anmelden', {
  82. 'entry_service': 'sonstige',
  83. 'product_id': 'sonstige',
  84. 'return_url': 'https://www.zeit.de/index',
  85. 'email': email,
  86. 'pass': password,
  87. 'csrf_token': s.cookies['csrf_token']
  88. }, headers=headers)
  89. if not 'zeit_sso_201501' in s.cookies:
  90. print("Invalid login.")
  91. sys.exit(-1)
  92. format_btns = {
  93. 'pdf': 'GESAMT-PDF LADEN',
  94. 'epub': 'EPUB FÜR E-READER LADEN'
  95. }
  96. # Figure out which date to use if no date was supplied directly
  97. if not release_date:
  98. num = 0
  99. if num_release:
  100. num = num_release
  101. response = s.get('https://epaper.zeit.de/abo/diezeit')
  102. document = lxml.html.fromstring(response.text)
  103. latest_releases = list(map(lambda el: el.text,
  104. document.xpath(RELEASE_XPATH)))
  105. if not re.match(DATE_REGEX, latest_releases[num]):
  106. print(f"Scraping broken, {latest_releases[num]} not valid date.")
  107. release_date = latest_releases[num]
  108. # Get buttons for format downloads
  109. # This is done separated from the download_file function to
  110. # avoid an overhead through multiple downloads
  111. response = s.get(f"https://epaper.zeit.de/abo/diezeit/{release_date}")
  112. if (response.url == 'https://epaper.zeit.de/abo/diezeit'):
  113. print(f"No release published on {release_date}")
  114. sys.exit(6)
  115. document = lxml.html.fromstring(response.text)
  116. for fmt in formats:
  117. # Get filename from Content-Disposition header
  118. date = "-".join(release_date.split(".")[::-1])
  119. filename = 'die_zeit_' + release_date + "." + fmt
  120. print(f"Downloading {fmt}...")
  121. response = download_file(fmt, filename, s, document)
  122. if (response == -1):
  123. print(f"Skipping {fmt} download, scraping broken")
  124. continue
  125. elif (response == -2):
  126. print(f"File {filename} already exits. If you want to download anyway, use --reload")
  127. continue
  128. elif (response == 304):
  129. print(" => Skipped, file did not change")
  130. continue
  131. elif (isinstance(response, int)):
  132. print(f"Request returned status {response}", file=sys.stderr)
  133. continue
  134. # Everything is clear, function returns actual file
  135. with open(filename, 'wb') as file:
  136. file.write(response)
  137. print(f"Downloaded {fmt} to {filename}")