# zeitdownload.py (2.7 KB)
  1. #!/usr/bin/env python3
  2. import requests
  3. import lxml.html
  4. import cgi
  5. import sys
  6. from argparse import ArgumentParser
  7. parser = ArgumentParser(description='Download "Die Zeit" in multiple formats from the premium subscription service')
  8. parser.add_argument('--email', type=str, required=True,
  9. help='Email you used for the digital subscription signup')
  10. parser.add_argument('--password', type=str, required=True,
  11. help='Corresponding password')
  12. parser.add_argument('--pdf', dest='formats',
  13. action='append_const', const='pdf',
  14. help='Download full-page PDF')
  15. parser.add_argument('--epub', dest='formats',
  16. action='append_const', const='epub',
  17. help='Download EPUB file for E-Readers')
  18. parser.add_argument('--mobi', dest='formats',
  19. action='append_const', const='mobi',
  20. help='Download MOBI file for Kindles')
  21. args = parser.parse_args()
  22. email = args.email
  23. password = args.password
  24. formats = args.formats
  25. if formats == None:
  26. print("No formats specified, all done.")
  27. sys.exit(0)
  28. RELEASE_XPATH = '//p[@class="epaper-info-release-date"]'
  29. DOWNLOAD_XPATH = "//a[contains(text(), '{}')]"
  30. s = requests.Session()
  31. response = s.post('https://meine.zeit.de/anmelden', {
  32. 'entry_service': 'sonstige',
  33. 'product_id': 'sonstige',
  34. 'return_url': 'https://www.zeit.de/index',
  35. 'email': email,
  36. 'pass': password,
  37. 'permanent': 'on'
  38. })
  39. if not 'zeit_sso_201501' in s.cookies:
  40. print("Invalid login.")
  41. sys.exit(-1)
  42. format_btns = {
  43. 'pdf': 'GESAMT-PDF LADEN',
  44. 'epub': 'EPUB FÜR E-READER LADEN',
  45. 'mobi': 'MOBI FÜR KINDLE LADEN'
  46. }
  47. response = s.get('https://epaper.zeit.de/abo/diezeit')
  48. document = lxml.html.fromstring(response.text)
  49. release_dates = list(map(lambda el: el.text,
  50. document.xpath(RELEASE_XPATH)))
  51. latest_release = release_dates[0]
  52. response = s.get(f"https://epaper.zeit.de/abo/diezeit/{latest_release}")
  53. document = lxml.html.fromstring(response.text)
  54. for fmt in formats:
  55. link_elements = document.xpath(DOWNLOAD_XPATH.format(format_btns[fmt]))
  56. if len(link_elements) < 1:
  57. print(f"Skipping {fmt} download, scraping broken")
  58. link = link_elements[0].attrib['href']
  59. print(f"Downloading {fmt} from {link}...")
  60. response = s.get("https://epaper.zeit.de" + link
  61. if not link.startswith('https') else link)
  62. # Get filename from Content-Disposition header
  63. filename = ''
  64. if 'Content-Disposition' in response.headers.keys():
  65. value, params = cgi.parse_header(response.headers['Content-Disposition'])
  66. filename = params['filename']
  67. else:
  68. filename = link.split('/')[-1]
  69. with open(filename, 'wb') as file:
  70. file.write(response.content)
  71. print(f"Downloaded {fmt}.")