youtube_dl/extractor/applepodcasts.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 from .common import InfoExtractor
   5 from ..utils import (
   6     clean_html,
   7     clean_podcast_url,
   8     get_element_by_class,
   9     int_or_none,
  10     parse_codecs,
  11     parse_iso8601,
  12     try_get,
  13 )
  14
  15
  16 class ApplePodcastsIE(InfoExtractor):
  17     _VALID_URL = r'https?://podcasts\.apple\.com/(?:[^/]+/)?podcast(?:/[^/]+){1,2}.*?\bi=(?P<id>\d+)'
  18     _TESTS = [{
  19         'url': 'https://podcasts.apple.com/us/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777',
  20         'md5': '41dc31cd650143e530d9423b6b5a344f',
  21         'info_dict': {
  22             'id': '1000482637777',
  23             'ext': 'mp3',
  24             'title': '207 - Whitney Webb Returns',
  25             'description': 'md5:75ef4316031df7b41ced4e7b987f79c6',
  26             'upload_date': '20200705',
  27             'timestamp': 1593932400,
  28             'duration': 6454,
  29             'series': 'The Tim Dillon Show',
  30             'thumbnail': 're:.+[.](png|jpe?g|webp)',
  31         }
  32     }, {
  33         'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777',
  34         'only_matching': True,
  35     }, {
  36         'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns?i=1000482637777',
  37         'only_matching': True,
  38     }, {
  39         'url': 'https://podcasts.apple.com/podcast/id1135137367?i=1000482637777',
  40         'only_matching': True,
  41     }]
  42
  43     def _real_extract(self, url):
  44         episode_id = self._match_id(url)
  45         webpage = self._download_webpage(url, episode_id)
  46         episode_data = {}
  47         ember_data = {}
  48         # new page type 2021-11
  49         amp_data = self._parse_json(self._search_regex(
  50             r'(?s)id="shoebox-media-api-cache-amp-podcasts"[^>]*>\s*({.+?})\s*<',
  51             webpage, 'AMP data', default='{}'), episode_id, fatal=False) or {}
  52         amp_data = try_get(amp_data,
  53                            lambda a: self._parse_json(
  54                                next(a[x] for x in iter(a) if episode_id in x),
  55                                episode_id),
  56                            dict) or {}
  57         amp_data = amp_data.get('d') or []
  58         episode_data = try_get(
  59             amp_data,
  60             lambda a: next(x for x in a
  61                            if x['type'] == 'podcast-episodes' and x['id'] == episode_id),
  62             dict)
  63         if not episode_data:
  64             # try pre 2021-11 page type: TODO: consider deleting if no longer used
  65             ember_data = self._parse_json(self._search_regex(
  66                 r'(?s)id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<',
  67                 webpage, 'ember data'), episode_id) or {}
  68             ember_data = ember_data.get(episode_id) or ember_data
  69             episode_data = try_get(ember_data, lambda x: x['data'], dict)
  70         episode = episode_data['attributes']
  71         description = episode.get('description') or {}
  72
  73         series = None
  74         for inc in (amp_data or ember_data.get('included') or []):
  75             if inc.get('type') == 'media/podcast':
  76                 series = try_get(inc, lambda x: x['attributes']['name'])
  77         series = series or clean_html(get_element_by_class('podcast-header__identity', webpage))
  78
  79         info = [{
  80             'id': episode_id,
  81             'title': episode['name'],
  82             'url': clean_podcast_url(episode['assetUrl']),
  83             'description': description.get('standard') or description.get('short'),
  84             'timestamp': parse_iso8601(episode.get('releaseDateTime')),
  85             'duration': int_or_none(episode.get('durationInMilliseconds'), 1000),
  86             'series': series,
  87             'thumbnail': self._og_search_thumbnail(webpage),
  88         }]
  89         self._sort_formats(info)
  90         info = info[0]
  91         codecs = parse_codecs(info.get('ext', 'mp3'))
  92         info.update(codecs)
  93         return info