]> asedeno.scripts.mit.edu Git - youtube-dl.git/blob - youtube_dl/extractor/applepodcasts.py
[NHK] Use new API URL
[youtube-dl.git] / youtube_dl / extractor / applepodcasts.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 from .common import InfoExtractor
5 from ..utils import (
6     clean_html,
7     clean_podcast_url,
8     get_element_by_class,
9     int_or_none,
10     parse_codecs,
11     parse_iso8601,
12     try_get,
13 )
14
15
class ApplePodcastsIE(InfoExtractor):
    """Extract a single podcast episode from a podcasts.apple.com page.

    The episode is identified by the ``i=<digits>`` query parameter of the
    URL.  Episode metadata is read from JSON embedded in the web page; two
    page layouts are supported (see ``_real_extract``).
    """

    _VALID_URL = r'https?://podcasts\.apple\.com/(?:[^/]+/)?podcast(?:/[^/]+){1,2}.*?\bi=(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://podcasts.apple.com/us/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777',
        'md5': '41dc31cd650143e530d9423b6b5a344f',
        'info_dict': {
            'id': '1000482637777',
            'ext': 'mp3',
            'title': '207 - Whitney Webb Returns',
            'description': 'md5:75ef4316031df7b41ced4e7b987f79c6',
            'upload_date': '20200705',
            'timestamp': 1593932400,
            'duration': 6454,
            'series': 'The Tim Dillon Show',
            'thumbnail': 're:.+[.](png|jpe?g|webp)',
        }
    }, {
        'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777',
        'only_matching': True,
    }, {
        'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns?i=1000482637777',
        'only_matching': True,
    }, {
        'url': 'https://podcasts.apple.com/podcast/id1135137367?i=1000482637777',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the info dict for the episode addressed by ``url``.

        Lookup order:
          1. the ``shoebox-media-api-cache-amp-podcasts`` JSON cache
             (page type seen since 2021-11);
          2. the ``shoebox-ember-data-store`` JSON (older page type), which
             is mandatory once step 1 yields nothing.

        The ``name`` and ``assetUrl`` attributes of the episode are
        required; all other fields are optional.
        """
        episode_id = self._match_id(url)
        webpage = self._download_webpage(url, episode_id)
        ember_data = {}

        # new page type 2021-11
        amp_data = self._parse_json(self._search_regex(
            r'(?s)id="shoebox-media-api-cache-amp-podcasts"[^>]*>\s*({.+?})\s*<',
            webpage, 'AMP data', default='{}'), episode_id, fatal=False) or {}

        def parse_cache_entry(cache):
            # Each cache value is itself a JSON document; use the one whose
            # key mentions this episode.  next() gets an explicit default so
            # that "no such key" yields None instead of a StopIteration,
            # which would otherwise abort extraction before the older page
            # type below gets a chance.
            entry = next((cache[k] for k in cache if episode_id in k), None)
            if entry is None:
                return None
            return self._parse_json(entry, episode_id)

        amp_data = try_get(amp_data, parse_cache_entry, dict) or {}
        amp_data = amp_data.get('d') or []
        episode_data = try_get(
            amp_data,
            # Same reasoning as above: default to None when no item matches.
            lambda a: next(
                (x for x in a
                 if x['type'] == 'podcast-episodes' and x['id'] == episode_id),
                None),
            dict)
        if not episode_data:
            # try pre 2021-11 page type: TODO: consider deleting if no longer used
            ember_data = self._parse_json(self._search_regex(
                r'(?s)id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<',
                webpage, 'ember data'), episode_id) or {}
            # The store may be keyed by episode id or be the record itself.
            ember_data = ember_data.get(episode_id) or ember_data
            episode_data = try_get(ember_data, lambda x: x['data'], dict)
        episode = episode_data['attributes']
        description = episode.get('description') or {}

        # Prefer the series name from the embedded data; fall back to the
        # page header element when no 'media/podcast' item is present.
        series = None
        for inc in (amp_data or ember_data.get('included') or []):
            if inc.get('type') == 'media/podcast':
                series = try_get(inc, lambda x: x['attributes']['name'])
        series = series or clean_html(get_element_by_class('podcast-header__identity', webpage))

        # The info dict is wrapped in a one-element list so it can be passed
        # through _sort_formats -- presumably so that 'ext' gets filled in
        # from the URL; confirm against InfoExtractor._sort_formats.
        info = [{
            'id': episode_id,
            'title': episode['name'],
            'url': clean_podcast_url(episode['assetUrl']),
            'description': description.get('standard') or description.get('short'),
            'timestamp': parse_iso8601(episode.get('releaseDateTime')),
            # value is in milliseconds; scale it down by 1000
            'duration': int_or_none(episode.get('durationInMilliseconds'), 1000),
            'series': series,
            'thumbnail': self._og_search_thumbnail(webpage),
        }]
        self._sort_formats(info)
        info = info[0]
        # Derive codec fields from the extension, assuming mp3 when unknown.
        codecs = parse_codecs(info.get('ext', 'mp3'))
        info.update(codecs)
        return info