asedeno.scripts.mit.edu Git - youtube-dl.git/blob - youtube_dl/extractor/stitcher.py
[twitch:clips] Add access token query to download URLs (closes #29136)
[youtube-dl.git] / youtube_dl / extractor / stitcher.py
1 from __future__ import unicode_literals
2
3 from .common import InfoExtractor
4 from ..compat import compat_str
5 from ..utils import (
6     clean_html,
7     clean_podcast_url,
8     ExtractorError,
9     int_or_none,
10     str_or_none,
11     try_get,
12     url_or_none,
13 )
14
15
class StitcherBaseIE(InfoExtractor):
    """Common URL prefix plus API and metadata helpers for Stitcher extractors."""

    # Shared prefix of the page URLs; subclasses append their own suffix.
    _VALID_URL_BASE = r'https?://(?:www\.)?stitcher\.com/(?:podcast|show)/'

    def _call_api(self, path, video_id, query):
        """Request ``path`` from the Stitcher API and return its 'data' member.

        The JSON is fetched through ``self._download_json`` with ``query`` as
        the query parameters.  If the response carries a message under
        ``errors[0].message``, an ExtractorError with that message is raised
        (``expected=True``) instead of returning.
        """
        api_url = 'https://api.prod.stitcher.com/' + path
        response = self._download_json(api_url, video_id, query=query)
        error_message = try_get(response, lambda x: x['errors'][0]['message'])
        if not error_message:
            return response['data']
        raise ExtractorError(error_message, expected=True)

    def _extract_description(self, data):
        """Return the cleaned description, preferring 'html_description'."""
        raw_description = data.get('html_description') or data.get('description')
        return clean_html(raw_description)

    def _extract_audio_url(self, episode):
        """Return the episode's 'audio_url' (or 'guid' as fallback) via url_or_none."""
        candidate = episode.get('audio_url') or episode.get('guid')
        return url_or_none(candidate)

    def _extract_show_info(self, show):
        """Map the show's image and title onto 'thumbnail' and 'series'."""
        show_info = {}
        show_info['thumbnail'] = show.get('image_base_url')
        show_info['series'] = show.get('title')
        return show_info

    def _extract_episode(self, episode, audio_url, show_info):
        """Build the info dict for one episode.

        'id' and 'title' are read with direct indexing, so a missing key
        raises; every other episode field is looked up with ``get``.
        ``show_info`` is merged last and therefore overrides any key it
        shares with the episode fields.
        """
        episode_id = compat_str(episode['id'])
        slug = episode.get('slug')
        title = episode['title'].strip()
        description = self._extract_description(episode)
        duration = int_or_none(episode.get('duration'))
        media_url = clean_podcast_url(audio_url)
        timestamp = int_or_none(episode.get('date_published'))
        season_number = int_or_none(episode.get('season'))
        season_id = str_or_none(episode.get('season_id'))

        result = {
            'id': episode_id,
            'display_id': slug,
            'title': title,
            'description': description,
            'duration': duration,
            'url': media_url,
            # Audio-only content: no video codec.
            'vcodec': 'none',
            'timestamp': timestamp,
            'season_number': season_number,
            'season_id': season_id,
        }
        result.update(show_info)
        return result
55
56
class StitcherIE(StitcherBaseIE):
    """Extractor for a single Stitcher episode page."""

    # The numeric episode id is the trailing number of the last path
    # component, optionally preceded by a slug and a dash.
    _VALID_URL = StitcherBaseIE._VALID_URL_BASE + r'(?:[^/]+/)+e(?:pisode)?/(?:[^/#?&]+-)?(?P<id>\d+)'
    _TESTS = [{
        'url': 'http://www.stitcher.com/podcast/the-talking-machines/e/40789481?autoplay=true',
        'md5': 'e9635098e0da10b21a0e2b85585530f6',
        'info_dict': {
            'id': '40789481',
            'ext': 'mp3',
            'title': 'Machine Learning Mastery and Cancer Clusters',
            'description': 'md5:547adb4081864be114ae3831b4c2b42f',
            'duration': 1604,
            'thumbnail': r're:^https?://.*\.jpg',
            'upload_date': '20151008',
            'timestamp': 1444285800,
            'series': 'Talking Machines',
        },
    }, {
        'url': 'http://www.stitcher.com/podcast/panoply/vulture-tv/e/the-rare-hourlong-comedy-plus-40846275?autoplay=true',
        'info_dict': {
            'id': '40846275',
            'display_id': 'the-rare-hourlong-comedy-plus',
            'ext': 'mp3',
            'title': "The CW's 'Crazy Ex-Girlfriend'",
            'description': 'md5:04f1e2f98eb3f5cbb094cea0f9e19b17',
            'duration': 2235,
            'thumbnail': r're:^https?://.*\.jpg',
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'Page Not Found',
    }, {
        # escaped title
        'url': 'http://www.stitcher.com/podcast/marketplace-on-stitcher/e/40910226?autoplay=true',
        'only_matching': True,
    }, {
        'url': 'http://www.stitcher.com/podcast/panoply/getting-in/e/episode-2a-how-many-extracurriculars-should-i-have-40876278?autoplay=true',
        'only_matching': True,
    }, {
        'url': 'https://www.stitcher.com/show/threedom/episode/circles-on-a-stick-200212584',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return the info dict for the episode addressed by ``url``.

        The episode is looked up through the 'shows/episodes' API endpoint
        using the id matched from the URL.  When no usable audio URL is
        found, ``self.raise_login_required()`` is called.
        """
        episode_id = self._match_id(url)
        query = {'episode_ids': episode_id}
        api_data = self._call_api('shows/episodes', episode_id, query)

        # The first returned episode is mandatory: a missing or empty
        # 'episodes' list raises here.
        episode = api_data['episodes'][0]
        audio_url = self._extract_audio_url(episode)
        if not audio_url:
            self.raise_login_required()

        # Show metadata is optional; fall back to an empty dict.
        show = try_get(api_data, lambda x: x['shows'][0], dict) or {}
        show_info = self._extract_show_info(show)
        return self._extract_episode(episode, audio_url, show_info)
111
112
class StitcherShowIE(StitcherBaseIE):
    """Extractor for a Stitcher show page, returned as a playlist of episodes."""

    # The show slug is the single path component after the shared prefix.
    _VALID_URL = StitcherBaseIE._VALID_URL_BASE + r'(?P<id>[^/#?&]+)/?(?:[?#&]|$)'
    _TESTS = [{
        'url': 'http://www.stitcher.com/podcast/the-talking-machines',
        'info_dict': {
            'id': 'the-talking-machines',
            'title': 'Talking Machines',
            'description': 'md5:831f0995e40f26c10231af39cf1ebf0b',
        },
        'playlist_mincount': 106,
    }, {
        'url': 'https://www.stitcher.com/show/the-talking-machines',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return a playlist result holding the show's episodes.

        Episodes are requested from the 'search/show/<slug>/allEpisodes'
        endpoint with a ``count`` of 10000.  Episodes without a usable audio
        URL are left out of the playlist.
        """
        slug = self._match_id(url)
        api_path = 'search/show/%s/allEpisodes' % slug
        api_data = self._call_api(api_path, slug, {'count': 10000})

        # Show metadata is optional; fall back to an empty dict.
        show = try_get(api_data, lambda x: x['shows'][0], dict) or {}
        show_info = self._extract_show_info(show)

        playlist_entries = []
        for episode in (api_data.get('episodes') or []):
            audio_url = self._extract_audio_url(episode)
            if audio_url:
                playlist_entries.append(
                    self._extract_episode(episode, audio_url, show_info))

        playlist_title = show.get('title')
        playlist_description = self._extract_description(show)
        return self.playlist_result(
            playlist_entries, slug, playlist_title, playlist_description)
141
142         return self.playlist_result(
143             entries, show_slug, show.get('title'),
144             self._extract_description(show))