asedeno.scripts.mit.edu Git - youtube-dl.git/blob - youtube_dl/extractor/arte.py
[NHK] Use new API URL
[youtube-dl.git] / youtube_dl / extractor / arte.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import re
5
6 from .common import InfoExtractor
7 from ..compat import (
8     compat_str,
9     compat_urlparse,
10 )
11 from ..utils import (
12     ExtractorError,
13     int_or_none,
14     qualities,
15     strip_or_none,
16     try_get,
17     unified_strdate,
18     url_or_none,
19 )
20
21
class ArteTVBaseIE(InfoExtractor):
    """Shared constants for the ARTE extractors defined in this module."""

    # Root of the player API; subclasses append '/config/...' or
    # '/collectionData/...' to it.
    _API_BASE = 'https://api.arte.tv/api/player/v1'

    # Regex alternation of the language codes accepted in ARTE URLs;
    # interpolated into the subclasses' _VALID_URL patterns.
    _ARTE_LANGUAGES = 'fr|de|en|es|it|pl'
25
26
class ArteTVIE(ArteTVBaseIE):
    """Extractor for a single ARTE programme.

    Accepts both the public ``arte.tv/<lang>/videos/<id>`` pages and the
    ``api.arte.tv/api/player/vN/config/<lang>/<id>`` player configuration
    URLs, and builds the format list from the player API's ``VSR`` mapping.
    """

    _VALID_URL = r'''(?x)
                    https?://
                        (?:
                            (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
                            api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
                        )
                        /(?P<id>\d{6}-\d{3}-[AF])
                    ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
        'info_dict': {
            'id': '088501-000-A',
            'ext': 'mp4',
            'title': 'Mexico: Stealing Petrol to Survive',
            'upload_date': '20190628',
        },
    }, {
        'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
        'only_matching': True,
    }, {
        'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }]

    # Maps the URL language code to the language token used inside the
    # API's version codes.
    _VERSION_LANGS = {
        'fr': 'F',
        'de': 'A',
        'en': 'E[ANG]',
        'es': 'E[ESP]',
        'it': 'E[ITA]',
        'pl': 'E[POL]',
    }

    # Version code templates, from most to least preferred; '{0}' is replaced
    # with the regex-escaped language token of the requested language.
    # Reference: section 6.8 of
    # https://www.arte.tv/sites/en/corporate/files/complete-technical-guidelines-arte-geie-v1-07-1.pdf
    _VERSION_CODE_TEMPLATES = (
        # original version in requested language, without subtitles
        r'VO{0}$',
        # original version in requested language, with partial subtitles in requested language
        r'VO{0}-ST{0}$',
        # original version in requested language, with subtitles for the deaf and hard-of-hearing in requested language
        r'VO{0}-STM{0}$',
        # non-original (dubbed) version in requested language, without subtitles
        r'V{0}$',
        # non-original (dubbed) version in requested language, with partial subtitles in requested language
        r'V{0}-ST{0}$',
        # non-original (dubbed) version in requested language, with subtitles for the deaf and hard-of-hearing in requested language
        r'V{0}-STM{0}$',
        # original version in requested language, with partial subtitles in different language
        r'VO{0}-ST(?!{0}).+?$',
        # original version in requested language, with subtitles for the deaf and hard-of-hearing in different language
        r'VO{0}-STM(?!{0}).+?$',
        # original version in different language, with partial subtitles in requested language
        r'VO(?:(?!{0}).+?)?-ST{0}$',
        # original version in different language, with subtitles for the deaf and hard-of-hearing in requested language
        r'VO(?:(?!{0}).+?)?-STM{0}$',
        # original version in different language, without subtitles
        r'VO(?:(?!{0}))?$',
        # original version in different language, with partial subtitles in different language
        r'VO(?:(?!{0}).+?)?-ST(?!{0}).+?$',
        # original version in different language, with subtitles for the deaf and hard-of-hearing in different language
        r'VO(?:(?!{0}).+?)?-STM(?!{0}).+?$',
    )

    @classmethod
    def _language_patterns(cls, lang):
        """Return the compiled preference patterns for the URL language *lang*."""
        token = re.escape(cls._VERSION_LANGS.get(lang, lang))
        return [
            re.compile(template.format(token))
            for template in cls._VERSION_CODE_TEMPLATES]

    @staticmethod
    def _language_preference(patterns, version_code):
        """Rank *version_code* against *patterns*.

        Returns ``len(patterns)`` for the most preferred pattern, decreasing
        by one per position, and -1 when nothing matches or the version code
        is missing / not a string.
        """
        if not isinstance(version_code, compat_str):
            return -1
        for rank, pattern in enumerate(patterns):
            if pattern.match(version_code):
                return len(patterns) - rank
        return -1

    def _real_extract(self, url):
        """Download the player config for *url* and return its info dict.

        Raises ExtractorError (expected) when the API response carries no
        usable ``VSR`` mapping, using the API's own error message if any.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        lang = mobj.group('lang') or mobj.group('lang_2')

        info = self._download_json(
            '%s/config/%s/%s' % (self._API_BASE, lang, video_id), video_id)
        player_info = info['videoJsonPlayer']

        vsr = try_get(player_info, lambda x: x['VSR'], dict)
        if not vsr:
            error = None
            if try_get(player_info, lambda x: x['custom_msg']['type']) == 'error':
                error = try_get(
                    player_info, lambda x: x['custom_msg']['msg'], compat_str)
            if not error:
                # Parenthesised on purpose: '%' binds tighter than 'or', so
                # without the parentheses the video_id fallback never applies.
                error = 'Video %s is not available' % (player_info.get('VID') or video_id)
            raise ExtractorError(error, expected=True)

        upload_date_str = player_info.get('shootingDate')
        if not upload_date_str:
            # Only the part before the first space is passed to unified_strdate
            upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0]

        title = (player_info.get('VTI') or player_info.get('VID') or video_id).strip()
        # 'VSU' may be present but null, hence 'or' instead of a .get default
        subtitle = (player_info.get('VSU') or '').strip()
        if subtitle:
            title += ' - %s' % subtitle

        qfunc = qualities(['MQ', 'HQ', 'EQ', 'SQ'])
        lang_patterns = self._language_patterns(lang)

        formats = []
        for format_id, f in vsr.items():
            if not isinstance(f, dict):
                continue
            raw_url = f.get('url')
            format_url = url_or_none(raw_url)
            streamer = f.get('streamer')
            if not format_url and not streamer:
                continue

            lang_pref = self._language_preference(
                lang_patterns, f.get('versionCode'))

            media_type = f.get('mediaType')
            if media_type == 'hls':
                if not format_url:
                    continue
                m3u8_formats = self._extract_m3u8_formats(
                    format_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id=format_id, fatal=False)
                for m3u8_format in m3u8_formats:
                    m3u8_format['language_preference'] = lang_pref
                formats.extend(m3u8_formats)
                continue

            fmt = {
                'format_id': format_id,
                'preference': -10 if f.get('videoFormat') == 'M3U8' else None,
                'language_preference': lang_pref,
                'format_note': '%s, %s' % (f.get('versionCode'), f.get('versionLibelle')),
                'width': int_or_none(f.get('width')),
                'height': int_or_none(f.get('height')),
                'tbr': int_or_none(f.get('bitrate')),
                'quality': qfunc(f.get('quality')),
            }

            if media_type == 'rtmp':
                # For RTMP the 'url' field is the play path and 'streamer'
                # is the connection URL; both are required.
                if not streamer or not raw_url:
                    continue
                fmt['url'] = streamer
                fmt['play_path'] = 'mp4:' + raw_url
                fmt['ext'] = 'flv'
            else:
                if not raw_url:
                    continue
                fmt['url'] = raw_url

            formats.append(fmt)

        self._sort_formats(formats)

        return {
            'id': player_info.get('VID') or video_id,
            'title': title,
            'description': player_info.get('VDE'),
            'upload_date': unified_strdate(upload_date_str),
            'thumbnail': player_info.get('programImage') or try_get(
                player_info, lambda x: x['VTU']['IUR']),
            'formats': formats,
        }
182
183
class ArteTVEmbedIE(InfoExtractor):
    """Extractor for embedded ARTE players.

    The player page URL carries the real player configuration URL in its
    ``json_url`` query parameter; extraction is delegated to ArteTVIE.
    """

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
    _TESTS = [{
        'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
        'info_dict': {
            'id': '100605-013-A',
            'ext': 'mp4',
            'title': 'United we Stream November Lockdown Edition #13',
            'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
            'upload_date': '20201116',
        },
    }, {
        'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_urls(webpage):
        """Return the player URLs found in iframe/script ``src`` attributes of *webpage*."""
        embed_re = r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1'
        found = []
        for embed in re.finditer(embed_re, webpage):
            found.append(embed.group('url'))
        return found

    def _real_extract(self, url):
        """Resolve the ``json_url`` parameter and hand it over to ArteTVIE."""
        query = compat_urlparse.urlparse(url).query
        params = compat_urlparse.parse_qs(query)
        config_url = params['json_url'][0]
        return self.url_result(
            config_url,
            ie=ArteTVIE.ie_key(),
            video_id=ArteTVIE._match_id(config_url))
212
213
class ArteTVPlaylistIE(ArteTVBaseIE):
    """Extractor for ARTE collections (``RC-NNNNNN`` ids), returned as playlists."""

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
        'info_dict': {
            'id': 'RC-016954',
            'title': 'Earn a Living',
            'description': 'md5:d322c55011514b3a7241f7fb80d494c2',
        },
        'playlist_mincount': 6,
    }, {
        'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Fetch the collection data and build one url_transparent entry per video."""
        mobj = re.match(self._VALID_URL, url)
        lang, playlist_id = mobj.groups()

        api_url = '%s/collectionData/%s/%s?source=videos' % (
            self._API_BASE, lang, playlist_id)
        collection = self._download_json(api_url, playlist_id)

        entries = []
        for item in collection['videos']:
            # Skip anything that is not a JSON object
            if not isinstance(item, dict):
                continue
            # Prefer the page URL and fall back to the player config URL
            target = url_or_none(item.get('url')) or url_or_none(item.get('jsonUrl'))
            if not target:
                continue
            image_url = try_get(item, lambda x: x['mainImage']['url'], compat_str)
            entry = {
                '_type': 'url_transparent',
                'ie_key': ArteTVIE.ie_key(),
                'url': target,
                'id': item.get('programId'),
                'title': item.get('title'),
                'alt_title': item.get('subtitle'),
                'thumbnail': url_or_none(image_url),
                'duration': int_or_none(item.get('durationSeconds')),
                'view_count': int_or_none(item.get('views')),
            }
            entries.append(entry)

        return self.playlist_result(
            entries,
            playlist_id,
            collection.get('title'),
            collection.get('shortDescription') or collection.get('teaserText'))
256
257
class ArteTVCategoryIE(ArteTVBaseIE):
    """Extractor for ARTE category pages.

    Collects the links on the page that ArteTVIE or ArteTVPlaylistIE can
    handle and returns them as a playlist.
    """

    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE._ARTE_LANGUAGES
    _TESTS = [{
        'url': 'https://www.arte.tv/en/videos/politics-and-society/',
        'info_dict': {
            'id': 'politics-and-society',
            'title': 'Politics and society',
            'description': 'Investigative documentary series, geopolitical analysis, and international commentary',
        },
        'playlist_mincount': 13,
    },
    ]

    @classmethod
    def suitable(cls, url):
        """Match only URLs that the video and playlist extractors do not claim."""
        return (
            not any(ie.suitable(url) for ie in (ArteTVIE, ArteTVPlaylistIE, ))
            and super(ArteTVCategoryIE, cls).suitable(url))

    def _real_extract(self, url):
        """Scrape the category page and return a playlist of its video links.

        Returns None when the page contains no link handled by ArteTVIE or
        ArteTVPlaylistIE.
        """
        lang, playlist_id = re.match(self._VALID_URL, url).groups()
        webpage = self._download_webpage(url, playlist_id)

        items = []
        for mobj in re.finditer(
                r'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|\b)(?P<url>https?://www\.arte\.tv/%s/videos/[\w/-]+)(?P=q)' % lang,
                webpage):
            video_url = mobj.group('url')
            # Do not list the category page itself
            if video_url == url:
                continue
            if any(ie.suitable(video_url) for ie in (ArteTVIE, ArteTVPlaylistIE, )):
                items.append(video_url)

        if not items:
            return None

        title = (self._og_search_title(webpage, default=None)
                 or self._html_search_regex(
                     r'<title\b[^>]*>([^<]+)</title>', webpage, 'title', default=None))
        # Keep the part before the last '|'; both lookups above may yield
        # None, in which case the generic title is used.
        title = strip_or_none((title or '').rsplit('|', 1)[0]) or self._generic_title(url)

        result = self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title)
        if result:
            description = self._og_search_description(webpage, default=None)
            if description:
                result['description'] = description
        return result