2 from __future__ import unicode_literals
18 from ..compat import (
19 compat_cookiejar_Cookie,
20 compat_cookies_SimpleCookie,
22 compat_etree_fromstring,
29 compat_urllib_parse_unquote,
30 compat_urllib_parse_urlencode,
31 compat_urllib_request,
33 compat_xml_parse_error,
35 from ..downloader.f4m import (
37 remove_encrypted_media,
65 parse_m3u8_attributes,
87 class InfoExtractor(object):
88 """Information Extractor class.
90 Information extractors are the classes that, given a URL, extract
91 information about the video (or videos) the URL refers to. This
92 information includes the real video URL, the video title, author and
93 others. The information is stored in a dictionary which is then
94 passed to the YoutubeDL. The YoutubeDL processes this
95 information possibly downloading the video to the file system, among
96 other possible outcomes.
98 The type field determines the type of the result.
99 By far the most common value (and the default if _type is missing) is
100 "video", which indicates a single video.
102 For a video, the dictionaries must include the following fields:
104 id: Video identifier.
105 title: Video title, unescaped.
107 Additionally, it must contain either a formats entry or a url one:
109 formats: A list of dictionaries for each format available, ordered
110 from worst to best quality.
113 * url The mandatory URL representing the media:
114 for plain file media - HTTP URL of this file,
116 for HLS - URL of the M3U8 media playlist,
117 for HDS - URL of the F4M manifest,
119 - HTTP URL to plain file media (in case of
121 - URL of the MPD manifest or base URL
122 representing the media if MPD manifest
123 is parsed from a string (in case of
125 for MSS - URL of the ISM manifest.
127 The URL of the manifest file in case of
129 for HLS - URL of the M3U8 master playlist,
130 for HDS - URL of the F4M manifest,
131 for DASH - URL of the MPD manifest,
132 for MSS - URL of the ISM manifest.
133 * ext Will be calculated from URL if missing
134 * format A human-readable description of the format
135 ("mp4 container with h264/opus").
Calculated from the format_id, width, height,
and format_note fields if missing.
138 * format_id A short description of the format
139 ("mp4_h264_opus" or "19").
140 Technically optional, but strongly recommended.
141 * format_note Additional info about the format
142 ("3D" or "DASH video")
143 * width Width of the video, if known
144 * height Height of the video, if known
145 * resolution Textual description of width and height
146 * tbr Average bitrate of audio and video in KBit/s
147 * abr Average audio bitrate in KBit/s
148 * acodec Name of the audio codec in use
149 * asr Audio sampling rate in Hertz
150 * vbr Average video bitrate in KBit/s
152 * vcodec Name of the video codec in use
153 * container Name of the container format
154 * filesize The number of bytes, if known in advance
155 * filesize_approx An estimate for the number of bytes
156 * player_url SWF Player URL (used for rtmpdump).
157 * protocol The protocol that will be used for the actual
158 download, lower-case.
159 "http", "https", "rtsp", "rtmp", "rtmpe",
160 "m3u8", "m3u8_native" or "http_dash_segments".
162 Base URL for fragments. Each fragment's path
163 value (if present) will be relative to
165 * fragments A list of fragments of a fragmented media.
166 Each fragment entry must contain either an url
167 or a path. If an url is present it should be
168 considered by a client. Otherwise both path and
169 fragment_base_url must be present. Here is
170 the list of all potential fields:
171 * "url" - fragment's URL
172 * "path" - fragment's path relative to
174 * "duration" (optional, int or float)
175 * "filesize" (optional, int)
176 * preference Order number of this format. If this field is
177 present and not None, the formats get sorted
178 by this field, regardless of all other values.
179 -1 for default (order by other properties),
180 -2 or smaller for less than default.
181 < -1000 to hide the format (if there is
182 another one which is strictly better)
183 * language Language code, e.g. "de" or "en-US".
184 * language_preference Is this in the language mentioned in
186 10 if it's what the URL is about,
187 -1 for default (don't know),
188 -10 otherwise, other values reserved for now.
189 * quality Order number of the video quality of this
190 format, irrespective of the file format.
191 -1 for default (order by other properties),
192 -2 or smaller for less than default.
193 * source_preference Order number for this video source
194 (quality takes higher priority)
195 -1 for default (order by other properties),
196 -2 or smaller for less than default.
197 * http_headers A dictionary of additional HTTP headers
198 to add to the request.
199 * stretched_ratio If given and not 1, indicates that the
200 video's pixels are not square.
201 width : height ratio as float.
202 * no_resume The server does not support resuming the
203 (HTTP or RTMP) download. Boolean.
204 * downloader_options A dictionary of downloader options as
205 described in FileDownloader
207 url: Final video URL.
208 ext: Video filename extension.
209 format: The video format, defaults to ext (used for --get-format)
210 player_url: SWF Player URL (used for rtmpdump).
212 The following fields are optional:
214 alt_title: A secondary title of the video.
215 display_id An alternative identifier for the video, not necessarily
216 unique, but available before title. Typically, id is
217 something like "4234987", title "Dancing naked mole rats",
218 and display_id "dancing-naked-mole-rats"
219 thumbnails: A list of dictionaries, with the following entries:
220 * "id" (optional, string) - Thumbnail format ID
222 * "preference" (optional, int) - quality of the image
223 * "width" (optional, int)
224 * "height" (optional, int)
225 * "resolution" (optional, string "{width}x{height}",
227 * "filesize" (optional, int)
228 thumbnail: Full URL to a video thumbnail image.
229 description: Full video description.
230 uploader: Full name of the video uploader.
231 license: License name the video is licensed under.
232 creator: The creator of the video.
233 release_timestamp: UNIX timestamp of the moment the video was released.
234 release_date: The date (YYYYMMDD) when the video was released.
235 timestamp: UNIX timestamp of the moment the video became available
237 upload_date: Video upload date (YYYYMMDD).
238 If not explicitly set, calculated from timestamp.
239 uploader_id: Nickname or id of the video uploader.
240 uploader_url: Full URL to a personal webpage of the video uploader.
241 channel: Full name of the channel the video is uploaded on.
242 Note that channel fields may or may not repeat uploader
243 fields. This depends on a particular extractor.
244 channel_id: Id of the channel.
245 channel_url: Full URL to a channel webpage.
246 location: Physical location where the video was filmed.
247 subtitles: The available subtitles as a dictionary in the format
248 {tag: subformats}. "tag" is usually a language code, and
249 "subformats" is a list sorted from lower to higher
250 preference, each element is a dictionary with the "ext"
252 * "data": The subtitles file contents
253 * "url": A URL pointing to the subtitles file
254 "ext" will be calculated from URL if missing
255 automatic_captions: Like 'subtitles', used by the YoutubeIE for
256 automatically generated captions
257 duration: Length of the video in seconds, as an integer or float.
258 view_count: How many users have watched the video on the platform.
259 like_count: Number of positive ratings of the video
260 dislike_count: Number of negative ratings of the video
261 repost_count: Number of reposts of the video
average_rating: Average rating given by users, the scale used depends on the webpage
263 comment_count: Number of comments on the video
264 comments: A list of comments, each with one or more of the following
265 properties (all but one of text or html optional):
266 * "author" - human-readable name of the comment author
267 * "author_id" - user ID of the comment author
269 * "html" - Comment as HTML
270 * "text" - Plain text of the comment
271 * "timestamp" - UNIX timestamp of comment
272 * "parent" - ID of the comment this one is replying to.
273 Set to "root" to indicate that this is a
274 comment to the original video.
275 age_limit: Age restriction for the video, as an integer (years)
276 webpage_url: The URL to the video webpage, if given to youtube-dl it
277 should allow to get the same result again. (It will be set
278 by YoutubeDL if it's missing)
279 categories: A list of categories that the video falls in, for example
281 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
282 is_live: True, False, or None (=unknown). Whether this video is a
283 live stream that goes on instead of a fixed-length video.
284 start_time: Time in seconds where the reproduction should start, as
285 specified in the URL.
286 end_time: Time in seconds where the reproduction should end, as
287 specified in the URL.
288 chapters: A list of dictionaries, with the following entries:
289 * "start_time" - The start time of the chapter in seconds
290 * "end_time" - The end time of the chapter in seconds
291 * "title" (optional, string)
293 The following fields should only be used when the video belongs to some logical
296 chapter: Name or title of the chapter the video belongs to.
297 chapter_number: Number of the chapter the video belongs to, as an integer.
298 chapter_id: Id of the chapter the video belongs to, as a unicode string.
300 The following fields should only be used when the video is an episode of some
301 series, programme or podcast:
303 series: Title of the series or programme the video episode belongs to.
304 season: Title of the season the video episode belongs to.
305 season_number: Number of the season the video episode belongs to, as an integer.
306 season_id: Id of the season the video episode belongs to, as a unicode string.
307 episode: Title of the video episode. Unlike mandatory video title field,
308 this field should denote the exact title of the video episode
309 without any kind of decoration.
310 episode_number: Number of the video episode within a season, as an integer.
311 episode_id: Id of the video episode, as a unicode string.
313 The following fields should only be used when the media is a track or a part of
316 track: Title of the track.
317 track_number: Number of the track within an album or a disc, as an integer.
318 track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
320 artist: Artist(s) of the track.
321 genre: Genre(s) of the track.
322 album: Title of the album the track belongs to.
323 album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
324 album_artist: List of all artists appeared on the album (e.g.
325 "Ash Borer / Fell Voices" or "Various Artists", useful for splits
327 disc_number: Number of the disc or other physical medium the track belongs to,
329 release_year: Year (YYYY) when the album was released.
331 Unless mentioned otherwise, the fields should be Unicode strings.
333 Unless mentioned otherwise, None is equivalent to absence of information.
336 _type "playlist" indicates multiple videos.
337 There must be a key "entries", which is a list, an iterable, or a PagedList
338 object, each element of which is a valid dictionary by this specification.
340 Additionally, playlists can have "id", "title", "description", "uploader",
341 "uploader_id", "uploader_url", "duration" attributes with the same semantics
342 as videos (see above).
345 _type "multi_video" indicates that there are multiple videos that
form a single show, for example, multiple acts of an opera or TV episode.
347 It must have an entries key like a playlist and contain all the keys
348 required for a video at the same time.
351 _type "url" indicates that the video must be extracted from another
352 location, possibly by a different extractor. Its only required key is:
353 "url" - the next URL to extract.
354 The key "ie_key" can be set to the class name (minus the trailing "IE",
355 e.g. "Youtube") if the extractor class is known in advance.
356 Additionally, the dictionary may have any properties of the resolved entity
357 known in advance, for example "title" if the title of the referred video is
361 _type "url_transparent" entities have the same specification as "url", but
362 indicate that the given additional information is more precise than the one
363 associated with the resolved URL.
364 This is useful when a site employs a video service that hosts the video and
365 its technical metadata, but that video service does not embed a useful
366 title, description etc.
369 Subclasses of this one should re-define the _real_initialize() and
370 _real_extract() methods and define a _VALID_URL regexp.
371 Probably, they should also be added to the list of extractors.
373 _GEO_BYPASS attribute may be set to False in order to disable
374 geo restriction bypass mechanisms for a particular extractor.
375 Though it won't disable explicit geo restriction bypass based on
376 country code provided with geo_bypass_country.
378 _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
379 countries for this extractor. One of these countries will be used by
380 geo restriction bypass mechanism right away in order to bypass
381 geo restriction, of course, if the mechanism is not disabled.
383 _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
384 IP blocks in CIDR notation for this extractor. One of these IP blocks
385 will be used by geo restriction bypass mechanism similarly
388 Finally, the _WORKING attribute should be set to False for broken IEs
389 in order to warn the users and skip the tests.
    # Fake X-Forwarded-For IP used for geo bypass; None until one is picked
    # by _initialize_geo_bypass() / __maybe_fake_ip_and_retry().
    _x_forwarded_for_ip = None
    # Presumably geo-unrestricted countries for this extractor (list of
    # country codes, see class docstring); None means not declared.
    _GEO_COUNTRIES = None
    # Presumably geo-unrestricted IP blocks in CIDR notation; None means
    # not declared.
    _GEO_IP_BLOCKS = None
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # No fake X-Forwarded-For IP until geo bypass is initialized.
        self._x_forwarded_for_ip = None
        self.set_downloader(downloader)
407 def suitable(cls, url):
408 """Receives a URL and returns True if suitable for this IE."""
410 # This does not use has/getattr intentionally - we want to know whether
411 # we have cached the regexp for *this* class, whereas getattr would also
412 # match the superclass
413 if '_VALID_URL_RE' not in cls.__dict__:
414 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
415 return cls._VALID_URL_RE.match(url) is not None
418 def _match_id(cls, url):
419 if '_VALID_URL_RE' not in cls.__dict__:
420 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
421 m = cls._VALID_URL_RE.match(url)
423 return compat_str(m.group('id'))
427 """Getter method for _WORKING."""
430 def initialize(self):
431 """Initializes an instance (authentication, etc)."""
432 self._initialize_geo_bypass({
433 'countries': self._GEO_COUNTRIES,
434 'ip_blocks': self._GEO_IP_BLOCKS,
437 self._real_initialize()
    def _initialize_geo_bypass(self, geo_bypass_context):
        """
        Initialize geo restriction bypass mechanism.

        This method is used to initialize geo bypass mechanism based on faking
        X-Forwarded-For HTTP header. A random country from provided country list
        is selected and a random IP belonging to this country is generated. This
        IP will be passed as X-Forwarded-For HTTP header in all subsequent
        HTTP requests.

        This method will be used for initial geo bypass mechanism initialization
        during the instance initialization with _GEO_COUNTRIES and
        _GEO_IP_BLOCKS.

        You may also manually call it from extractor's code if geo bypass
        information is not available beforehand (e.g. obtained during
        extraction) or due to some other reason. In this case you should pass
        this information in geo bypass context passed as first argument. It may
        contain following fields:

        countries:  List of geo unrestricted countries (similar
                    to _GEO_COUNTRIES)
        ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
                    (similar to _GEO_IP_BLOCKS)
        """
        # NOTE(review): this dump appears to be missing several lines here
        # (early returns, 'if not ip_block:'/'if ip_block:' style guards and
        # a closing brace) -- verify against the full source.
        if not self._x_forwarded_for_ip:

            # Geo bypass mechanism is explicitly disabled by user
            if not self._downloader.params.get('geo_bypass', True):

            if not geo_bypass_context:
                geo_bypass_context = {}

            # Backward compatibility: previously _initialize_geo_bypass
            # expected a list of countries, some 3rd party code may still use
            # it this way
            if isinstance(geo_bypass_context, (list, tuple)):
                geo_bypass_context = {
                    'countries': geo_bypass_context,

            # The whole point of geo bypass mechanism is to fake IP
            # as X-Forwarded-For HTTP header based on some IP block or
            # country code.

            # Path 1: bypassing based on IP block in CIDR notation

            # Explicit IP block specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            ip_block = self._downloader.params.get('geo_bypass_ip_block', None)

            # Otherwise use random IP block from geo bypass context but only
            # if extractor is known as geo bypassable
            ip_blocks = geo_bypass_context.get('ip_blocks')
            if self._GEO_BYPASS and ip_blocks:
                ip_block = random.choice(ip_blocks)

            self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
            if self._downloader.params.get('verbose', False):
                self._downloader.to_screen(
                    '[debug] Using fake IP %s as X-Forwarded-For.'
                    % self._x_forwarded_for_ip)

            # Path 2: bypassing based on country code

            # Explicit country code specified by user, use it right away
            # regardless of whether extractor is geo bypassable or not
            country = self._downloader.params.get('geo_bypass_country', None)

            # Otherwise use random country code from geo bypass context but
            # only if extractor is known as geo bypassable
            countries = geo_bypass_context.get('countries')
            if self._GEO_BYPASS and countries:
                country = random.choice(countries)

            self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
            if self._downloader.params.get('verbose', False):
                self._downloader.to_screen(
                    '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
                    % (self._x_forwarded_for_ip, country.upper()))
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(review): the enclosing try/retry scaffolding (try:, returns,
        # final raise) is not visible in this dump; indentation below is
        # best-effort.
            ie_result = self._real_extract(url)
            if self._x_forwarded_for_ip:
                # Record the faked IP so the downloader can reuse it for the
                # actual media download requests as well.
                ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
        except GeoRestrictedError as e:
            # Geo restriction detected: optionally retry once with a faked
            # IP from one of the allowed countries.
            if self.__maybe_fake_ip_and_retry(e.countries):
        except ExtractorError:
            # Already a well-formed extractor error; let it propagate.
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True)
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e)
    def __maybe_fake_ip_and_retry(self, countries):
        # Retry with a faked X-Forwarded-For IP only when the user did not
        # force a specific country, geo bypass is enabled and no fake IP has
        # been assigned yet.
        # NOTE(review): the closing of this condition, the report_warning(
        # call line and the return statements are not visible in this dump.
        if (not self._downloader.params.get('geo_bypass_country', None)
                and self._downloader.params.get('geo_bypass', True)
                and not self._x_forwarded_for_ip
            country_code = random.choice(countries)
            self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
            if self._x_forwarded_for_ip:
                    'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
                    % (self._x_forwarded_for_ip, country_code.upper()))
    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        # The downloader is the owning YoutubeDL instance (or None); it
        # provides the params dict, urlopen() and the reporting helpers
        # used throughout this class.
        self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process (e.g. logging in). Redefine in subclasses."""
    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses.

        Must return an info dict (or playlist/url result) for the given URL;
        extract() post-processes its return value.
        """
578 """A string for getting the InfoExtractor with get_info_extractor"""
579 return compat_str(cls.__name__[:-2])
583 return compat_str(type(self).__name__[:-2])
586 def __can_accept_status_code(err, expected_status):
587 assert isinstance(err, compat_urllib_error.HTTPError)
588 if expected_status is None:
590 if isinstance(expected_status, compat_integer_types):
591 return err.code == expected_status
592 elif isinstance(expected_status, (list, tuple)):
593 return err.code in expected_status
594 elif callable(expected_status):
595 return expected_status(err.code) is True
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
        """
        Return the response handle.

        See _download_webpage docstring for arguments specification.
        """
        # NOTE(review): several control-flow lines (if/else branches, try:,
        # returns) are not visible in this dump; indentation is best-effort.
            self.report_download_webpage(video_id)
        elif note is not False:
                self.to_screen('%s' % (note,))
                self.to_screen('%s: %s' % (video_id, note))

        # Some sites check X-Forwarded-For HTTP header in order to figure out
        # the origin of the client behind proxy. This allows bypassing geo
        # restriction by faking this header's value to IP that belongs to some
        # geo unrestricted country. We will do so once we encounter any
        # geo restriction error.
        if self._x_forwarded_for_ip:
            # Never clobber a header the caller set explicitly.
            if 'X-Forwarded-For' not in headers:
                headers['X-Forwarded-For'] = self._x_forwarded_for_ip

        if isinstance(url_or_request, compat_urllib_request.Request):
            url_or_request = update_Request(
                url_or_request, data=data, headers=headers, query=query)
            # Plain-URL path: fold query/data/headers into a new Request.
                url_or_request = update_url_query(url_or_request, query)
            if data is not None or headers:
                url_or_request = sanitized_Request(url_or_request, data, headers)
        exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
        # ssl.CertificateError is not available on all supported builds.
        if hasattr(ssl, 'CertificateError'):
            exceptions.append(ssl.CertificateError)
            return self._downloader.urlopen(url_or_request)
        except tuple(exceptions) as err:
            if isinstance(err, compat_urllib_error.HTTPError):
                if self.__can_accept_status_code(err, expected_status):
                    # Retain reference to error to prevent file object from
                    # being closed before it can be read. Works around the
                    # effects of <https://bugs.python.org/issue15002>
                    # introduced in Python 3.4.1.

                errnote = 'Unable to download webpage'

            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
                self._downloader.report_warning(errmsg)
    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
        """
        Return a tuple (page content as string, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
        # NOTE(review): the guard that bails out when urlh is False
        # (non-fatal download failure) is not visible in this dump.
        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
        return (content, urlh)
    def _guess_encoding_from_content(content_type, webpage_bytes):
        """Best-effort page-encoding detection from the Content-Type header,
        a <meta charset> tag or a byte-order mark."""
        # NOTE(review): the 'if m:'/'else:' scaffolding, further BOM checks
        # and the final return are not visible in this dump.
        # Charset in the Content-Type header, e.g. 'text/html; charset=utf-8'
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
            encoding = m.group(1)
            # Fall back to a <meta ... charset=...> tag in the first KiB.
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
                encoding = m.group(1).decode('ascii')
        elif webpage_bytes.startswith(b'\xff\xfe'):
    def __check_blocked(self, content):
        """Raise ExtractorError when content is a recognized censorship or
        filtering block page (Websense, Indian censorship, Russian RKN)."""
        # NOTE(review): a few guard lines ('if blocked_iframe:', 'msg = (',
        # 'if block_msg:', trailing 'expected=True)') are not visible in
        # this dump.
        first_block = content[:512]
        # Websense corporate filtering software
        if ('<title>Access to this site is blocked</title>' in content
                and 'Websense' in first_block):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        # Indian government block page
        if '<title>The URL you requested has been blocked</title>' in first_block:
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)
        # Russian RKN blocklist page
        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
                and 'blocklist.rkn.gov.ru' in content):
            raise ExtractorError(
                'Access to this webpage has been blocked by decision of the Russian government. '
                'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        """Read the response body, optionally dump/persist it for debugging,
        decode it and run the block-page check; returns the page content."""
        # NOTE(review): a few guard lines ('if encoding is None:',
        # 'if len(basen) > 240:', the try/except LookupError around decoding
        # and the final return) are not visible in this dump.
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
            self.to_screen('Dumping request to ' + urlh.geturl())
            # base64 keeps binary responses printable on the console
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            basen = '%s_%s' % (video_id, urlh.geturl())
            # Trim over-long names while keeping them unique via an md5 suffix
            h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
            basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)
            content = webpage_bytes.decode(encoding, 'replace')
            # Fallback decode when the guessed encoding is unknown
            content = webpage_bytes.decode('utf-8', 'replace')
        self.__check_blocked(content)
    def _download_webpage(
            self, url_or_request, video_id, note=None, errnote=None,
            fatal=True, tries=1, timeout=5, encoding=None, data=None,
            headers={}, query={}, expected_status=None):
        """
        Return the data of the page as a string.

        Arguments:
        url_or_request -- plain text URL as a string or
            a compat_urllib_request.Request object
        video_id -- Video/playlist/item identifier (string)

        Keyword arguments:
        note -- note printed before downloading (string)
        errnote -- note printed in case of an error (string)
        fatal -- flag denoting whether error should be considered fatal,
            i.e. whether it should cause ExtractorError to be raised,
            otherwise a warning will be reported and extraction continued
        tries -- number of tries
        timeout -- sleep interval between tries
        encoding -- encoding for a page content decoding, guessed automatically
            when not explicitly specified
        data -- POST data (bytes)
        headers -- HTTP headers (dict)
        query -- URL query (dict)
        expected_status -- allows to accept failed HTTP requests (non 2xx
            status code) by explicitly specifying a set of accepted status
            codes. Can be any of the following entities:
            - an integer type specifying an exact failed status code to
              accept
            - a list or a tuple of integer types specifying a list of
              failed status codes to accept
            - a callable accepting an actual failed status code and
              returning True if it should be accepted
            Note that this argument does not affect success status codes (2xx)
            which are always accepted.
        """
        # NOTE(review): the success/try_count initialisation, the try: around
        # the download and the final return statements are not visible in
        # this dump.
        while success is False:
                res = self._download_webpage_handle(
                    url_or_request, video_id, note, errnote, fatal,
                    encoding=encoding, data=data, headers=headers, query=query,
                    expected_status=expected_status)
            except compat_http_client.IncompleteRead as e:
                # Give up after the configured number of tries; otherwise
                # sleep and retry.
                if try_count >= tries:
                self._sleep(timeout, video_id)
    def _download_xml_handle(
            self, url_or_request, video_id, note='Downloading XML',
            errnote='Unable to download XML', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (xml as an compat_etree_Element, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        # NOTE(review): the 'if res is False: return res' guard and the
        # trailing arguments/urlh of the return below are not visible in
        # this dump.
        xml_string, urlh = res
        return self._parse_xml(
            xml_string, video_id, transform_source=transform_source,
            self, url_or_request, video_id,
            note='Downloading XML', errnote='Unable to download XML',
            transform_source=None, fatal=True, encoding=None,
            data=None, headers={}, query={}, expected_status=None):
        """
        Return the xml as an compat_etree_Element.

        See _download_webpage docstring for arguments specification.
        """
        # NOTE(review): the 'def _download_xml(' line itself is not visible
        # in this dump.
        res = self._download_xml_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        # False (non-fatal download failure) is propagated as-is.
        return res if res is False else res[0]
852 def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
854 xml_string = transform_source(xml_string)
856 return compat_etree_fromstring(xml_string.encode('utf-8'))
857 except compat_xml_parse_error as ve:
858 errmsg = '%s: Failed to parse XML ' % video_id
860 raise ExtractorError(errmsg, cause=ve)
862 self.report_warning(errmsg + str(ve))
    def _download_json_handle(
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return a tuple (JSON object, URL handle).

        See _download_webpage docstring for arguments specification.
        """
        res = self._download_webpage_handle(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query,
            expected_status=expected_status)
        # NOTE(review): the 'if res is False: return res' guard and the
        # trailing arguments/urlh of the return below are not visible in
        # this dump.
        json_string, urlh = res
        return self._parse_json(
            json_string, video_id, transform_source=transform_source,
            self, url_or_request, video_id, note='Downloading JSON metadata',
            errnote='Unable to download JSON metadata', transform_source=None,
            fatal=True, encoding=None, data=None, headers={}, query={},
            expected_status=None):
        """
        Return the JSON object as a dict.

        See _download_webpage docstring for arguments specification.
        """
        # NOTE(review): the 'def _download_json(' line itself is not visible
        # in this dump.
        res = self._download_json_handle(
            url_or_request, video_id, note=note, errnote=errnote,
            transform_source=transform_source, fatal=fatal, encoding=encoding,
            data=data, headers=headers, query=query,
            expected_status=expected_status)
        # False (non-fatal download failure) is propagated as-is.
        return res if res is False else res[0]
902 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
904 json_string = transform_source(json_string)
906 return json.loads(json_string)
907 except ValueError as ve:
908 errmsg = '%s: Failed to parse JSON ' % video_id
910 raise ExtractorError(errmsg, cause=ve)
912 self.report_warning(errmsg + str(ve))
914 def report_warning(self, msg, video_id=None):
915 idstr = '' if video_id is None else '%s: ' % video_id
916 self._downloader.report_warning(
917 '[%s] %s%s' % (self.IE_NAME, idstr, msg))
919 def to_screen(self, msg):
920 """Print msg to screen, prefixing it with '[ie_name]'"""
921 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
923 def report_extraction(self, id_or_name):
924 """Report information extraction."""
925 self.to_screen('%s: Extracting information' % id_or_name)
927 def report_download_webpage(self, video_id):
928 """Report webpage download."""
929 self.to_screen('%s: Downloading webpage' % video_id)
931 def report_age_confirmation(self):
932 """Report attempt to confirm age."""
933 self.to_screen('Confirming age')
935 def report_login(self):
936 """Report attempt to log in."""
937 self.to_screen('Logging in')
940 def raise_login_required(msg='This video is only available for registered users'):
941 raise ExtractorError(
942 '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
    """Abort extraction with a GeoRestrictedError carrying the optional
    list of countries from which the media is reachable.
    """
    error = GeoRestrictedError(msg, countries=countries)
    raise error
949 # Methods for following #608
951 def url_result(url, ie=None, video_id=None, video_title=None):
952 """Returns a URL that points to a page that should be processed"""
953 # TODO: ie should be the class used for getting the info
954 video_info = {'_type': 'url',
957 if video_id is not None:
958 video_info['id'] = video_id
959 if video_title is not None:
960 video_info['title'] = video_title
963 def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
965 self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
967 return self.playlist_result(
968 urls, playlist_id=playlist_id, playlist_title=playlist_title)
971 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
972 """Returns a playlist"""
973 video_info = {'_type': 'playlist',
976 video_info['id'] = playlist_id
978 video_info['title'] = playlist_title
979 if playlist_description:
980 video_info['description'] = playlist_description
983 def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
985 Perform a regex search on the given string, using a single or a list of
986 patterns returning the first matching group.
987 In case of failure return a default value or raise a WARNING or a
988 RegexNotFoundError, depending on fatal, specifying the field name.
990 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
991 mobj = re.search(pattern, string, flags)
994 mobj = re.search(p, string, flags)
998 if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
999 _name = '\033[0;34m%s\033[0m' % name
1005 # return the first matching group
1006 return next(g for g in mobj.groups() if g is not None)
1008 return mobj.group(group)
1009 elif default is not NO_DEFAULT:
1012 raise RegexNotFoundError('Unable to extract %s' % _name)
1014 self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
1017 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1019 Like _search_regex, but strips HTML tags and unescapes entities.
1021 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1023 return clean_html(res).strip()
1027 def _get_netrc_login_info(self, netrc_machine=None):
1030 netrc_machine = netrc_machine or self._NETRC_MACHINE
1032 if self._downloader.params.get('usenetrc', False):
1034 info = netrc.netrc().authenticators(netrc_machine)
1035 if info is not None:
1039 raise netrc.NetrcParseError(
1040 'No authenticators for %s' % netrc_machine)
1041 except (IOError, netrc.NetrcParseError) as err:
1042 self._downloader.report_warning(
1043 'parsing .netrc: %s' % error_to_compat_str(err))
1045 return username, password
1047 def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1049 Get the login info as (username, password)
1050 First look for the manually specified credentials using username_option
1051 and password_option as keys in params dictionary. If no such credentials
1052 available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1054 If there's no info available, return (None, None)
1056 if self._downloader is None:
1059 downloader_params = self._downloader.params
1061 # Attempt to use provided username and password or .netrc data
1062 if downloader_params.get(username_option) is not None:
1063 username = downloader_params[username_option]
1064 password = downloader_params[password_option]
1066 username, password = self._get_netrc_login_info(netrc_machine)
1068 return username, password
1070 def _get_tfa_info(self, note='two-factor verification code'):
1072 Get the two-factor authentication info
1073 TODO - asking the user will be required for sms/phone verify
1074 currently just uses the command line option
1075 If there's no info available, return None
1077 if self._downloader is None:
1079 downloader_params = self._downloader.params
1081 if downloader_params.get('twofactor') is not None:
1082 return downloader_params['twofactor']
1084 return compat_getpass('Type %s and press [Return]: ' % note)
1086 # Helper functions for extracting OpenGraph info
1088 def _og_regexes(prop):
1089 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1090 property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
1091 % {'prop': re.escape(prop)})
1092 template = r'<meta[^>]+?%s[^>]+?%s'
1094 template % (property_re, content_re),
1095 template % (content_re, property_re),
1099 def _meta_regex(prop):
1100 return r'''(?isx)<meta
1101 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1102 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1104 def _og_search_property(self, prop, html, name=None, **kargs):
1105 if not isinstance(prop, (list, tuple)):
1108 name = 'OpenGraph %s' % prop[0]
1111 og_regexes.extend(self._og_regexes(p))
1112 escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1115 return unescapeHTML(escaped)
def _og_search_thumbnail(self, html, **kargs):
    """Extract the OpenGraph image property as the thumbnail URL (non-fatal)."""
    thumbnail = self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
    return thumbnail
def _og_search_description(self, html, **kargs):
    """Extract the OpenGraph description property (non-fatal)."""
    description = self._og_search_property('description', html, fatal=False, **kargs)
    return description
def _og_search_title(self, html, **kargs):
    """Extract the OpenGraph title property."""
    og_title = self._og_search_property('title', html, **kargs)
    return og_title
1126 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1127 regexes = self._og_regexes('video') + self._og_regexes('video:url')
1129 regexes = self._og_regexes('video:secure_url') + regexes
1130 return self._html_search_regex(regexes, html, name, **kargs)
def _og_search_url(self, html, **kargs):
    """Extract the OpenGraph url property."""
    og_url = self._og_search_property('url', html, **kargs)
    return og_url
1135 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1136 if not isinstance(name, (list, tuple)):
1138 if display_name is None:
1139 display_name = name[0]
1140 return self._html_search_regex(
1141 [self._meta_regex(n) for n in name],
1142 html, display_name, fatal=fatal, group='content', **kwargs)
def _dc_search_uploader(self, html):
    """Look up the Dublin Core creator meta tag as the uploader name."""
    uploader = self._html_search_meta('dc.creator', html, 'uploader')
    return uploader
1147 def _rta_search(self, html):
1148 # See http://www.rtalabel.org/index.php?content=howtofaq#single
1149 if re.search(r'(?ix)<meta\s+name="rating"\s+'
1150 r' content="RTA-5042-1996-1400-1577-RTA"',
1155 def _media_rating_search(self, html):
1156 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1157 rating = self._html_search_meta('rating', html)
1169 return RATING_TABLE.get(rating.lower())
1171 def _family_friendly_search(self, html):
1172 # See http://schema.org/VideoObject
1173 family_friendly = self._html_search_meta(
1174 'isFamilyFriendly', html, default=None)
1176 if not family_friendly:
1185 return RATING_TABLE.get(family_friendly.lower())
def _twitter_search_player(self, html):
    """Extract the Twitter card player URL from the page's meta tags."""
    player = self._html_search_meta(
        'twitter:player', html, 'twitter card player')
    return player
1191 def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1192 json_ld_list = list(re.finditer(JSON_LD_RE, html))
1193 default = kwargs.get('default', NO_DEFAULT)
1194 # JSON-LD may be malformed and thus `fatal` should be respected.
1195 # At the same time `default` may be passed that assumes `fatal=False`
1196 # for _search_regex. Let's simulate the same behavior here as well.
1197 fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
1199 for mobj in json_ld_list:
1200 json_ld_item = self._parse_json(
1201 mobj.group('json_ld'), video_id, fatal=fatal)
1202 if not json_ld_item:
1204 if isinstance(json_ld_item, dict):
1205 json_ld.append(json_ld_item)
1206 elif isinstance(json_ld_item, (list, tuple)):
1207 json_ld.extend(json_ld_item)
1209 json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1212 if default is not NO_DEFAULT:
1215 raise RegexNotFoundError('Unable to extract JSON-LD')
1217 self._downloader.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1220 def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1221 if isinstance(json_ld, compat_str):
1222 json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1226 if not isinstance(json_ld, (list, tuple, dict)):
1228 if isinstance(json_ld, dict):
1231 INTERACTION_TYPE_MAP = {
1232 'CommentAction': 'comment',
1233 'AgreeAction': 'like',
1234 'DisagreeAction': 'dislike',
1235 'LikeAction': 'like',
1236 'DislikeAction': 'dislike',
1237 'ListenAction': 'view',
1238 'WatchAction': 'view',
1239 'ViewAction': 'view',
1242 def extract_interaction_type(e):
1243 interaction_type = e.get('interactionType')
1244 if isinstance(interaction_type, dict):
1245 interaction_type = interaction_type.get('@type')
1246 return str_or_none(interaction_type)
1248 def extract_interaction_statistic(e):
1249 interaction_statistic = e.get('interactionStatistic')
1250 if isinstance(interaction_statistic, dict):
1251 interaction_statistic = [interaction_statistic]
1252 if not isinstance(interaction_statistic, list):
1254 for is_e in interaction_statistic:
1255 if not isinstance(is_e, dict):
1257 if is_e.get('@type') != 'InteractionCounter':
1259 interaction_type = extract_interaction_type(is_e)
1260 if not interaction_type:
1262 # For interaction count some sites provide string instead of
1263 # an integer (as per spec) with non digit characters (e.g. ",")
1264 # so extracting count with more relaxed str_to_int
1265 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1266 if interaction_count is None:
1268 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1271 count_key = '%s_count' % count_kind
1272 if info.get(count_key) is not None:
1274 info[count_key] = interaction_count
1276 def extract_video_object(e):
1277 assert e['@type'] == 'VideoObject'
1278 author = e.get('author')
1280 'url': url_or_none(e.get('contentUrl')),
1281 'title': unescapeHTML(e.get('name')),
1282 'description': unescapeHTML(e.get('description')),
1283 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
1284 'duration': parse_duration(e.get('duration')),
1285 'timestamp': unified_timestamp(e.get('uploadDate')),
1286 # author can be an instance of 'Organization' or 'Person' types.
1287 # both types can have 'name' property(inherited from 'Thing' type). [1]
1288 # however some websites are using 'Text' type instead.
1289 # 1. https://schema.org/VideoObject
1290 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
1291 'filesize': float_or_none(e.get('contentSize')),
1292 'tbr': int_or_none(e.get('bitrate')),
1293 'width': int_or_none(e.get('width')),
1294 'height': int_or_none(e.get('height')),
1295 'view_count': int_or_none(e.get('interactionCount')),
1297 extract_interaction_statistic(e)
1301 item_type = e.get('@type')
1302 if expected_type is not None and expected_type != item_type:
1304 if item_type in ('TVEpisode', 'Episode'):
1305 episode_name = unescapeHTML(e.get('name'))
1307 'episode': episode_name,
1308 'episode_number': int_or_none(e.get('episodeNumber')),
1309 'description': unescapeHTML(e.get('description')),
1311 if not info.get('title') and episode_name:
1312 info['title'] = episode_name
1313 part_of_season = e.get('partOfSeason')
1314 if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1316 'season': unescapeHTML(part_of_season.get('name')),
1317 'season_number': int_or_none(part_of_season.get('seasonNumber')),
1319 part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1320 if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1321 info['series'] = unescapeHTML(part_of_series.get('name'))
1322 elif item_type == 'Movie':
1324 'title': unescapeHTML(e.get('name')),
1325 'description': unescapeHTML(e.get('description')),
1326 'duration': parse_duration(e.get('duration')),
1327 'timestamp': unified_timestamp(e.get('dateCreated')),
1329 elif item_type in ('Article', 'NewsArticle'):
1331 'timestamp': parse_iso8601(e.get('datePublished')),
1332 'title': unescapeHTML(e.get('headline')),
1333 'description': unescapeHTML(e.get('articleBody')),
1335 elif item_type == 'VideoObject':
1336 extract_video_object(e)
1337 if expected_type is None:
1341 video = e.get('video')
1342 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1343 extract_video_object(video)
1344 if expected_type is None:
1348 return dict((k, v) for k, v in info.items() if v is not None)
1351 def _hidden_inputs(html):
1352 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1354 for input in re.findall(r'(?i)(<input[^>]+>)', html):
1355 attrs = extract_attributes(input)
1358 if attrs.get('type') not in ('hidden', 'submit'):
1360 name = attrs.get('name') or attrs.get('id')
1361 value = attrs.get('value')
1362 if name and value is not None:
1363 hidden_inputs[name] = value
1364 return hidden_inputs
def _form_hidden_inputs(self, form_id, html):
    """Locate the <form> whose id attribute equals form_id in html and
    return its hidden input fields as a name -> value dict.
    """
    form_re = r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id
    form_html = self._search_regex(
        form_re, html, '%s form' % form_id, group='form')
    return self._hidden_inputs(form_html)
1372 def _sort_formats(self, formats, field_preference=None):
1374 raise ExtractorError('No video formats found')
1377 # Automatically determine tbr when missing based on abr and vbr (improves
1378 # formats sorting in some cases)
1379 if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
1380 f['tbr'] = f['abr'] + f['vbr']
1382 def _formats_key(f):
1383 # TODO remove the following workaround
1384 from ..utils import determine_ext
1385 if not f.get('ext') and 'url' in f:
1386 f['ext'] = determine_ext(f['url'])
1388 if isinstance(field_preference, (list, tuple)):
1391 if f.get(field) is not None
1392 else ('' if field == 'format_id' else -1)
1393 for field in field_preference)
1395 preference = f.get('preference')
1396 if preference is None:
1398 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
1401 protocol = f.get('protocol') or determine_protocol(f)
1402 proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)
1404 if f.get('vcodec') == 'none': # audio only
1406 if self._downloader.params.get('prefer_free_formats'):
1407 ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
1409 ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
1412 audio_ext_preference = ORDER.index(f['ext'])
1414 audio_ext_preference = -1
1416 if f.get('acodec') == 'none': # video only
1418 if self._downloader.params.get('prefer_free_formats'):
1419 ORDER = ['flv', 'mp4', 'webm']
1421 ORDER = ['webm', 'flv', 'mp4']
1423 ext_preference = ORDER.index(f['ext'])
1426 audio_ext_preference = 0
1430 f.get('language_preference') if f.get('language_preference') is not None else -1,
1431 f.get('quality') if f.get('quality') is not None else -1,
1432 f.get('tbr') if f.get('tbr') is not None else -1,
1433 f.get('filesize') if f.get('filesize') is not None else -1,
1434 f.get('vbr') if f.get('vbr') is not None else -1,
1435 f.get('height') if f.get('height') is not None else -1,
1436 f.get('width') if f.get('width') is not None else -1,
1439 f.get('abr') if f.get('abr') is not None else -1,
1440 audio_ext_preference,
1441 f.get('fps') if f.get('fps') is not None else -1,
1442 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
1443 f.get('source_preference') if f.get('source_preference') is not None else -1,
1444 f.get('format_id') if f.get('format_id') is not None else '',
1446 formats.sort(key=_formats_key)
1448 def _check_formats(self, formats, video_id):
1450 formats[:] = filter(
1451 lambda f: self._is_valid_url(
1453 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1457 def _remove_duplicate_formats(formats):
1461 if f['url'] not in format_urls:
1462 format_urls.add(f['url'])
1463 unique_formats.append(f)
1464 formats[:] = unique_formats
1466 def _is_valid_url(self, url, video_id, item='video', headers={}):
1467 url = self._proto_relative_url(url, scheme='http:')
1468 # For now assume non HTTP(S) URLs always valid
1469 if not (url.startswith('http://') or url.startswith('https://')):
1472 self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1474 except ExtractorError as e:
1476 '%s: %s URL is invalid, skipping: %s'
1477 % (video_id, item, error_to_compat_str(e.cause)))
1480 def http_scheme(self):
1481 """ Either "http:" or "https:", depending on the user's preferences """
1484 if self._downloader.params.get('prefer_insecure', False)
1487 def _proto_relative_url(self, url, scheme=None):
1490 if url.startswith('//'):
1492 scheme = self.http_scheme()
1497 def _sleep(self, timeout, video_id, msg_template=None):
1498 if msg_template is None:
1499 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1500 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1504 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1505 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1506 fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1507 manifest = self._download_xml(
1508 manifest_url, video_id, 'Downloading f4m manifest',
1509 'Unable to download f4m manifest',
1510 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1511 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1512 transform_source=transform_source,
1513 fatal=fatal, data=data, headers=headers, query=query)
1515 if manifest is False:
1518 return self._parse_f4m_formats(
1519 manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1520 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1522 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
1523 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1524 fatal=True, m3u8_id=None):
1525 if not isinstance(manifest, compat_etree_Element) and not fatal:
1528 # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1529 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1530 if akamai_pv is not None and ';' in akamai_pv.text:
1531 playerVerificationChallenge = akamai_pv.text.split(';')[0]
1532 if playerVerificationChallenge.strip() != '':
1536 manifest_version = '1.0'
1537 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1539 manifest_version = '2.0'
1540 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1541 # Remove unsupported DRM protected media from final formats
1542 # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1543 media_nodes = remove_encrypted_media(media_nodes)
1547 manifest_base_url = get_base_url(manifest)
1549 bootstrap_info = xpath_element(
1550 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1551 'bootstrap info', default=None)
1554 mime_type = xpath_text(
1555 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1556 'base URL', default=None)
1557 if mime_type and mime_type.startswith('audio/'):
1560 for i, media_el in enumerate(media_nodes):
1561 tbr = int_or_none(media_el.attrib.get('bitrate'))
1562 width = int_or_none(media_el.attrib.get('width'))
1563 height = int_or_none(media_el.attrib.get('height'))
1564 format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1565 # If <bootstrapInfo> is present, the specified f4m is a
1566 # stream-level manifest, and only set-level manifests may refer to
1567 # external resources. See section 11.4 and section 4 of F4M spec
1568 if bootstrap_info is None:
1570 # @href is introduced in 2.0, see section 11.6 of F4M spec
1571 if manifest_version == '2.0':
1572 media_url = media_el.attrib.get('href')
1573 if media_url is None:
1574 media_url = media_el.attrib.get('url')
1578 media_url if media_url.startswith('http://') or media_url.startswith('https://')
1579 else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1580 # If media_url is itself a f4m manifest do the recursive extraction
1581 # since bitrates in parent manifest (this one) and media_url manifest
1582 # may differ leading to inability to resolve the format by requested
1583 # bitrate in f4m downloader
1584 ext = determine_ext(manifest_url)
1586 f4m_formats = self._extract_f4m_formats(
1587 manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1588 transform_source=transform_source, fatal=fatal)
1589 # Sometimes stream-level manifest contains single media entry that
1590 # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1591 # At the same time parent's media entry in set-level manifest may
1592 # contain it. We will copy it from parent in such cases.
1593 if len(f4m_formats) == 1:
1596 'tbr': f.get('tbr') or tbr,
1597 'width': f.get('width') or width,
1598 'height': f.get('height') or height,
1599 'format_id': f.get('format_id') if not tbr else format_id,
1602 formats.extend(f4m_formats)
1605 formats.extend(self._extract_m3u8_formats(
1606 manifest_url, video_id, 'mp4', preference=preference,
1607 m3u8_id=m3u8_id, fatal=fatal))
1610 'format_id': format_id,
1611 'url': manifest_url,
1612 'manifest_url': manifest_url,
1613 'ext': 'flv' if bootstrap_info is not None else None,
1619 'preference': preference,
1623 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1625 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1629 'preference': preference - 100 if preference else -100,
1630 'resolution': 'multiple',
1631 'format_note': 'Quality selection URL',
1634 def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1635 entry_protocol='m3u8', preference=None,
1636 m3u8_id=None, note=None, errnote=None,
1637 fatal=True, live=False, data=None, headers={},
1639 res = self._download_webpage_handle(
1641 note=note or 'Downloading m3u8 information',
1642 errnote=errnote or 'Failed to download m3u8 information',
1643 fatal=fatal, data=data, headers=headers, query=query)
1648 m3u8_doc, urlh = res
1649 m3u8_url = urlh.geturl()
1651 return self._parse_m3u8_formats(
1652 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1653 preference=preference, m3u8_id=m3u8_id, live=live)
1655 def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
1656 entry_protocol='m3u8', preference=None,
1657 m3u8_id=None, live=False):
1658 if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
1661 if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc): # Apple FairPlay
1666 format_url = lambda u: (
1668 if re.match(r'^https?://', u)
1669 else compat_urlparse.urljoin(m3u8_url, u))
1672 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
1673 # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
1674 # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
1676 # We should try extracting formats only from master playlists [1, 4.3.4],
1677 # i.e. playlists that describe available qualities. On the other hand
1678 # media playlists [1, 4.3.3] should be returned as is since they contain
1679 # just the media without qualities renditions.
1680 # Fortunately, master playlist can be easily distinguished from media
1681 # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
1682 # master playlist tags MUST NOT appear in a media playlist and vice versa.
1683 # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
1684 # media playlist and MUST NOT appear in master playlist thus we can
1685 # clearly detect media playlist with this criterion.
1687 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
1690 'format_id': m3u8_id,
1692 'protocol': entry_protocol,
1693 'preference': preference,
1697 last_stream_inf = {}
1699 def extract_media(x_media_line):
1700 media = parse_m3u8_attributes(x_media_line)
1701 # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
1702 media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
1703 if not (media_type and group_id and name):
1705 groups.setdefault(group_id, []).append(media)
1706 if media_type not in ('VIDEO', 'AUDIO'):
1708 media_url = media.get('URI')
1711 for v in (m3u8_id, group_id, name):
1715 'format_id': '-'.join(format_id),
1716 'url': format_url(media_url),
1717 'manifest_url': m3u8_url,
1718 'language': media.get('LANGUAGE'),
1720 'protocol': entry_protocol,
1721 'preference': preference,
1723 if media_type == 'AUDIO':
1724 f['vcodec'] = 'none'
1727 def build_stream_name():
1728 # Despite specification does not mention NAME attribute for
1729 # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
1730 # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
1731 # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
1732 stream_name = last_stream_inf.get('NAME')
1735 # If there is no NAME in EXT-X-STREAM-INF it will be obtained
1736 # from corresponding rendition group
1737 stream_group_id = last_stream_inf.get('VIDEO')
1738 if not stream_group_id:
1740 stream_group = groups.get(stream_group_id)
1741 if not stream_group:
1742 return stream_group_id
1743 rendition = stream_group[0]
1744 return rendition.get('NAME') or stream_group_id
1746 # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
1747 # chance to detect video only formats when EXT-X-STREAM-INF tags
1748 # precede EXT-X-MEDIA tags in HLS manifest such as [3].
1749 for line in m3u8_doc.splitlines():
1750 if line.startswith('#EXT-X-MEDIA:'):
1753 for line in m3u8_doc.splitlines():
1754 if line.startswith('#EXT-X-STREAM-INF:'):
1755 last_stream_inf = parse_m3u8_attributes(line)
1756 elif line.startswith('#') or not line.strip():
1759 tbr = float_or_none(
1760 last_stream_inf.get('AVERAGE-BANDWIDTH')
1761 or last_stream_inf.get('BANDWIDTH'), scale=1000)
1764 format_id.append(m3u8_id)
1765 stream_name = build_stream_name()
1766 # Bandwidth of live streams may differ over time thus making
1767 # format_id unpredictable. So it's better to keep provided
1770 format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
1771 manifest_url = format_url(line.strip())
1773 'format_id': '-'.join(format_id),
1774 'url': manifest_url,
1775 'manifest_url': m3u8_url,
1778 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
1779 'protocol': entry_protocol,
1780 'preference': preference,
1782 resolution = last_stream_inf.get('RESOLUTION')
1784 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
1786 f['width'] = int(mobj.group('width'))
1787 f['height'] = int(mobj.group('height'))
1788 # Unified Streaming Platform
1790 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
1792 abr, vbr = mobj.groups()
1793 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
1798 codecs = parse_codecs(last_stream_inf.get('CODECS'))
1800 audio_group_id = last_stream_inf.get('AUDIO')
1801 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
1802 # references a rendition group MUST have a CODECS attribute.
1803 # However, this is not always respected, for example, [2]
1804 # contains EXT-X-STREAM-INF tag which references AUDIO
1805 # rendition group but does not have CODECS and despite
1806 # referencing an audio group it represents a complete
1807 # (with audio and video) format. So, for such cases we will
1808 # ignore references to rendition groups and treat them
1809 # as complete formats.
1810 if audio_group_id and codecs and f.get('vcodec') != 'none':
1811 audio_group = groups.get(audio_group_id)
1812 if audio_group and audio_group[0].get('URI'):
1813 # TODO: update acodec for audio only formats with
1815 f['acodec'] = 'none'
1819 progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
1822 del http_f['manifest_url']
1824 'format_id': f['format_id'].replace('hls-', 'http-'),
1826 'url': progressive_uri,
1828 formats.append(http_f)
1830 last_stream_inf = {}
1834 def _xpath_ns(path, namespace=None):
1838 for c in path.split('/'):
1839 if not c or c == '.':
1842 out.append('{%s}%s' % (namespace, c))
1843 return '/'.join(out)
1845 def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1846 smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1852 namespace = self._parse_smil_namespace(smil)
1854 return self._parse_smil_formats(
1855 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1857 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1858 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1861 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
    """Fetch the SMIL document at smil_url and return it parsed as XML."""
    smil = self._download_xml(
        smil_url, video_id,
        'Downloading SMIL file', 'Unable to download SMIL file',
        fatal=fatal, transform_source=transform_source)
    return smil
1868 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1869 namespace = self._parse_smil_namespace(smil)
1871 formats = self._parse_smil_formats(
1872 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1873 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1875 video_id = os.path.splitext(url_basename(smil_url))[0]
1879 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1880 name = meta.attrib.get('name')
1881 content = meta.attrib.get('content')
1882 if not name or not content:
1884 if not title and name == 'title':
1886 elif not description and name in ('description', 'abstract'):
1887 description = content
1888 elif not upload_date and name == 'date':
1889 upload_date = unified_strdate(content)
1892 'id': image.get('type'),
1893 'url': image.get('src'),
1894 'width': int_or_none(image.get('width')),
1895 'height': int_or_none(image.get('height')),
1896 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1900 'title': title or video_id,
1901 'description': description,
1902 'upload_date': upload_date,
1903 'thumbnails': thumbnails,
1905 'subtitles': subtitles,
def _parse_smil_namespace(self, smil):
    """Return the XML namespace of a parsed SMIL document, or None if the
    root tag carries no namespace.
    """
    root_tag = smil.tag
    return self._search_regex(
        r'(?i)^{([^}]+)?}smil$', root_tag, 'namespace', default=None)
1912 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1914 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1915 b = meta.get('base') or meta.get('httpBase')
1926 media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1927 for medium in media:
1928 src = medium.get('src')
1929 if not src or src in srcs:
1933 bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1934 filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1935 width = int_or_none(medium.get('width'))
1936 height = int_or_none(medium.get('height'))
1937 proto = medium.get('proto')
1938 ext = medium.get('ext')
1939 src_ext = determine_ext(src)
1940 streamer = medium.get('streamer') or base
1942 if proto == 'rtmp' or streamer.startswith('rtmp'):
1948 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1950 'filesize': filesize,
1954 if transform_rtmp_url:
1955 streamer, src = transform_rtmp_url(streamer, src)
1956 formats[-1].update({
1962 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1963 src_url = src_url.strip()
1965 if proto == 'm3u8' or src_ext == 'm3u8':
1966 m3u8_formats = self._extract_m3u8_formats(
1967 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1968 if len(m3u8_formats) == 1:
1970 m3u8_formats[0].update({
1971 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1976 formats.extend(m3u8_formats)
1977 elif src_ext == 'f4m':
1982 'plugin': 'flowplayer-3.2.0.1',
1984 f4m_url += '&' if '?' in f4m_url else '?'
1985 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1986 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1987 elif src_ext == 'mpd':
1988 formats.extend(self._extract_mpd_formats(
1989 src_url, video_id, mpd_id='dash', fatal=False))
1990 elif re.search(r'\.ism/[Mm]anifest', src_url):
1991 formats.extend(self._extract_ism_formats(
1992 src_url, video_id, ism_id='mss', fatal=False))
1993 elif src_url.startswith('http') and self._is_valid_url(src, video_id):
1997 'ext': ext or src_ext or 'flv',
1998 'format_id': 'http-%d' % (bitrate or http_count),
2000 'filesize': filesize,
2007 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2010 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2011 src = textstream.get('src')
2012 if not src or src in urls:
2015 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2016 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2017 subtitles.setdefault(lang, []).append({
2023 def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2024 xspf = self._download_xml(
2025 xspf_url, playlist_id, 'Downloading xpsf playlist',
2026 'Unable to download xspf manifest', fatal=fatal)
2029 return self._parse_xspf(
2030 xspf, playlist_id, xspf_url=xspf_url,
2031 xspf_base_url=base_url(xspf_url))
2033 def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2035 'xspf': 'http://xspf.org/ns/0/',
2036 's1': 'http://static.streamone.nl/player/ns/0',
2040 for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2042 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2043 description = xpath_text(
2044 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2045 thumbnail = xpath_text(
2046 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2047 duration = float_or_none(
2048 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2051 for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2052 format_url = urljoin(xspf_base_url, location.text)
2057 'manifest_url': xspf_url,
2058 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2059 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2060 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2062 self._sort_formats(formats)
2067 'description': description,
2068 'thumbnail': thumbnail,
2069 'duration': duration,
2074 def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2075 res = self._download_xml_handle(
2077 note=note or 'Downloading MPD manifest',
2078 errnote=errnote or 'Failed to download MPD manifest',
2079 fatal=fatal, data=data, headers=headers, query=query)
2085 mpd_base_url = base_url(urlh.geturl())
2087 return self._parse_mpd_formats(
2088 mpd_doc, mpd_id, mpd_base_url, mpd_url)
2090 def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2092 Parse formats from MPD manifest.
2094 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2095 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2096 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2098 if mpd_doc.get('type') == 'dynamic':
2101 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2104 return self._xpath_ns(path, namespace)
2106 def is_drm_protected(element):
2107 return element.find(_add_ns('ContentProtection')) is not None
2109 def extract_multisegment_info(element, ms_parent_info):
2110 ms_info = ms_parent_info.copy()
2112 # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2113 # common attributes and elements. We will only extract relevant
2115 def extract_common(source):
2116 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2117 if segment_timeline is not None:
2118 s_e = segment_timeline.findall(_add_ns('S'))
2120 ms_info['total_number'] = 0
2123 r = int(s.get('r', 0))
2124 ms_info['total_number'] += 1 + r
2125 ms_info['s'].append({
2126 't': int(s.get('t', 0)),
2127 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2128 'd': int(s.attrib['d']),
2131 start_number = source.get('startNumber')
2133 ms_info['start_number'] = int(start_number)
2134 timescale = source.get('timescale')
2136 ms_info['timescale'] = int(timescale)
2137 segment_duration = source.get('duration')
2138 if segment_duration:
2139 ms_info['segment_duration'] = float(segment_duration)
2141 def extract_Initialization(source):
2142 initialization = source.find(_add_ns('Initialization'))
2143 if initialization is not None:
2144 ms_info['initialization_url'] = initialization.attrib['sourceURL']
2146 segment_list = element.find(_add_ns('SegmentList'))
2147 if segment_list is not None:
2148 extract_common(segment_list)
2149 extract_Initialization(segment_list)
2150 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2152 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2154 segment_template = element.find(_add_ns('SegmentTemplate'))
2155 if segment_template is not None:
2156 extract_common(segment_template)
2157 media = segment_template.get('media')
2159 ms_info['media'] = media
2160 initialization = segment_template.get('initialization')
2162 ms_info['initialization'] = initialization
2164 extract_Initialization(segment_template)
2167 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2169 for period in mpd_doc.findall(_add_ns('Period')):
2170 period_duration = parse_duration(period.get('duration')) or mpd_duration
2171 period_ms_info = extract_multisegment_info(period, {
2175 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2176 if is_drm_protected(adaptation_set):
2178 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2179 for representation in adaptation_set.findall(_add_ns('Representation')):
2180 if is_drm_protected(representation):
2182 representation_attrib = adaptation_set.attrib.copy()
2183 representation_attrib.update(representation.attrib)
2184 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2185 mime_type = representation_attrib['mimeType']
2186 content_type = mime_type.split('/')[0]
2187 if content_type == 'text':
2188 # TODO implement WebVTT downloading
2190 elif content_type in ('video', 'audio'):
2192 for element in (representation, adaptation_set, period, mpd_doc):
2193 base_url_e = element.find(_add_ns('BaseURL'))
2194 if base_url_e is not None:
2195 base_url = base_url_e.text + base_url
2196 if re.match(r'^https?://', base_url):
2198 if mpd_base_url and not re.match(r'^https?://', base_url):
2199 if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
2201 base_url = mpd_base_url + base_url
2202 representation_id = representation_attrib.get('id')
2203 lang = representation_attrib.get('lang')
2204 url_el = representation.find(_add_ns('BaseURL'))
2205 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2206 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2208 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
2209 'manifest_url': mpd_url,
2210 'ext': mimetype2ext(mime_type),
2211 'width': int_or_none(representation_attrib.get('width')),
2212 'height': int_or_none(representation_attrib.get('height')),
2213 'tbr': float_or_none(bandwidth, 1000),
2214 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2215 'fps': int_or_none(representation_attrib.get('frameRate')),
2216 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2217 'format_note': 'DASH %s' % content_type,
2218 'filesize': filesize,
2219 'container': mimetype2ext(mime_type) + '_dash',
2221 f.update(parse_codecs(representation_attrib.get('codecs')))
2222 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2224 def prepare_template(template_name, identifiers):
2225 tmpl = representation_ms_info[template_name]
2226 # First of, % characters outside $...$ templates
2227 # must be escaped by doubling for proper processing
2228 # by % operator string formatting used further (see
2229 # https://github.com/ytdl-org/youtube-dl/issues/16867).
2235 in_template = not in_template
2236 elif c == '%' and not in_template:
2238 # Next, $...$ templates are translated to their
2239 # %(...) counterparts to be used with % operator
2240 t = t.replace('$RepresentationID$', representation_id)
2241 t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2242 t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2243 t.replace('$$', '$')
2246 # @initialization is a regular template like @media one
2247 # so it should be handled just the same way (see
2248 # https://github.com/ytdl-org/youtube-dl/issues/11605)
2249 if 'initialization' in representation_ms_info:
2250 initialization_template = prepare_template(
2252 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2253 # $Time$ shall not be included for @initialization thus
2254 # only $Bandwidth$ remains
2256 representation_ms_info['initialization_url'] = initialization_template % {
2257 'Bandwidth': bandwidth,
2260 def location_key(location):
2261 return 'url' if re.match(r'^https?://', location) else 'path'
2263 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2265 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2266 media_location_key = location_key(media_template)
2268 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2269 # can't be used at the same time
2270 if '%(Number' in media_template and 's' not in representation_ms_info:
2271 segment_duration = None
2272 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2273 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2274 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2275 representation_ms_info['fragments'] = [{
2276 media_location_key: media_template % {
2277 'Number': segment_number,
2278 'Bandwidth': bandwidth,
2280 'duration': segment_duration,
2281 } for segment_number in range(
2282 representation_ms_info['start_number'],
2283 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2285 # $Number*$ or $Time$ in media template with S list available
2286 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2287 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2288 representation_ms_info['fragments'] = []
2291 segment_number = representation_ms_info['start_number']
2293 def add_segment_url():
2294 segment_url = media_template % {
2295 'Time': segment_time,
2296 'Bandwidth': bandwidth,
2297 'Number': segment_number,
2299 representation_ms_info['fragments'].append({
2300 media_location_key: segment_url,
2301 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2304 for num, s in enumerate(representation_ms_info['s']):
2305 segment_time = s.get('t') or segment_time
2309 for r in range(s.get('r', 0)):
2310 segment_time += segment_d
2313 segment_time += segment_d
2314 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2316 # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2317 # or any YouTube dashsegments video
2320 timescale = representation_ms_info['timescale']
2321 for s in representation_ms_info['s']:
2322 duration = float_or_none(s['d'], timescale)
2323 for r in range(s.get('r', 0) + 1):
2324 segment_uri = representation_ms_info['segment_urls'][segment_index]
2326 location_key(segment_uri): segment_uri,
2327 'duration': duration,
2330 representation_ms_info['fragments'] = fragments
2331 elif 'segment_urls' in representation_ms_info:
2332 # Segment URLs with no SegmentTimeline
2333 # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2334 # https://github.com/ytdl-org/youtube-dl/pull/14844
2336 segment_duration = float_or_none(
2337 representation_ms_info['segment_duration'],
2338 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2339 for segment_url in representation_ms_info['segment_urls']:
2341 location_key(segment_url): segment_url,
2343 if segment_duration:
2344 fragment['duration'] = segment_duration
2345 fragments.append(fragment)
2346 representation_ms_info['fragments'] = fragments
2347 # If there is a fragments key available then we correctly recognized fragmented media.
2348 # Otherwise we will assume unfragmented media with direct access. Technically, such
2349 # assumption is not necessarily correct since we may simply have no support for
2350 # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2351 if 'fragments' in representation_ms_info:
2353 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2354 'url': mpd_url or base_url,
2355 'fragment_base_url': base_url,
2357 'protocol': 'http_dash_segments',
2359 if 'initialization_url' in representation_ms_info:
2360 initialization_url = representation_ms_info['initialization_url']
2361 if not f.get('url'):
2362 f['url'] = initialization_url
2363 f['fragments'].append({location_key(initialization_url): initialization_url})
2364 f['fragments'].extend(representation_ms_info['fragments'])
2366 # Assuming direct URL to unfragmented media.
2370 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2373 def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2374 res = self._download_xml_handle(
2376 note=note or 'Downloading ISM manifest',
2377 errnote=errnote or 'Failed to download ISM manifest',
2378 fatal=fatal, data=data, headers=headers, query=query)
2385 return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
2387 def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
2389 Parse formats from ISM manifest.
2391 1. [MS-SSTR]: Smooth Streaming Protocol,
2392 https://msdn.microsoft.com/en-us/library/ff469518.aspx
2394 if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
2397 duration = int(ism_doc.attrib['Duration'])
2398 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2401 for stream in ism_doc.findall('StreamIndex'):
2402 stream_type = stream.get('Type')
2403 if stream_type not in ('video', 'audio'):
2405 url_pattern = stream.attrib['Url']
2406 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2407 stream_name = stream.get('Name')
2408 for track in stream.findall('QualityLevel'):
2409 fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
2410 # TODO: add support for WVC1 and WMAP
2411 if fourcc not in ('H264', 'AVC1', 'AACL'):
2412 self.report_warning('%s is not a supported codec' % fourcc)
2414 tbr = int(track.attrib['Bitrate']) // 1000
2415 # [1] does not mention Width and Height attributes. However,
2416 # they're often present while MaxWidth and MaxHeight are
2417 # missing, so should be used as fallbacks
2418 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2419 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2420 sampling_rate = int_or_none(track.get('SamplingRate'))
2422 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2423 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2429 stream_fragments = stream.findall('c')
2430 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2431 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2432 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2433 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2434 if not fragment_ctx['duration']:
2436 next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2438 next_fragment_time = duration
2439 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2440 for _ in range(fragment_repeat):
2442 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2443 'duration': fragment_ctx['duration'] / stream_timescale,
2445 fragment_ctx['time'] += fragment_ctx['duration']
2449 format_id.append(ism_id)
2451 format_id.append(stream_name)
2452 format_id.append(compat_str(tbr))
2455 'format_id': '-'.join(format_id),
2457 'manifest_url': ism_url,
2458 'ext': 'ismv' if stream_type == 'video' else 'isma',
2462 'asr': sampling_rate,
2463 'vcodec': 'none' if stream_type == 'audio' else fourcc,
2464 'acodec': 'none' if stream_type == 'video' else fourcc,
2466 'fragments': fragments,
2467 '_download_params': {
2468 'duration': duration,
2469 'timescale': stream_timescale,
2470 'width': width or 0,
2471 'height': height or 0,
2473 'codec_private_data': track.get('CodecPrivateData'),
2474 'sampling_rate': sampling_rate,
2475 'channels': int_or_none(track.get('Channels', 2)),
2476 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2477 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2482 def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
2483 def absolute_url(item_url):
2484 return urljoin(base_url, item_url)
2486 def parse_content_type(content_type):
2487 if not content_type:
2489 ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2491 mimetype, codecs = ctr.groups()
2492 f = parse_codecs(codecs)
2493 f['ext'] = mimetype2ext(mimetype)
2497 def _media_formats(src, cur_media_type, type_info={}):
2498 full_url = absolute_url(src)
2499 ext = type_info.get('ext') or determine_ext(full_url)
2501 is_plain_url = False
2502 formats = self._extract_m3u8_formats(
2503 full_url, video_id, ext='mp4',
2504 entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
2505 preference=preference, fatal=False)
2507 is_plain_url = False
2508 formats = self._extract_mpd_formats(
2509 full_url, video_id, mpd_id=mpd_id, fatal=False)
2514 'vcodec': 'none' if cur_media_type == 'audio' else None,
2516 return is_plain_url, formats
2519 # amp-video and amp-audio are very similar to their HTML5 counterparts
2520 # so we wll include them right here (see
2521 # https://www.ampproject.org/docs/reference/components/amp-video)
2522 # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
2523 _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
2524 media_tags = [(media_tag, media_tag_name, media_type, '')
2525 for media_tag, media_tag_name, media_type
2526 in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
2527 media_tags.extend(re.findall(
2528 # We only allow video|audio followed by a whitespace or '>'.
2529 # Allowing more characters may end up in significant slow down (see
2530 # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
2531 # http://www.porntrex.com/maps/videositemap.xml).
2532 r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
2533 for media_tag, _, media_type, media_content in media_tags:
2538 media_attributes = extract_attributes(media_tag)
2539 src = strip_or_none(media_attributes.get('src'))
2541 _, formats = _media_formats(src, media_type)
2542 media_info['formats'].extend(formats)
2543 media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
2545 for source_tag in re.findall(r'<source[^>]+>', media_content):
2546 s_attr = extract_attributes(source_tag)
2547 # data-video-src and data-src are non standard but seen
2548 # several times in the wild
2549 src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
2552 f = parse_content_type(s_attr.get('type'))
2553 is_plain_url, formats = _media_formats(src, media_type, f)
2555 # width, height, res, label and title attributes are
2556 # all not standard but seen several times in the wild
2559 for lbl in ('label', 'title')
2560 if str_or_none(s_attr.get(lbl))
2562 width = int_or_none(s_attr.get('width'))
2563 height = (int_or_none(s_attr.get('height'))
2564 or int_or_none(s_attr.get('res')))
2565 if not width or not height:
2567 resolution = parse_resolution(lbl)
2570 width = width or resolution.get('width')
2571 height = height or resolution.get('height')
2573 tbr = parse_bitrate(lbl)
2582 'format_id': s_attr.get('label') or s_attr.get('title'),
2584 f.update(formats[0])
2585 media_info['formats'].append(f)
2587 media_info['formats'].extend(formats)
2588 for track_tag in re.findall(r'<track[^>]+>', media_content):
2589 track_attributes = extract_attributes(track_tag)
2590 kind = track_attributes.get('kind')
2591 if not kind or kind in ('subtitles', 'captions'):
2592 src = strip_or_none(track_attributes.get('src'))
2595 lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
2596 media_info['subtitles'].setdefault(lang, []).append({
2597 'url': absolute_url(src),
2599 for f in media_info['formats']:
2600 f.setdefault('http_headers', {})['Referer'] = base_url
2601 if media_info['formats'] or media_info['subtitles']:
2602 entries.append(media_info)
2605 def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2606 signed = 'hdnea=' in manifest_url
2608 # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
2609 manifest_url = re.sub(
2610 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
2611 '', manifest_url).strip('?')
2615 hdcore_sign = 'hdcore=3.7.0'
2616 f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2617 hds_host = hosts.get('hds')
2619 f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2620 if 'hdcore=' not in f4m_url:
2621 f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2622 f4m_formats = self._extract_f4m_formats(
2623 f4m_url, video_id, f4m_id='hds', fatal=False)
2624 for entry in f4m_formats:
2625 entry.update({'extra_param_to_segment_url': hdcore_sign})
2626 formats.extend(f4m_formats)
2628 m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2629 hls_host = hosts.get('hls')
2631 m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2632 m3u8_formats = self._extract_m3u8_formats(
2633 m3u8_url, video_id, 'mp4', 'm3u8_native',
2634 m3u8_id='hls', fatal=False)
2635 formats.extend(m3u8_formats)
2637 http_host = hosts.get('http')
2638 if http_host and m3u8_formats and not signed:
2639 REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
2640 qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
2641 qualities_length = len(qualities)
2642 if len(m3u8_formats) in (qualities_length, qualities_length + 1):
2644 for f in m3u8_formats:
2645 if f['vcodec'] != 'none':
2646 for protocol in ('http', 'https'):
2648 del http_f['manifest_url']
2650 REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
2652 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
2654 'protocol': protocol,
2656 formats.append(http_f)
2661 def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
2662 query = compat_urlparse.urlparse(url).query
2663 url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
2665 r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
2666 url_base = mobj.group('url')
2667 http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
2670 def manifest_url(manifest):
2671 m_url = '%s/%s' % (http_base_url, manifest)
2673 m_url += '?%s' % query
2676 if 'm3u8' not in skip_protocols:
2677 formats.extend(self._extract_m3u8_formats(
2678 manifest_url('playlist.m3u8'), video_id, 'mp4',
2679 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2680 if 'f4m' not in skip_protocols:
2681 formats.extend(self._extract_f4m_formats(
2682 manifest_url('manifest.f4m'),
2683 video_id, f4m_id='hds', fatal=False))
2684 if 'dash' not in skip_protocols:
2685 formats.extend(self._extract_mpd_formats(
2686 manifest_url('manifest.mpd'),
2687 video_id, mpd_id='dash', fatal=False))
2688 if re.search(r'(?:/smil:|\.smil)', url_base):
2689 if 'smil' not in skip_protocols:
2690 rtmp_formats = self._extract_smil_formats(
2691 manifest_url('jwplayer.smil'),
2692 video_id, fatal=False)
2693 for rtmp_format in rtmp_formats:
2694 rtsp_format = rtmp_format.copy()
2695 rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2696 del rtsp_format['play_path']
2697 del rtsp_format['ext']
2698 rtsp_format.update({
2699 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2700 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2703 formats.extend([rtmp_format, rtsp_format])
2705 for protocol in ('rtmp', 'rtsp'):
2706 if protocol not in skip_protocols:
2708 'url': '%s:%s' % (protocol, url_base),
2709 'format_id': protocol,
2710 'protocol': protocol,
2714 def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
2716 r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
2720 jwplayer_data = self._parse_json(mobj.group('options'),
2722 transform_source=transform_source)
2723 except ExtractorError:
2726 if isinstance(jwplayer_data, dict):
2727 return jwplayer_data
2729 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2730 jwplayer_data = self._find_jwplayer_data(
2731 webpage, video_id, transform_source=js_to_json)
2732 return self._parse_jwplayer_data(
2733 jwplayer_data, video_id, *args, **kwargs)
2735 def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
2736 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2737 # JWPlayer backward compatibility: flattened playlists
2738 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
2739 if 'playlist' not in jwplayer_data:
2740 jwplayer_data = {'playlist': [jwplayer_data]}
2744 # JWPlayer backward compatibility: single playlist item
2745 # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
2746 if not isinstance(jwplayer_data['playlist'], list):
2747 jwplayer_data['playlist'] = [jwplayer_data['playlist']]
2749 for video_data in jwplayer_data['playlist']:
2750 # JWPlayer backward compatibility: flattened sources
2751 # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
2752 if 'sources' not in video_data:
2753 video_data['sources'] = [video_data]
2755 this_video_id = video_id or video_data['mediaid']
2757 formats = self._parse_jwplayer_formats(
2758 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
2759 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
2762 tracks = video_data.get('tracks')
2763 if tracks and isinstance(tracks, list):
2764 for track in tracks:
2765 if not isinstance(track, dict):
2767 track_kind = track.get('kind')
2768 if not track_kind or not isinstance(track_kind, compat_str):
2770 if track_kind.lower() not in ('captions', 'subtitles'):
2772 track_url = urljoin(base_url, track.get('file'))
2775 subtitles.setdefault(track.get('label') or 'en', []).append({
2776 'url': self._proto_relative_url(track_url)
2780 'id': this_video_id,
2781 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
2782 'description': clean_html(video_data.get('description')),
2783 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
2784 'timestamp': int_or_none(video_data.get('pubdate')),
2785 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
2786 'subtitles': subtitles,
2788 # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
2789 if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
2791 '_type': 'url_transparent',
2792 'url': formats[0]['url'],
2795 self._sort_formats(formats)
2796 entry['formats'] = formats
2797 entries.append(entry)
2798 if len(entries) == 1:
2801 return self.playlist_result(entries)
2803 def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
2804 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2807 for source in jwplayer_sources_data:
2808 if not isinstance(source, dict):
2810 source_url = urljoin(
2811 base_url, self._proto_relative_url(source.get('file')))
2812 if not source_url or source_url in urls:
2814 urls.append(source_url)
2815 source_type = source.get('type') or ''
2816 ext = mimetype2ext(source_type) or determine_ext(source_url)
2817 if source_type == 'hls' or ext == 'm3u8':
2818 formats.extend(self._extract_m3u8_formats(
2819 source_url, video_id, 'mp4', entry_protocol='m3u8_native',
2820 m3u8_id=m3u8_id, fatal=False))
2821 elif source_type == 'dash' or ext == 'mpd':
2822 formats.extend(self._extract_mpd_formats(
2823 source_url, video_id, mpd_id=mpd_id, fatal=False))
2825 formats.extend(self._extract_smil_formats(
2826 source_url, video_id, fatal=False))
2827 # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
2828 elif source_type.startswith('audio') or ext in (
2829 'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
2836 height = int_or_none(source.get('height'))
2838 # Often no height is provided but there is a label in
2839 # format like "1080p", "720p SD", or 1080.
2840 height = int_or_none(self._search_regex(
2841 r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
2842 'height', default=None))
2845 'width': int_or_none(source.get('width')),
2847 'tbr': int_or_none(source.get('bitrate')),
2850 if source_url.startswith('rtmp'):
2851 a_format['ext'] = 'flv'
2852 # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
2853 # of jwplayer.flash.swf
2854 rtmp_url_parts = re.split(
2855 r'((?:mp4|mp3|flv):)', source_url, 1)
2856 if len(rtmp_url_parts) == 3:
2857 rtmp_url, prefix, play_path = rtmp_url_parts
2860 'play_path': prefix + play_path,
2863 a_format.update(rtmp_params)
2864 formats.append(a_format)
2867 def _live_title(self, name):
2868 """ Generate the title for a live video """
2869 now = datetime.datetime.now()
2870 now_str = now.strftime('%Y-%m-%d %H:%M')
2871 return name + ' ' + now_str
2873 def _int(self, v, name, fatal=False, **kwargs):
2874 res = int_or_none(v, **kwargs)
2875 if 'get_attr' in kwargs:
2876 print(getattr(v, kwargs['get_attr']))
2878 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2880 raise ExtractorError(msg)
2882 self._downloader.report_warning(msg)
2885 def _float(self, v, name, fatal=False, **kwargs):
2886 res = float_or_none(v, **kwargs)
2888 msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2890 raise ExtractorError(msg)
2892 self._downloader.report_warning(msg)
def _set_cookie(self, domain, name, value, expire_time=None, port=None,
                path='/', secure=False, discard=False, rest=None, **kwargs):
    """Store a cookie in the downloader's cookie jar.

    Fix: the previous signature used a mutable default (``rest={}``),
    which is shared across calls; ``None`` sentinel keeps the behavior
    while avoiding accidental cross-call mutation.
    """
    cookie = compat_cookiejar_Cookie(
        # version=0, domain_specified=(port is not None is the
        # port_specified flag), domain_initial_dot from the leading dot.
        0, name, value, port, port is not None, domain, True,
        domain.startswith('.'), path, True, secure, expire_time,
        discard, None, None, rest if rest is not None else {})
    self._downloader.cookiejar.set_cookie(cookie)
def _get_cookies(self, url):
    """ Return a compat_cookies_SimpleCookie with the cookies for the url """
    # Build a throwaway request so the cookie jar can compute the
    # Cookie header that would be sent for this URL.
    request = sanitized_Request(url)
    self._downloader.cookiejar.add_cookie_header(request)
    return compat_cookies_SimpleCookie(request.get_header('Cookie'))
2909 def _apply_first_set_cookie_header(self, url_handle, cookie):
2911 Apply first Set-Cookie header instead of the last. Experimental.
2913 Some sites (e.g. [1-3]) may serve two cookies under the same name
2914 in Set-Cookie header and expect the first (old) one to be set rather
2915 than second (new). However, as of RFC6265 the newer one cookie
2916 should be set into cookie store what actually happens.
2917 We will workaround this issue by resetting the cookie to
2918 the first one manually.
2919 1. https://new.vk.com/
2920 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
2921 3. https://learning.oreilly.com/
2923 for header, cookies in url_handle.headers.items():
2924 if header.lower() != 'set-cookie':
2926 if sys.version_info[0] >= 3:
2927 cookies = cookies.encode('iso-8859-1')
2928 cookies = cookies.decode('utf-8')
2929 cookie_value = re.search(
2930 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
2932 value, domain = cookie_value.groups()
2933 self._set_cookie(domain, cookie, value)
def get_testcases(self, include_onlymatching=False):
    """Yield the extractor's test cases (from _TEST or _TESTS).

    include_onlymatching: also yield cases marked 'only_matching'
    (URL-recognition-only tests). Each yielded dict gets a 'name' key
    derived from the class name with the trailing 'IE' stripped.
    """
    t = getattr(self, '_TEST', None)
    if t:
        # _TEST and _TESTS are mutually exclusive ways to declare tests.
        assert not hasattr(self, '_TESTS'), \
            '%s has _TEST and _TESTS' % type(self).__name__
        tests = [t]
    else:
        tests = getattr(self, '_TESTS', [])
    for t in tests:
        if not include_onlymatching and t.get('only_matching', False):
            continue
        t['name'] = type(self).__name__[:-len('IE')]
        yield t
def is_suitable(self, age_limit):
    """ Test whether the extractor is generally suitable for the given
    age limit (i.e. pornographic sites are not, all others usually are) """
    any_restricted = False
    for tc in self.get_testcases(include_onlymatching=False):
        if tc.get('playlist', []):
            # Playlist tests carry the metadata on their first entry.
            tc = tc['playlist'][0]
        is_restricted = age_restricted(
            tc.get('info_dict', {}).get('age_limit'), age_limit)
        if not is_restricted:
            # A single unrestricted test case is enough to call the
            # extractor suitable.
            return True
        any_restricted = any_restricted or is_restricted
    # No test cases at all counts as unrestricted.
    return not any_restricted
def extract_subtitles(self, *args, **kwargs):
    """Return subtitles if the user requested them, else an empty dict.

    Fix: always return a dict — the truncated version fell through and
    returned None when subtitles were not requested, breaking callers
    that iterate or merge the result.
    """
    if (self._downloader.params.get('writesubtitles', False)
            or self._downloader.params.get('listsubtitles')):
        return self._get_subtitles(*args, **kwargs)
    return {}
2971 def _get_subtitles(self, *args, **kwargs):
2972 raise NotImplementedError('This method must be implemented by subclasses')
2975 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2976 """ Merge subtitle items for one language. Items with duplicated URLs
2977 will be dropped. """
2978 list1_urls = set([item['url'] for item in subtitle_list1])
2979 ret = list(subtitle_list1)
2980 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2984 def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
2985 """ Merge two subtitle dictionaries, language by language. """
2986 ret = dict(subtitle_dict1)
2987 for lang in subtitle_dict2:
2988 ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
def extract_automatic_captions(self, *args, **kwargs):
    """Return automatic captions if requested, else an empty dict.

    Fix: always return a dict — the truncated version fell through and
    returned None when automatic captions were not requested.
    """
    if (self._downloader.params.get('writeautomaticsub', False)
            or self._downloader.params.get('listsubtitles')):
        return self._get_automatic_captions(*args, **kwargs)
    return {}
2997 def _get_automatic_captions(self, *args, **kwargs):
2998 raise NotImplementedError('This method must be implemented by subclasses')
def mark_watched(self, *args, **kwargs):
    """Mark the media watched when the user asked for it and we are
    logged in (via credentials or a cookie file)."""
    params = self._downloader.params
    if not params.get('mark_watched', False):
        return
    # Marking watched only makes sense with an authenticated session.
    if (self._get_login_info()[0] is not None
            or params.get('cookiefile') is not None):
        self._mark_watched(*args, **kwargs)
3006 def _mark_watched(self, *args, **kwargs):
3007 raise NotImplementedError('This method must be implemented by subclasses')
def geo_verification_headers(self):
    """Return extra request headers for geo-verification requests.

    Fix: initialize and return the headers dict — the truncated version
    neither created ``headers`` nor returned it.
    """
    headers = {}
    geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
    if geo_verification_proxy:
        headers['Ytdl-request-proxy'] = geo_verification_proxy
    return headers
def _generic_id(self, url):
    """Derive a fallback video id from the URL's last path segment,
    URL-unquoted and with the extension stripped."""
    last_segment = url.rstrip('/').split('/')[-1]
    return compat_urllib_parse_unquote(os.path.splitext(last_segment)[0])
def _generic_title(self, url):
    """Derive a fallback title from the URL basename, URL-unquoted and
    with the extension stripped."""
    basename = url_basename(url)
    return compat_urllib_parse_unquote(os.path.splitext(basename)[0])
3023 class SearchInfoExtractor(InfoExtractor):
3025 Base class for paged search queries extractors.
3026 They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
3027 Instances should define _SEARCH_KEY and _MAX_RESULTS.
3031 def _make_valid_url(cls):
3032 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
def suitable(cls, url):
    """Whether *url* is a search query this extractor handles."""
    return bool(re.match(cls._make_valid_url(), url))
3038 def _real_extract(self, query):
3039 mobj = re.match(self._make_valid_url(), query)
3041 raise ExtractorError('Invalid search query "%s"' % query)
3043 prefix = mobj.group('prefix')
3044 query = mobj.group('query')
3046 return self._get_n_results(query, 1)
3047 elif prefix == 'all':
3048 return self._get_n_results(query, self._MAX_RESULTS)
3052 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
3053 elif n > self._MAX_RESULTS:
3054 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3055 n = self._MAX_RESULTS
3056 return self._get_n_results(query, n)
3058 def _get_n_results(self, query, n):
3059 """Get a specified number of results for a query"""
3060 raise NotImplementedError('This method must be implemented by subclasses')
def SEARCH_KEY(self):
    """Read-only accessor for the extractor's search key."""
    return self._SEARCH_KEY