youtube_dl/extractor/common.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import base64
   5 import datetime
   6 import hashlib
   7 import json
   8 import netrc
   9 import os
  10 import random
  11 import re
  12 import socket
  13 import ssl
  14 import sys
  15 import time
  16 import math
  17
  18 from ..compat import (
  19     compat_cookiejar_Cookie,
  20     compat_cookies_SimpleCookie,
  21     compat_etree_Element,
  22     compat_etree_fromstring,
  23     compat_getpass,
  24     compat_integer_types,
  25     compat_http_client,
  26     compat_os_name,
  27     compat_str,
  28     compat_urllib_error,
  29     compat_urllib_parse_unquote,
  30     compat_urllib_parse_urlencode,
  31     compat_urllib_request,
  32     compat_urlparse,
  33     compat_xml_parse_error,
  34 )
  35 from ..downloader.f4m import (
  36     get_base_url,
  37     remove_encrypted_media,
  38 )
  39 from ..utils import (
  40     NO_DEFAULT,
  41     age_restricted,
  42     base_url,
  43     bug_reports_message,
  44     clean_html,
  45     compiled_regex_type,
  46     determine_ext,
  47     determine_protocol,
  48     dict_get,
  49     error_to_compat_str,
  50     ExtractorError,
  51     extract_attributes,
  52     fix_xml_ampersands,
  53     float_or_none,
  54     GeoRestrictedError,
  55     GeoUtils,
  56     int_or_none,
  57     js_to_json,
  58     JSON_LD_RE,
  59     mimetype2ext,
  60     orderedSet,
  61     parse_bitrate,
  62     parse_codecs,
  63     parse_duration,
  64     parse_iso8601,
  65     parse_m3u8_attributes,
  66     parse_resolution,
  67     RegexNotFoundError,
  68     sanitized_Request,
  69     sanitize_filename,
  70     str_or_none,
  71     str_to_int,
  72     strip_or_none,
  73     unescapeHTML,
  74     unified_strdate,
  75     unified_timestamp,
  76     update_Request,
  77     update_url_query,
  78     urljoin,
  79     url_basename,
  80     url_or_none,
  81     xpath_element,
  82     xpath_text,
  83     xpath_with_ns,
  84 )
  85
  86
  87 class InfoExtractor(object):
  88     """Information Extractor class.
  89
  90     Information extractors are the classes that, given a URL, extract
  91     information about the video (or videos) the URL refers to. This
  92     information includes the real video URL, the video title, author and
  93     others. The information is stored in a dictionary which is then
  94     passed to the YoutubeDL. The YoutubeDL processes this
  95     information possibly downloading the video to the file system, among
  96     other possible outcomes.
  97
  98     The type field determines the type of the result.
  99     By far the most common value (and the default if _type is missing) is
 100     "video", which indicates a single video.
 101
 102     For a video, the dictionaries must include the following fields:
 103
 104     id:             Video identifier.
 105     title:          Video title, unescaped.
 106
 107     Additionally, it must contain either a formats entry or a url one:
 108
 109     formats:        A list of dictionaries for each format available, ordered
 110                     from worst to best quality.
 111
 112                     Potential fields:
 113                     * url        The mandatory URL representing the media:
 114                                    for plain file media - HTTP URL of this file,
 115                                    for RTMP - RTMP URL,
 116                                    for HLS - URL of the M3U8 media playlist,
 117                                    for HDS - URL of the F4M manifest,
 118                                    for DASH
 119                                      - HTTP URL to plain file media (in case of
 120                                        unfragmented media)
 121                                      - URL of the MPD manifest or base URL
 122                                        representing the media if MPD manifest
 123                                        is parsed from a string (in case of
 124                                        fragmented media)
 125                                    for MSS - URL of the ISM manifest.
 126                     * manifest_url
 127                                  The URL of the manifest file in case of
 128                                  fragmented media:
 129                                    for HLS - URL of the M3U8 master playlist,
 130                                    for HDS - URL of the F4M manifest,
 131                                    for DASH - URL of the MPD manifest,
 132                                    for MSS - URL of the ISM manifest.
 133                     * ext        Will be calculated from URL if missing
 134                     * format     A human-readable description of the format
 135                                  ("mp4 container with h264/opus").
 136                                  Calculated from the format_id, width, height.
 137                                  and format_note fields if missing.
 138                     * format_id  A short description of the format
 139                                  ("mp4_h264_opus" or "19").
 140                                 Technically optional, but strongly recommended.
 141                     * format_note Additional info about the format
 142                                  ("3D" or "DASH video")
 143                     * width      Width of the video, if known
 144                     * height     Height of the video, if known
 145                     * resolution Textual description of width and height
 146                     * tbr        Average bitrate of audio and video in KBit/s
 147                     * abr        Average audio bitrate in KBit/s
 148                     * acodec     Name of the audio codec in use
 149                     * asr        Audio sampling rate in Hertz
 150                     * vbr        Average video bitrate in KBit/s
 151                     * fps        Frame rate
 152                     * vcodec     Name of the video codec in use
 153                     * container  Name of the container format
 154                     * filesize   The number of bytes, if known in advance
 155                     * filesize_approx  An estimate for the number of bytes
 156                     * player_url SWF Player URL (used for rtmpdump).
 157                     * protocol   The protocol that will be used for the actual
 158                                  download, lower-case.
 159                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 160                                  "m3u8", "m3u8_native" or "http_dash_segments".
 161                     * fragment_base_url
 162                                  Base URL for fragments. Each fragment's path
 163                                  value (if present) will be relative to
 164                                  this URL.
 165                     * fragments  A list of fragments of a fragmented media.
 166                                  Each fragment entry must contain either an url
 167                                  or a path. If an url is present it should be
 168                                  considered by a client. Otherwise both path and
 169                                  fragment_base_url must be present. Here is
 170                                  the list of all potential fields:
 171                                  * "url" - fragment's URL
 172                                  * "path" - fragment's path relative to
 173                                             fragment_base_url
 174                                  * "duration" (optional, int or float)
 175                                  * "filesize" (optional, int)
 176                     * preference Order number of this format. If this field is
 177                                  present and not None, the formats get sorted
 178                                  by this field, regardless of all other values.
 179                                  -1 for default (order by other properties),
 180                                  -2 or smaller for less than default.
 181                                  < -1000 to hide the format (if there is
 182                                     another one which is strictly better)
 183                     * language   Language code, e.g. "de" or "en-US".
 184                     * language_preference  Is this in the language mentioned in
 185                                  the URL?
 186                                  10 if it's what the URL is about,
 187                                  -1 for default (don't know),
 188                                  -10 otherwise, other values reserved for now.
 189                     * quality    Order number of the video quality of this
 190                                  format, irrespective of the file format.
 191                                  -1 for default (order by other properties),
 192                                  -2 or smaller for less than default.
 193                     * source_preference  Order number for this video source
 194                                   (quality takes higher priority)
 195                                  -1 for default (order by other properties),
 196                                  -2 or smaller for less than default.
 197                     * http_headers  A dictionary of additional HTTP headers
 198                                  to add to the request.
 199                     * stretched_ratio  If given and not 1, indicates that the
 200                                  video's pixels are not square.
 201                                  width : height ratio as float.
 202                     * no_resume  The server does not support resuming the
 203                                  (HTTP or RTMP) download. Boolean.
 204                     * downloader_options  A dictionary of downloader options as
 205                                  described in FileDownloader
 206
 207     url:            Final video URL.
 208     ext:            Video filename extension.
 209     format:         The video format, defaults to ext (used for --get-format)
 210     player_url:     SWF Player URL (used for rtmpdump).
 211
 212     The following fields are optional:
 213
 214     alt_title:      A secondary title of the video.
 215     display_id      An alternative identifier for the video, not necessarily
 216                     unique, but available before title. Typically, id is
 217                     something like "4234987", title "Dancing naked mole rats",
 218                     and display_id "dancing-naked-mole-rats"
 219     thumbnails:     A list of dictionaries, with the following entries:
 220                         * "id" (optional, string) - Thumbnail format ID
 221                         * "url"
 222                         * "preference" (optional, int) - quality of the image
 223                         * "width" (optional, int)
 224                         * "height" (optional, int)
 225                         * "resolution" (optional, string "{width}x{height}",
 226                                         deprecated)
 227                         * "filesize" (optional, int)
 228     thumbnail:      Full URL to a video thumbnail image.
 229     description:    Full video description.
 230     uploader:       Full name of the video uploader.
 231     license:        License name the video is licensed under.
 232     creator:        The creator of the video.
 233     release_timestamp: UNIX timestamp of the moment the video was released.
 234     release_date:   The date (YYYYMMDD) when the video was released.
 235     timestamp:      UNIX timestamp of the moment the video became available
 236                     (uploaded).
 237     upload_date:    Video upload date (YYYYMMDD).
 238                     If not explicitly set, calculated from timestamp.
 239     uploader_id:    Nickname or id of the video uploader.
 240     uploader_url:   Full URL to a personal webpage of the video uploader.
 241     channel:        Full name of the channel the video is uploaded on.
 242                     Note that channel fields may or may not repeat uploader
 243                     fields. This depends on a particular extractor.
 244     channel_id:     Id of the channel.
 245     channel_url:    Full URL to a channel webpage.
 246     location:       Physical location where the video was filmed.
 247     subtitles:      The available subtitles as a dictionary in the format
 248                     {tag: subformats}. "tag" is usually a language code, and
 249                     "subformats" is a list sorted from lower to higher
 250                     preference, each element is a dictionary with the "ext"
 251                     entry and one of:
 252                         * "data": The subtitles file contents
 253                         * "url": A URL pointing to the subtitles file
 254                     "ext" will be calculated from URL if missing
 255     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 256                     automatically generated captions
 257     duration:       Length of the video in seconds, as an integer or float.
 258     view_count:     How many users have watched the video on the platform.
 259     like_count:     Number of positive ratings of the video
 260     dislike_count:  Number of negative ratings of the video
 261     repost_count:   Number of reposts of the video
 262     average_rating: Average rating give by users, the scale used depends on the webpage
 263     comment_count:  Number of comments on the video
 264     comments:       A list of comments, each with one or more of the following
 265                     properties (all but one of text or html optional):
 266                         * "author" - human-readable name of the comment author
 267                         * "author_id" - user ID of the comment author
 268                         * "id" - Comment ID
 269                         * "html" - Comment as HTML
 270                         * "text" - Plain text of the comment
 271                         * "timestamp" - UNIX timestamp of comment
 272                         * "parent" - ID of the comment this one is replying to.
 273                                      Set to "root" to indicate that this is a
 274                                      comment to the original video.
 275     age_limit:      Age restriction for the video, as an integer (years)
 276     webpage_url:    The URL to the video webpage, if given to youtube-dl it
 277                     should allow to get the same result again. (It will be set
 278                     by YoutubeDL if it's missing)
 279     categories:     A list of categories that the video falls in, for example
 280                     ["Sports", "Berlin"]
 281     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 282     is_live:        True, False, or None (=unknown). Whether this video is a
 283                     live stream that goes on instead of a fixed-length video.
 284     start_time:     Time in seconds where the reproduction should start, as
 285                     specified in the URL.
 286     end_time:       Time in seconds where the reproduction should end, as
 287                     specified in the URL.
 288     chapters:       A list of dictionaries, with the following entries:
 289                         * "start_time" - The start time of the chapter in seconds
 290                         * "end_time" - The end time of the chapter in seconds
 291                         * "title" (optional, string)
 292
 293     The following fields should only be used when the video belongs to some logical
 294     chapter or section:
 295
 296     chapter:        Name or title of the chapter the video belongs to.
 297     chapter_number: Number of the chapter the video belongs to, as an integer.
 298     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 299
 300     The following fields should only be used when the video is an episode of some
 301     series, programme or podcast:
 302
 303     series:         Title of the series or programme the video episode belongs to.
 304     season:         Title of the season the video episode belongs to.
 305     season_number:  Number of the season the video episode belongs to, as an integer.
 306     season_id:      Id of the season the video episode belongs to, as a unicode string.
 307     episode:        Title of the video episode. Unlike mandatory video title field,
 308                     this field should denote the exact title of the video episode
 309                     without any kind of decoration.
 310     episode_number: Number of the video episode within a season, as an integer.
 311     episode_id:     Id of the video episode, as a unicode string.
 312
 313     The following fields should only be used when the media is a track or a part of
 314     a music album:
 315
 316     track:          Title of the track.
 317     track_number:   Number of the track within an album or a disc, as an integer.
 318     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 319                     as a unicode string.
 320     artist:         Artist(s) of the track.
 321     genre:          Genre(s) of the track.
 322     album:          Title of the album the track belongs to.
 323     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 324     album_artist:   List of all artists appeared on the album (e.g.
 325                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 326                     and compilations).
 327     disc_number:    Number of the disc or other physical medium the track belongs to,
 328                     as an integer.
 329     release_year:   Year (YYYY) when the album was released.
 330
 331     Unless mentioned otherwise, the fields should be Unicode strings.
 332
 333     Unless mentioned otherwise, None is equivalent to absence of information.
 334
 335
 336     _type "playlist" indicates multiple videos.
 337     There must be a key "entries", which is a list, an iterable, or a PagedList
 338     object, each element of which is a valid dictionary by this specification.
 339
 340     Additionally, playlists can have "id", "title", "description", "uploader",
 341     "uploader_id", "uploader_url", "duration" attributes with the same semantics
 342     as videos (see above).
 343
 344
 345     _type "multi_video" indicates that there are multiple videos that
 346     form a single show, for examples multiple acts of an opera or TV episode.
 347     It must have an entries key like a playlist and contain all the keys
 348     required for a video at the same time.
 349
 350
 351     _type "url" indicates that the video must be extracted from another
 352     location, possibly by a different extractor. Its only required key is:
 353     "url" - the next URL to extract.
 354     The key "ie_key" can be set to the class name (minus the trailing "IE",
 355     e.g. "Youtube") if the extractor class is known in advance.
 356     Additionally, the dictionary may have any properties of the resolved entity
 357     known in advance, for example "title" if the title of the referred video is
 358     known ahead of time.
 359
 360
 361     _type "url_transparent" entities have the same specification as "url", but
 362     indicate that the given additional information is more precise than the one
 363     associated with the resolved URL.
 364     This is useful when a site employs a video service that hosts the video and
 365     its technical metadata, but that video service does not embed a useful
 366     title, description etc.
 367
 368
 369     Subclasses of this one should re-define the _real_initialize() and
 370     _real_extract() methods and define a _VALID_URL regexp.
 371     Probably, they should also be added to the list of extractors.
 372
 373     _GEO_BYPASS attribute may be set to False in order to disable
 374     geo restriction bypass mechanisms for a particular extractor.
 375     Though it won't disable explicit geo restriction bypass based on
 376     country code provided with geo_bypass_country.
 377
 378     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 379     countries for this extractor. One of these countries will be used by
 380     geo restriction bypass mechanism right away in order to bypass
 381     geo restriction, of course, if the mechanism is not disabled.
 382
 383     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 384     IP blocks in CIDR notation for this extractor. One of these IP blocks
 385     will be used by geo restriction bypass mechanism similarly
 386     to _GEO_COUNTRIES.
 387
 388     Finally, the _WORKING attribute should be set to False for broken IEs
 389     in order to warn the users and skip the tests.
 390     """
 391
 392     _ready = False
 393     _downloader = None
 394     _x_forwarded_for_ip = None
 395     _GEO_BYPASS = True
 396     _GEO_COUNTRIES = None
 397     _GEO_IP_BLOCKS = None
 398     _WORKING = True
 399
 400     def __init__(self, downloader=None):
 401         """Constructor. Receives an optional downloader."""
 402         self._ready = False
 403         self._x_forwarded_for_ip = None
 404         self.set_downloader(downloader)
 405
 406     @classmethod
 407     def suitable(cls, url):
 408         """Receives a URL and returns True if suitable for this IE."""
 409
 410         # This does not use has/getattr intentionally - we want to know whether
 411         # we have cached the regexp for *this* class, whereas getattr would also
 412         # match the superclass
 413         if '_VALID_URL_RE' not in cls.__dict__:
 414             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 415         return cls._VALID_URL_RE.match(url) is not None
 416
 417     @classmethod
 418     def _match_id(cls, url):
 419         if '_VALID_URL_RE' not in cls.__dict__:
 420             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 421         m = cls._VALID_URL_RE.match(url)
 422         assert m
 423         return compat_str(m.group('id'))
 424
 425     @classmethod
 426     def working(cls):
 427         """Getter method for _WORKING."""
 428         return cls._WORKING
 429
 430     def initialize(self):
 431         """Initializes an instance (authentication, etc)."""
 432         self._initialize_geo_bypass({
 433             'countries': self._GEO_COUNTRIES,
 434             'ip_blocks': self._GEO_IP_BLOCKS,
 435         })
 436         if not self._ready:
 437             self._real_initialize()
 438             self._ready = True
 439
 440     def _initialize_geo_bypass(self, geo_bypass_context):
 441         """
 442         Initialize geo restriction bypass mechanism.
 443
 444         This method is used to initialize geo bypass mechanism based on faking
 445         X-Forwarded-For HTTP header. A random country from provided country list
 446         is selected and a random IP belonging to this country is generated. This
 447         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 448         HTTP requests.
 449
 450         This method will be used for initial geo bypass mechanism initialization
 451         during the instance initialization with _GEO_COUNTRIES and
 452         _GEO_IP_BLOCKS.
 453
 454         You may also manually call it from extractor's code if geo bypass
 455         information is not available beforehand (e.g. obtained during
 456         extraction) or due to some other reason. In this case you should pass
 457         this information in geo bypass context passed as first argument. It may
 458         contain following fields:
 459
 460         countries:  List of geo unrestricted countries (similar
 461                     to _GEO_COUNTRIES)
 462         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 463                     (similar to _GEO_IP_BLOCKS)
 464
 465         """
 466         if not self._x_forwarded_for_ip:
 467
 468             # Geo bypass mechanism is explicitly disabled by user
 469             if not self._downloader.params.get('geo_bypass', True):
 470                 return
 471
 472             if not geo_bypass_context:
 473                 geo_bypass_context = {}
 474
 475             # Backward compatibility: previously _initialize_geo_bypass
 476             # expected a list of countries, some 3rd party code may still use
 477             # it this way
 478             if isinstance(geo_bypass_context, (list, tuple)):
 479                 geo_bypass_context = {
 480                     'countries': geo_bypass_context,
 481                 }
 482
 483             # The whole point of geo bypass mechanism is to fake IP
 484             # as X-Forwarded-For HTTP header based on some IP block or
 485             # country code.
 486
 487             # Path 1: bypassing based on IP block in CIDR notation
 488
 489             # Explicit IP block specified by user, use it right away
 490             # regardless of whether extractor is geo bypassable or not
 491             ip_block = self._downloader.params.get('geo_bypass_ip_block', None)
 492
 493             # Otherwise use random IP block from geo bypass context but only
 494             # if extractor is known as geo bypassable
 495             if not ip_block:
 496                 ip_blocks = geo_bypass_context.get('ip_blocks')
 497                 if self._GEO_BYPASS and ip_blocks:
 498                     ip_block = random.choice(ip_blocks)
 499
 500             if ip_block:
 501                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 502                 if self._downloader.params.get('verbose', False):
 503                     self._downloader.to_screen(
 504                         '[debug] Using fake IP %s as X-Forwarded-For.'
 505                         % self._x_forwarded_for_ip)
 506                 return
 507
 508             # Path 2: bypassing based on country code
 509
 510             # Explicit country code specified by user, use it right away
 511             # regardless of whether extractor is geo bypassable or not
 512             country = self._downloader.params.get('geo_bypass_country', None)
 513
 514             # Otherwise use random country code from geo bypass context but
 515             # only if extractor is known as geo bypassable
 516             if not country:
 517                 countries = geo_bypass_context.get('countries')
 518                 if self._GEO_BYPASS and countries:
 519                     country = random.choice(countries)
 520
 521             if country:
 522                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 523                 if self._downloader.params.get('verbose', False):
 524                     self._downloader.to_screen(
 525                         '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
 526                         % (self._x_forwarded_for_ip, country.upper()))
 527
 528     def extract(self, url):
 529         """Extracts URL information and returns it in list of dicts."""
 530         try:
 531             for _ in range(2):
 532                 try:
 533                     self.initialize()
 534                     ie_result = self._real_extract(url)
 535                     if self._x_forwarded_for_ip:
 536                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 537                     return ie_result
 538                 except GeoRestrictedError as e:
 539                     if self.__maybe_fake_ip_and_retry(e.countries):
 540                         continue
 541                     raise
 542         except ExtractorError:
 543             raise
 544         except compat_http_client.IncompleteRead as e:
 545             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
 546         except (KeyError, StopIteration) as e:
 547             raise ExtractorError('An extractor error has occurred.', cause=e)
 548
 549     def __maybe_fake_ip_and_retry(self, countries):
 550         if (not self._downloader.params.get('geo_bypass_country', None)
 551                 and self._GEO_BYPASS
 552                 and self._downloader.params.get('geo_bypass', True)
 553                 and not self._x_forwarded_for_ip
 554                 and countries):
 555             country_code = random.choice(countries)
 556             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 557             if self._x_forwarded_for_ip:
 558                 self.report_warning(
 559                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 560                     % (self._x_forwarded_for_ip, country_code.upper()))
 561                 return True
 562         return False
 563
 564     def set_downloader(self, downloader):
 565         """Sets the downloader for this IE."""
 566         self._downloader = downloader
 567
 568     def _real_initialize(self):
 569         """Real initialization process. Redefine in subclasses."""
 570         pass
 571
 572     def _real_extract(self, url):
 573         """Real extraction process. Redefine in subclasses."""
 574         pass
 575
 576     @classmethod
 577     def ie_key(cls):
 578         """A string for getting the InfoExtractor with get_info_extractor"""
 579         return compat_str(cls.__name__[:-2])
 580
 581     @property
 582     def IE_NAME(self):
 583         return compat_str(type(self).__name__[:-2])
 584
 585     @staticmethod
 586     def __can_accept_status_code(err, expected_status):
 587         assert isinstance(err, compat_urllib_error.HTTPError)
 588         if expected_status is None:
 589             return False
 590         if isinstance(expected_status, compat_integer_types):
 591             return err.code == expected_status
 592         elif isinstance(expected_status, (list, tuple)):
 593             return err.code in expected_status
 594         elif callable(expected_status):
 595             return expected_status(err.code) is True
 596         else:
 597             assert False
 598
 599     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
 600         """
 601         Return the response handle.
 602
 603         See _download_webpage docstring for arguments specification.
 604         """
 605         if note is None:
 606             self.report_download_webpage(video_id)
 607         elif note is not False:
 608             if video_id is None:
 609                 self.to_screen('%s' % (note,))
 610             else:
 611                 self.to_screen('%s: %s' % (video_id, note))
 612
 613         # Some sites check X-Forwarded-For HTTP header in order to figure out
 614         # the origin of the client behind proxy. This allows bypassing geo
 615         # restriction by faking this header's value to IP that belongs to some
 616         # geo unrestricted country. We will do so once we encounter any
 617         # geo restriction error.
 618         if self._x_forwarded_for_ip:
 619             if 'X-Forwarded-For' not in headers:
 620                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 621
 622         if isinstance(url_or_request, compat_urllib_request.Request):
 623             url_or_request = update_Request(
 624                 url_or_request, data=data, headers=headers, query=query)
 625         else:
 626             if query:
 627                 url_or_request = update_url_query(url_or_request, query)
 628             if data is not None or headers:
 629                 url_or_request = sanitized_Request(url_or_request, data, headers)
 630         exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
 631         if hasattr(ssl, 'CertificateError'):
 632             exceptions.append(ssl.CertificateError)
 633         try:
 634             return self._downloader.urlopen(url_or_request)
 635         except tuple(exceptions) as err:
 636             if isinstance(err, compat_urllib_error.HTTPError):
 637                 if self.__can_accept_status_code(err, expected_status):
 638                     # Retain reference to error to prevent file object from
 639                     # being closed before it can be read. Works around the
 640                     # effects of <https://bugs.python.org/issue15002>
 641                     # introduced in Python 3.4.1.
 642                     err.fp._error = err
 643                     return err.fp
 644
 645             if errnote is False:
 646                 return False
 647             if errnote is None:
 648                 errnote = 'Unable to download webpage'
 649
 650             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 651             if fatal:
 652                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 653             else:
 654                 self._downloader.report_warning(errmsg)
 655                 return False
 656
 657     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
 658         """
 659         Return a tuple (page content as string, URL handle).
 660
 661         See _download_webpage docstring for arguments specification.
 662         """
 663         # Strip hashes from the URL (#1038)
 664         if isinstance(url_or_request, (compat_str, str)):
 665             url_or_request = url_or_request.partition('#')[0]
 666
 667         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
 668         if urlh is False:
 669             assert not fatal
 670             return False
 671         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 672         return (content, urlh)
 673
 674     @staticmethod
 675     def _guess_encoding_from_content(content_type, webpage_bytes):
 676         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 677         if m:
 678             encoding = m.group(1)
 679         else:
 680             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 681                           webpage_bytes[:1024])
 682             if m:
 683                 encoding = m.group(1).decode('ascii')
 684             elif webpage_bytes.startswith(b'\xff\xfe'):
 685                 encoding = 'utf-16'
 686             else:
 687                 encoding = 'utf-8'
 688
 689         return encoding
 690
 691     def __check_blocked(self, content):
 692         first_block = content[:512]
 693         if ('<title>Access to this site is blocked</title>' in content
 694                 and 'Websense' in first_block):
 695             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 696             blocked_iframe = self._html_search_regex(
 697                 r'<iframe src="([^"]+)"', content,
 698                 'Websense information URL', default=None)
 699             if blocked_iframe:
 700                 msg += ' Visit %s for more details' % blocked_iframe
 701             raise ExtractorError(msg, expected=True)
 702         if '<title>The URL you requested has been blocked</title>' in first_block:
 703             msg = (
 704                 'Access to this webpage has been blocked by Indian censorship. '
 705                 'Use a VPN or proxy server (with --proxy) to route around it.')
 706             block_msg = self._html_search_regex(
 707                 r'</h1><p>(.*?)</p>',
 708                 content, 'block message', default=None)
 709             if block_msg:
 710                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 711             raise ExtractorError(msg, expected=True)
 712         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
 713                 and 'blocklist.rkn.gov.ru' in content):
 714             raise ExtractorError(
 715                 'Access to this webpage has been blocked by decision of the Russian government. '
 716                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 717                 expected=True)
 718
 719     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 720         content_type = urlh.headers.get('Content-Type', '')
 721         webpage_bytes = urlh.read()
 722         if prefix is not None:
 723             webpage_bytes = prefix + webpage_bytes
 724         if not encoding:
 725             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 726         if self._downloader.params.get('dump_intermediate_pages', False):
 727             self.to_screen('Dumping request to ' + urlh.geturl())
 728             dump = base64.b64encode(webpage_bytes).decode('ascii')
 729             self._downloader.to_screen(dump)
 730         if self._downloader.params.get('write_pages', False):
 731             basen = '%s_%s' % (video_id, urlh.geturl())
 732             if len(basen) > 240:
 733                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 734                 basen = basen[:240 - len(h)] + h
 735             raw_filename = basen + '.dump'
 736             filename = sanitize_filename(raw_filename, restricted=True)
 737             self.to_screen('Saving request to ' + filename)
 738             # Working around MAX_PATH limitation on Windows (see
 739             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 740             if compat_os_name == 'nt':
 741                 absfilepath = os.path.abspath(filename)
 742                 if len(absfilepath) > 259:
 743                     filename = '\\\\?\\' + absfilepath
 744             with open(filename, 'wb') as outf:
 745                 outf.write(webpage_bytes)
 746
 747         try:
 748             content = webpage_bytes.decode(encoding, 'replace')
 749         except LookupError:
 750             content = webpage_bytes.decode('utf-8', 'replace')
 751
 752         self.__check_blocked(content)
 753
 754         return content
 755
 756     def _download_webpage(
 757             self, url_or_request, video_id, note=None, errnote=None,
 758             fatal=True, tries=1, timeout=5, encoding=None, data=None,
 759             headers={}, query={}, expected_status=None):
 760         """
 761         Return the data of the page as a string.
 762
 763         Arguments:
 764         url_or_request -- plain text URL as a string or
 765             a compat_urllib_request.Requestobject
 766         video_id -- Video/playlist/item identifier (string)
 767
 768         Keyword arguments:
 769         note -- note printed before downloading (string)
 770         errnote -- note printed in case of an error (string)
 771         fatal -- flag denoting whether error should be considered fatal,
 772             i.e. whether it should cause ExtractionError to be raised,
 773             otherwise a warning will be reported and extraction continued
 774         tries -- number of tries
 775         timeout -- sleep interval between tries
 776         encoding -- encoding for a page content decoding, guessed automatically
 777             when not explicitly specified
 778         data -- POST data (bytes)
 779         headers -- HTTP headers (dict)
 780         query -- URL query (dict)
 781         expected_status -- allows to accept failed HTTP requests (non 2xx
 782             status code) by explicitly specifying a set of accepted status
 783             codes. Can be any of the following entities:
 784                 - an integer type specifying an exact failed status code to
 785                   accept
 786                 - a list or a tuple of integer types specifying a list of
 787                   failed status codes to accept
 788                 - a callable accepting an actual failed status code and
 789                   returning True if it should be accepted
 790             Note that this argument does not affect success status codes (2xx)
 791             which are always accepted.
 792         """
 793
 794         success = False
 795         try_count = 0
 796         while success is False:
 797             try:
 798                 res = self._download_webpage_handle(
 799                     url_or_request, video_id, note, errnote, fatal,
 800                     encoding=encoding, data=data, headers=headers, query=query,
 801                     expected_status=expected_status)
 802                 success = True
 803             except compat_http_client.IncompleteRead as e:
 804                 try_count += 1
 805                 if try_count >= tries:
 806                     raise e
 807                 self._sleep(timeout, video_id)
 808         if res is False:
 809             return res
 810         else:
 811             content, _ = res
 812             return content
 813
 814     def _download_xml_handle(
 815             self, url_or_request, video_id, note='Downloading XML',
 816             errnote='Unable to download XML', transform_source=None,
 817             fatal=True, encoding=None, data=None, headers={}, query={},
 818             expected_status=None):
 819         """
 820         Return a tuple (xml as an compat_etree_Element, URL handle).
 821
 822         See _download_webpage docstring for arguments specification.
 823         """
 824         res = self._download_webpage_handle(
 825             url_or_request, video_id, note, errnote, fatal=fatal,
 826             encoding=encoding, data=data, headers=headers, query=query,
 827             expected_status=expected_status)
 828         if res is False:
 829             return res
 830         xml_string, urlh = res
 831         return self._parse_xml(
 832             xml_string, video_id, transform_source=transform_source,
 833             fatal=fatal), urlh
 834
 835     def _download_xml(
 836             self, url_or_request, video_id,
 837             note='Downloading XML', errnote='Unable to download XML',
 838             transform_source=None, fatal=True, encoding=None,
 839             data=None, headers={}, query={}, expected_status=None):
 840         """
 841         Return the xml as an compat_etree_Element.
 842
 843         See _download_webpage docstring for arguments specification.
 844         """
 845         res = self._download_xml_handle(
 846             url_or_request, video_id, note=note, errnote=errnote,
 847             transform_source=transform_source, fatal=fatal, encoding=encoding,
 848             data=data, headers=headers, query=query,
 849             expected_status=expected_status)
 850         return res if res is False else res[0]
 851
 852     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
 853         if transform_source:
 854             xml_string = transform_source(xml_string)
 855         try:
 856             return compat_etree_fromstring(xml_string.encode('utf-8'))
 857         except compat_xml_parse_error as ve:
 858             errmsg = '%s: Failed to parse XML ' % video_id
 859             if fatal:
 860                 raise ExtractorError(errmsg, cause=ve)
 861             else:
 862                 self.report_warning(errmsg + str(ve))
 863
 864     def _download_json_handle(
 865             self, url_or_request, video_id, note='Downloading JSON metadata',
 866             errnote='Unable to download JSON metadata', transform_source=None,
 867             fatal=True, encoding=None, data=None, headers={}, query={},
 868             expected_status=None):
 869         """
 870         Return a tuple (JSON object, URL handle).
 871
 872         See _download_webpage docstring for arguments specification.
 873         """
 874         res = self._download_webpage_handle(
 875             url_or_request, video_id, note, errnote, fatal=fatal,
 876             encoding=encoding, data=data, headers=headers, query=query,
 877             expected_status=expected_status)
 878         if res is False:
 879             return res
 880         json_string, urlh = res
 881         return self._parse_json(
 882             json_string, video_id, transform_source=transform_source,
 883             fatal=fatal), urlh
 884
 885     def _download_json(
 886             self, url_or_request, video_id, note='Downloading JSON metadata',
 887             errnote='Unable to download JSON metadata', transform_source=None,
 888             fatal=True, encoding=None, data=None, headers={}, query={},
 889             expected_status=None):
 890         """
 891         Return the JSON object as a dict.
 892
 893         See _download_webpage docstring for arguments specification.
 894         """
 895         res = self._download_json_handle(
 896             url_or_request, video_id, note=note, errnote=errnote,
 897             transform_source=transform_source, fatal=fatal, encoding=encoding,
 898             data=data, headers=headers, query=query,
 899             expected_status=expected_status)
 900         return res if res is False else res[0]
 901
 902     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 903         if transform_source:
 904             json_string = transform_source(json_string)
 905         try:
 906             return json.loads(json_string)
 907         except ValueError as ve:
 908             errmsg = '%s: Failed to parse JSON ' % video_id
 909             if fatal:
 910                 raise ExtractorError(errmsg, cause=ve)
 911             else:
 912                 self.report_warning(errmsg + str(ve))
 913
 914     def report_warning(self, msg, video_id=None):
 915         idstr = '' if video_id is None else '%s: ' % video_id
 916         self._downloader.report_warning(
 917             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 918
 919     def to_screen(self, msg):
 920         """Print msg to screen, prefixing it with '[ie_name]'"""
 921         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 922
 923     def report_extraction(self, id_or_name):
 924         """Report information extraction."""
 925         self.to_screen('%s: Extracting information' % id_or_name)
 926
 927     def report_download_webpage(self, video_id):
 928         """Report webpage download."""
 929         self.to_screen('%s: Downloading webpage' % video_id)
 930
 931     def report_age_confirmation(self):
 932         """Report attempt to confirm age."""
 933         self.to_screen('Confirming age')
 934
 935     def report_login(self):
 936         """Report attempt to log in."""
 937         self.to_screen('Logging in')
 938
 939     @staticmethod
 940     def raise_login_required(msg='This video is only available for registered users'):
 941         raise ExtractorError(
 942             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
 943             expected=True)
 944
 945     @staticmethod
 946     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
 947         raise GeoRestrictedError(msg, countries=countries)
 948
 949     # Methods for following #608
 950     @staticmethod
 951     def url_result(url, ie=None, video_id=None, video_title=None):
 952         """Returns a URL that points to a page that should be processed"""
 953         # TODO: ie should be the class used for getting the info
 954         video_info = {'_type': 'url',
 955                       'url': url,
 956                       'ie_key': ie}
 957         if video_id is not None:
 958             video_info['id'] = video_id
 959         if video_title is not None:
 960             video_info['title'] = video_title
 961         return video_info
 962
 963     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
 964         urls = orderedSet(
 965             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
 966             for m in matches)
 967         return self.playlist_result(
 968             urls, playlist_id=playlist_id, playlist_title=playlist_title)
 969
 970     @staticmethod
 971     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 972         """Returns a playlist"""
 973         video_info = {'_type': 'playlist',
 974                       'entries': entries}
 975         if playlist_id:
 976             video_info['id'] = playlist_id
 977         if playlist_title:
 978             video_info['title'] = playlist_title
 979         if playlist_description:
 980             video_info['description'] = playlist_description
 981         return video_info
 982
 983     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 984         """
 985         Perform a regex search on the given string, using a single or a list of
 986         patterns returning the first matching group.
 987         In case of failure return a default value or raise a WARNING or a
 988         RegexNotFoundError, depending on fatal, specifying the field name.
 989         """
 990         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 991             mobj = re.search(pattern, string, flags)
 992         else:
 993             for p in pattern:
 994                 mobj = re.search(p, string, flags)
 995                 if mobj:
 996                     break
 997
 998         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
 999             _name = '\033[0;34m%s\033[0m' % name
1000         else:
1001             _name = name
1002
1003         if mobj:
1004             if group is None:
1005                 # return the first matching group
1006                 return next(g for g in mobj.groups() if g is not None)
1007             else:
1008                 return mobj.group(group)
1009         elif default is not NO_DEFAULT:
1010             return default
1011         elif fatal:
1012             raise RegexNotFoundError('Unable to extract %s' % _name)
1013         else:
1014             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
1015             return None
1016
1017     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1018         """
1019         Like _search_regex, but strips HTML tags and unescapes entities.
1020         """
1021         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1022         if res:
1023             return clean_html(res).strip()
1024         else:
1025             return res
1026
1027     def _get_netrc_login_info(self, netrc_machine=None):
1028         username = None
1029         password = None
1030         netrc_machine = netrc_machine or self._NETRC_MACHINE
1031
1032         if self._downloader.params.get('usenetrc', False):
1033             try:
1034                 info = netrc.netrc().authenticators(netrc_machine)
1035                 if info is not None:
1036                     username = info[0]
1037                     password = info[2]
1038                 else:
1039                     raise netrc.NetrcParseError(
1040                         'No authenticators for %s' % netrc_machine)
1041             except (IOError, netrc.NetrcParseError) as err:
1042                 self._downloader.report_warning(
1043                     'parsing .netrc: %s' % error_to_compat_str(err))
1044
1045         return username, password
1046
1047     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
1048         """
1049         Get the login info as (username, password)
1050         First look for the manually specified credentials using username_option
1051         and password_option as keys in params dictionary. If no such credentials
1052         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1053         value.
1054         If there's no info available, return (None, None)
1055         """
1056         if self._downloader is None:
1057             return (None, None)
1058
1059         downloader_params = self._downloader.params
1060
1061         # Attempt to use provided username and password or .netrc data
1062         if downloader_params.get(username_option) is not None:
1063             username = downloader_params[username_option]
1064             password = downloader_params[password_option]
1065         else:
1066             username, password = self._get_netrc_login_info(netrc_machine)
1067
1068         return username, password
1069
1070     def _get_tfa_info(self, note='two-factor verification code'):
1071         """
1072         Get the two-factor authentication info
1073         TODO - asking the user will be required for sms/phone verify
1074         currently just uses the command line option
1075         If there's no info available, return None
1076         """
1077         if self._downloader is None:
1078             return None
1079         downloader_params = self._downloader.params
1080
1081         if downloader_params.get('twofactor') is not None:
1082             return downloader_params['twofactor']
1083
1084         return compat_getpass('Type %s and press [Return]: ' % note)
1085
1086     # Helper functions for extracting OpenGraph info
1087     @staticmethod
1088     def _og_regexes(prop):
1089         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
1090         property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)'
1091                        % {'prop': re.escape(prop)})
1092         template = r'<meta[^>]+?%s[^>]+?%s'
1093         return [
1094             template % (property_re, content_re),
1095             template % (content_re, property_re),
1096         ]
1097
1098     @staticmethod
1099     def _meta_regex(prop):
1100         return r'''(?isx)<meta
1101                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1102                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1103
1104     def _og_search_property(self, prop, html, name=None, **kargs):
1105         if not isinstance(prop, (list, tuple)):
1106             prop = [prop]
1107         if name is None:
1108             name = 'OpenGraph %s' % prop[0]
1109         og_regexes = []
1110         for p in prop:
1111             og_regexes.extend(self._og_regexes(p))
1112         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1113         if escaped is None:
1114             return None
1115         return unescapeHTML(escaped)
1116
1117     def _og_search_thumbnail(self, html, **kargs):
1118         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
1119
1120     def _og_search_description(self, html, **kargs):
1121         return self._og_search_property('description', html, fatal=False, **kargs)
1122
1123     def _og_search_title(self, html, **kargs):
1124         return self._og_search_property('title', html, **kargs)
1125
1126     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
1127         regexes = self._og_regexes('video') + self._og_regexes('video:url')
1128         if secure:
1129             regexes = self._og_regexes('video:secure_url') + regexes
1130         return self._html_search_regex(regexes, html, name, **kargs)
1131
1132     def _og_search_url(self, html, **kargs):
1133         return self._og_search_property('url', html, **kargs)
1134
1135     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1136         if not isinstance(name, (list, tuple)):
1137             name = [name]
1138         if display_name is None:
1139             display_name = name[0]
1140         return self._html_search_regex(
1141             [self._meta_regex(n) for n in name],
1142             html, display_name, fatal=fatal, group='content', **kwargs)
1143
1144     def _dc_search_uploader(self, html):
1145         return self._html_search_meta('dc.creator', html, 'uploader')
1146
1147     def _rta_search(self, html):
1148         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1149         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1150                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1151                      html):
1152             return 18
1153         return 0
1154
1155     def _media_rating_search(self, html):
1156         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1157         rating = self._html_search_meta('rating', html)
1158
1159         if not rating:
1160             return None
1161
1162         RATING_TABLE = {
1163             'safe for kids': 0,
1164             'general': 8,
1165             '14 years': 14,
1166             'mature': 17,
1167             'restricted': 19,
1168         }
1169         return RATING_TABLE.get(rating.lower())
1170
1171     def _family_friendly_search(self, html):
1172         # See http://schema.org/VideoObject
1173         family_friendly = self._html_search_meta(
1174             'isFamilyFriendly', html, default=None)
1175
1176         if not family_friendly:
1177             return None
1178
1179         RATING_TABLE = {
1180             '1': 0,
1181             'true': 0,
1182             '0': 18,
1183             'false': 18,
1184         }
1185         return RATING_TABLE.get(family_friendly.lower())
1186
1187     def _twitter_search_player(self, html):
1188         return self._html_search_meta('twitter:player', html,
1189                                       'twitter card player')
1190
1191     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1192         json_ld_list = list(re.finditer(JSON_LD_RE, html))
1193         default = kwargs.get('default', NO_DEFAULT)
1194         # JSON-LD may be malformed and thus `fatal` should be respected.
1195         # At the same time `default` may be passed that assumes `fatal=False`
1196         # for _search_regex. Let's simulate the same behavior here as well.
1197         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
1198         json_ld = []
1199         for mobj in json_ld_list:
1200             json_ld_item = self._parse_json(
1201                 mobj.group('json_ld'), video_id, fatal=fatal)
1202             if not json_ld_item:
1203                 continue
1204             if isinstance(json_ld_item, dict):
1205                 json_ld.append(json_ld_item)
1206             elif isinstance(json_ld_item, (list, tuple)):
1207                 json_ld.extend(json_ld_item)
1208         if json_ld:
1209             json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1210         if json_ld:
1211             return json_ld
1212         if default is not NO_DEFAULT:
1213             return default
1214         elif fatal:
1215             raise RegexNotFoundError('Unable to extract JSON-LD')
1216         else:
1217             self._downloader.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1218             return {}
1219
1220     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1221         if isinstance(json_ld, compat_str):
1222             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1223         if not json_ld:
1224             return {}
1225         info = {}
1226         if not isinstance(json_ld, (list, tuple, dict)):
1227             return info
1228         if isinstance(json_ld, dict):
1229             json_ld = [json_ld]
1230
1231         INTERACTION_TYPE_MAP = {
1232             'CommentAction': 'comment',
1233             'AgreeAction': 'like',
1234             'DisagreeAction': 'dislike',
1235             'LikeAction': 'like',
1236             'DislikeAction': 'dislike',
1237             'ListenAction': 'view',
1238             'WatchAction': 'view',
1239             'ViewAction': 'view',
1240         }
1241
1242         def extract_interaction_type(e):
1243             interaction_type = e.get('interactionType')
1244             if isinstance(interaction_type, dict):
1245                 interaction_type = interaction_type.get('@type')
1246             return str_or_none(interaction_type)
1247
1248         def extract_interaction_statistic(e):
1249             interaction_statistic = e.get('interactionStatistic')
1250             if isinstance(interaction_statistic, dict):
1251                 interaction_statistic = [interaction_statistic]
1252             if not isinstance(interaction_statistic, list):
1253                 return
1254             for is_e in interaction_statistic:
1255                 if not isinstance(is_e, dict):
1256                     continue
1257                 if is_e.get('@type') != 'InteractionCounter':
1258                     continue
1259                 interaction_type = extract_interaction_type(is_e)
1260                 if not interaction_type:
1261                     continue
1262                 # For interaction count some sites provide string instead of
1263                 # an integer (as per spec) with non digit characters (e.g. ",")
1264                 # so extracting count with more relaxed str_to_int
1265                 interaction_count = str_to_int(is_e.get('userInteractionCount'))
1266                 if interaction_count is None:
1267                     continue
1268                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1269                 if not count_kind:
1270                     continue
1271                 count_key = '%s_count' % count_kind
1272                 if info.get(count_key) is not None:
1273                     continue
1274                 info[count_key] = interaction_count
1275
1276         def extract_video_object(e):
1277             assert e['@type'] == 'VideoObject'
1278             author = e.get('author')
1279             info.update({
1280                 'url': url_or_none(e.get('contentUrl')),
1281                 'title': unescapeHTML(e.get('name')),
1282                 'description': unescapeHTML(e.get('description')),
1283                 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')),
1284                 'duration': parse_duration(e.get('duration')),
1285                 'timestamp': unified_timestamp(e.get('uploadDate')),
1286                 # author can be an instance of 'Organization' or 'Person' types.
1287                 # both types can have 'name' property(inherited from 'Thing' type). [1]
1288                 # however some websites are using 'Text' type instead.
1289                 # 1. https://schema.org/VideoObject
1290                 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None,
1291                 'filesize': float_or_none(e.get('contentSize')),
1292                 'tbr': int_or_none(e.get('bitrate')),
1293                 'width': int_or_none(e.get('width')),
1294                 'height': int_or_none(e.get('height')),
1295                 'view_count': int_or_none(e.get('interactionCount')),
1296             })
1297             extract_interaction_statistic(e)
1298
1299         for e in json_ld:
1300             if '@context' in e:
1301                 item_type = e.get('@type')
1302                 if expected_type is not None and expected_type != item_type:
1303                     continue
1304                 if item_type in ('TVEpisode', 'Episode'):
1305                     episode_name = unescapeHTML(e.get('name'))
1306                     info.update({
1307                         'episode': episode_name,
1308                         'episode_number': int_or_none(e.get('episodeNumber')),
1309                         'description': unescapeHTML(e.get('description')),
1310                     })
1311                     if not info.get('title') and episode_name:
1312                         info['title'] = episode_name
1313                     part_of_season = e.get('partOfSeason')
1314                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1315                         info.update({
1316                             'season': unescapeHTML(part_of_season.get('name')),
1317                             'season_number': int_or_none(part_of_season.get('seasonNumber')),
1318                         })
1319                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1320                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1321                         info['series'] = unescapeHTML(part_of_series.get('name'))
1322                 elif item_type == 'Movie':
1323                     info.update({
1324                         'title': unescapeHTML(e.get('name')),
1325                         'description': unescapeHTML(e.get('description')),
1326                         'duration': parse_duration(e.get('duration')),
1327                         'timestamp': unified_timestamp(e.get('dateCreated')),
1328                     })
1329                 elif item_type in ('Article', 'NewsArticle'):
1330                     info.update({
1331                         'timestamp': parse_iso8601(e.get('datePublished')),
1332                         'title': unescapeHTML(e.get('headline')),
1333                         'description': unescapeHTML(e.get('articleBody')),
1334                     })
1335                 elif item_type == 'VideoObject':
1336                     extract_video_object(e)
1337                     if expected_type is None:
1338                         continue
1339                     else:
1340                         break
1341                 video = e.get('video')
1342                 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1343                     extract_video_object(video)
1344                 if expected_type is None:
1345                     continue
1346                 else:
1347                     break
1348         return dict((k, v) for k, v in info.items() if v is not None)
1349
1350     @staticmethod
1351     def _hidden_inputs(html):
1352         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1353         hidden_inputs = {}
1354         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1355             attrs = extract_attributes(input)
1356             if not input:
1357                 continue
1358             if attrs.get('type') not in ('hidden', 'submit'):
1359                 continue
1360             name = attrs.get('name') or attrs.get('id')
1361             value = attrs.get('value')
1362             if name and value is not None:
1363                 hidden_inputs[name] = value
1364         return hidden_inputs
1365
1366     def _form_hidden_inputs(self, form_id, html):
1367         form = self._search_regex(
1368             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1369             html, '%s form' % form_id, group='form')
1370         return self._hidden_inputs(form)
1371
1372     def _sort_formats(self, formats, field_preference=None):
1373         if not formats:
1374             raise ExtractorError('No video formats found')
1375
1376         for f in formats:
1377             # Automatically determine tbr when missing based on abr and vbr (improves
1378             # formats sorting in some cases)
1379             if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
1380                 f['tbr'] = f['abr'] + f['vbr']
1381
1382         def _formats_key(f):
1383             # TODO remove the following workaround
1384             from ..utils import determine_ext
1385             if not f.get('ext') and 'url' in f:
1386                 f['ext'] = determine_ext(f['url'])
1387
1388             if isinstance(field_preference, (list, tuple)):
1389                 return tuple(
1390                     f.get(field)
1391                     if f.get(field) is not None
1392                     else ('' if field == 'format_id' else -1)
1393                     for field in field_preference)
1394
1395             preference = f.get('preference')
1396             if preference is None:
1397                 preference = 0
1398                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
1399                     preference -= 0.5
1400
1401             protocol = f.get('protocol') or determine_protocol(f)
1402             proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)
1403
1404             if f.get('vcodec') == 'none':  # audio only
1405                 preference -= 50
1406                 if self._downloader.params.get('prefer_free_formats'):
1407                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
1408                 else:
1409                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
1410                 ext_preference = 0
1411                 try:
1412                     audio_ext_preference = ORDER.index(f['ext'])
1413                 except ValueError:
1414                     audio_ext_preference = -1
1415             else:
1416                 if f.get('acodec') == 'none':  # video only
1417                     preference -= 40
1418                 if self._downloader.params.get('prefer_free_formats'):
1419                     ORDER = ['flv', 'mp4', 'webm']
1420                 else:
1421                     ORDER = ['webm', 'flv', 'mp4']
1422                 try:
1423                     ext_preference = ORDER.index(f['ext'])
1424                 except ValueError:
1425                     ext_preference = -1
1426                 audio_ext_preference = 0
1427
1428             return (
1429                 preference,
1430                 f.get('language_preference') if f.get('language_preference') is not None else -1,
1431                 f.get('quality') if f.get('quality') is not None else -1,
1432                 f.get('tbr') if f.get('tbr') is not None else -1,
1433                 f.get('filesize') if f.get('filesize') is not None else -1,
1434                 f.get('vbr') if f.get('vbr') is not None else -1,
1435                 f.get('height') if f.get('height') is not None else -1,
1436                 f.get('width') if f.get('width') is not None else -1,
1437                 proto_preference,
1438                 ext_preference,
1439                 f.get('abr') if f.get('abr') is not None else -1,
1440                 audio_ext_preference,
1441                 f.get('fps') if f.get('fps') is not None else -1,
1442                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
1443                 f.get('source_preference') if f.get('source_preference') is not None else -1,
1444                 f.get('format_id') if f.get('format_id') is not None else '',
1445             )
1446         formats.sort(key=_formats_key)
1447
1448     def _check_formats(self, formats, video_id):
1449         if formats:
1450             formats[:] = filter(
1451                 lambda f: self._is_valid_url(
1452                     f['url'], video_id,
1453                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1454                 formats)
1455
1456     @staticmethod
1457     def _remove_duplicate_formats(formats):
1458         format_urls = set()
1459         unique_formats = []
1460         for f in formats:
1461             if f['url'] not in format_urls:
1462                 format_urls.add(f['url'])
1463                 unique_formats.append(f)
1464         formats[:] = unique_formats
1465
1466     def _is_valid_url(self, url, video_id, item='video', headers={}):
1467         url = self._proto_relative_url(url, scheme='http:')
1468         # For now assume non HTTP(S) URLs always valid
1469         if not (url.startswith('http://') or url.startswith('https://')):
1470             return True
1471         try:
1472             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1473             return True
1474         except ExtractorError as e:
1475             self.to_screen(
1476                 '%s: %s URL is invalid, skipping: %s'
1477                 % (video_id, item, error_to_compat_str(e.cause)))
1478             return False
1479
1480     def http_scheme(self):
1481         """ Either "http:" or "https:", depending on the user's preferences """
1482         return (
1483             'http:'
1484             if self._downloader.params.get('prefer_insecure', False)
1485             else 'https:')
1486
1487     def _proto_relative_url(self, url, scheme=None):
1488         if url is None:
1489             return url
1490         if url.startswith('//'):
1491             if scheme is None:
1492                 scheme = self.http_scheme()
1493             return scheme + url
1494         else:
1495             return url
1496
1497     def _sleep(self, timeout, video_id, msg_template=None):
1498         if msg_template is None:
1499             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1500         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1501         self.to_screen(msg)
1502         time.sleep(timeout)
1503
1504     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1505                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1506                              fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1507         manifest = self._download_xml(
1508             manifest_url, video_id, 'Downloading f4m manifest',
1509             'Unable to download f4m manifest',
1510             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1511             # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1512             transform_source=transform_source,
1513             fatal=fatal, data=data, headers=headers, query=query)
1514
1515         if manifest is False:
1516             return []
1517
1518         return self._parse_f4m_formats(
1519             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1520             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1521
1522     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
1523                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1524                            fatal=True, m3u8_id=None):
1525         if not isinstance(manifest, compat_etree_Element) and not fatal:
1526             return []
1527
1528         # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1529         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1530         if akamai_pv is not None and ';' in akamai_pv.text:
1531             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1532             if playerVerificationChallenge.strip() != '':
1533                 return []
1534
1535         formats = []
1536         manifest_version = '1.0'
1537         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1538         if not media_nodes:
1539             manifest_version = '2.0'
1540             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1541         # Remove unsupported DRM protected media from final formats
1542         # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1543         media_nodes = remove_encrypted_media(media_nodes)
1544         if not media_nodes:
1545             return formats
1546
1547         manifest_base_url = get_base_url(manifest)
1548
1549         bootstrap_info = xpath_element(
1550             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1551             'bootstrap info', default=None)
1552
1553         vcodec = None
1554         mime_type = xpath_text(
1555             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1556             'base URL', default=None)
1557         if mime_type and mime_type.startswith('audio/'):
1558             vcodec = 'none'
1559
1560         for i, media_el in enumerate(media_nodes):
1561             tbr = int_or_none(media_el.attrib.get('bitrate'))
1562             width = int_or_none(media_el.attrib.get('width'))
1563             height = int_or_none(media_el.attrib.get('height'))
1564             format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1565             # If <bootstrapInfo> is present, the specified f4m is a
1566             # stream-level manifest, and only set-level manifests may refer to
1567             # external resources.  See section 11.4 and section 4 of F4M spec
1568             if bootstrap_info is None:
1569                 media_url = None
1570                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1571                 if manifest_version == '2.0':
1572                     media_url = media_el.attrib.get('href')
1573                 if media_url is None:
1574                     media_url = media_el.attrib.get('url')
1575                 if not media_url:
1576                     continue
1577                 manifest_url = (
1578                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1579                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1580                 # If media_url is itself a f4m manifest do the recursive extraction
1581                 # since bitrates in parent manifest (this one) and media_url manifest
1582                 # may differ leading to inability to resolve the format by requested
1583                 # bitrate in f4m downloader
1584                 ext = determine_ext(manifest_url)
1585                 if ext == 'f4m':
1586                     f4m_formats = self._extract_f4m_formats(
1587                         manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1588                         transform_source=transform_source, fatal=fatal)
1589                     # Sometimes stream-level manifest contains single media entry that
1590                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1591                     # At the same time parent's media entry in set-level manifest may
1592                     # contain it. We will copy it from parent in such cases.
1593                     if len(f4m_formats) == 1:
1594                         f = f4m_formats[0]
1595                         f.update({
1596                             'tbr': f.get('tbr') or tbr,
1597                             'width': f.get('width') or width,
1598                             'height': f.get('height') or height,
1599                             'format_id': f.get('format_id') if not tbr else format_id,
1600                             'vcodec': vcodec,
1601                         })
1602                     formats.extend(f4m_formats)
1603                     continue
1604                 elif ext == 'm3u8':
1605                     formats.extend(self._extract_m3u8_formats(
1606                         manifest_url, video_id, 'mp4', preference=preference,
1607                         m3u8_id=m3u8_id, fatal=fatal))
1608                     continue
1609             formats.append({
1610                 'format_id': format_id,
1611                 'url': manifest_url,
1612                 'manifest_url': manifest_url,
1613                 'ext': 'flv' if bootstrap_info is not None else None,
1614                 'protocol': 'f4m',
1615                 'tbr': tbr,
1616                 'width': width,
1617                 'height': height,
1618                 'vcodec': vcodec,
1619                 'preference': preference,
1620             })
1621         return formats
1622
1623     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1624         return {
1625             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1626             'url': m3u8_url,
1627             'ext': ext,
1628             'protocol': 'm3u8',
1629             'preference': preference - 100 if preference else -100,
1630             'resolution': 'multiple',
1631             'format_note': 'Quality selection URL',
1632         }
1633
1634     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1635                               entry_protocol='m3u8', preference=None,
1636                               m3u8_id=None, note=None, errnote=None,
1637                               fatal=True, live=False, data=None, headers={},
1638                               query={}):
1639         res = self._download_webpage_handle(
1640             m3u8_url, video_id,
1641             note=note or 'Downloading m3u8 information',
1642             errnote=errnote or 'Failed to download m3u8 information',
1643             fatal=fatal, data=data, headers=headers, query=query)
1644
1645         if res is False:
1646             return []
1647
1648         m3u8_doc, urlh = res
1649         m3u8_url = urlh.geturl()
1650
1651         return self._parse_m3u8_formats(
1652             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1653             preference=preference, m3u8_id=m3u8_id, live=live)
1654
1655     def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
1656                             entry_protocol='m3u8', preference=None,
1657                             m3u8_id=None, live=False):
1658         if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
1659             return []
1660
1661         if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc):  # Apple FairPlay
1662             return []
1663
1664         formats = []
1665
1666         format_url = lambda u: (
1667             u
1668             if re.match(r'^https?://', u)
1669             else compat_urlparse.urljoin(m3u8_url, u))
1670
1671         # References:
1672         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
1673         # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
1674         # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
1675
1676         # We should try extracting formats only from master playlists [1, 4.3.4],
1677         # i.e. playlists that describe available qualities. On the other hand
1678         # media playlists [1, 4.3.3] should be returned as is since they contain
1679         # just the media without qualities renditions.
1680         # Fortunately, master playlist can be easily distinguished from media
1681         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
1682         # master playlist tags MUST NOT appear in a media playlist and vice versa.
1683         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
1684         # media playlist and MUST NOT appear in master playlist thus we can
1685         # clearly detect media playlist with this criterion.
1686
1687         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
1688             return [{
1689                 'url': m3u8_url,
1690                 'format_id': m3u8_id,
1691                 'ext': ext,
1692                 'protocol': entry_protocol,
1693                 'preference': preference,
1694             }]
1695
1696         groups = {}
1697         last_stream_inf = {}
1698
1699         def extract_media(x_media_line):
1700             media = parse_m3u8_attributes(x_media_line)
1701             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
1702             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
1703             if not (media_type and group_id and name):
1704                 return
1705             groups.setdefault(group_id, []).append(media)
1706             if media_type not in ('VIDEO', 'AUDIO'):
1707                 return
1708             media_url = media.get('URI')
1709             if media_url:
1710                 format_id = []
1711                 for v in (m3u8_id, group_id, name):
1712                     if v:
1713                         format_id.append(v)
1714                 f = {
1715                     'format_id': '-'.join(format_id),
1716                     'url': format_url(media_url),
1717                     'manifest_url': m3u8_url,
1718                     'language': media.get('LANGUAGE'),
1719                     'ext': ext,
1720                     'protocol': entry_protocol,
1721                     'preference': preference,
1722                 }
1723                 if media_type == 'AUDIO':
1724                     f['vcodec'] = 'none'
1725                 formats.append(f)
1726
1727         def build_stream_name():
1728             # Despite specification does not mention NAME attribute for
1729             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
1730             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
1731             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
1732             stream_name = last_stream_inf.get('NAME')
1733             if stream_name:
1734                 return stream_name
1735             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
1736             # from corresponding rendition group
1737             stream_group_id = last_stream_inf.get('VIDEO')
1738             if not stream_group_id:
1739                 return
1740             stream_group = groups.get(stream_group_id)
1741             if not stream_group:
1742                 return stream_group_id
1743             rendition = stream_group[0]
1744             return rendition.get('NAME') or stream_group_id
1745
1746         # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
1747         # chance to detect video only formats when EXT-X-STREAM-INF tags
1748         # precede EXT-X-MEDIA tags in HLS manifest such as [3].
1749         for line in m3u8_doc.splitlines():
1750             if line.startswith('#EXT-X-MEDIA:'):
1751                 extract_media(line)
1752
1753         for line in m3u8_doc.splitlines():
1754             if line.startswith('#EXT-X-STREAM-INF:'):
1755                 last_stream_inf = parse_m3u8_attributes(line)
1756             elif line.startswith('#') or not line.strip():
1757                 continue
1758             else:
1759                 tbr = float_or_none(
1760                     last_stream_inf.get('AVERAGE-BANDWIDTH')
1761                     or last_stream_inf.get('BANDWIDTH'), scale=1000)
1762                 format_id = []
1763                 if m3u8_id:
1764                     format_id.append(m3u8_id)
1765                 stream_name = build_stream_name()
1766                 # Bandwidth of live streams may differ over time thus making
1767                 # format_id unpredictable. So it's better to keep provided
1768                 # format_id intact.
1769                 if not live:
1770                     format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
1771                 manifest_url = format_url(line.strip())
1772                 f = {
1773                     'format_id': '-'.join(format_id),
1774                     'url': manifest_url,
1775                     'manifest_url': m3u8_url,
1776                     'tbr': tbr,
1777                     'ext': ext,
1778                     'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
1779                     'protocol': entry_protocol,
1780                     'preference': preference,
1781                 }
1782                 resolution = last_stream_inf.get('RESOLUTION')
1783                 if resolution:
1784                     mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
1785                     if mobj:
1786                         f['width'] = int(mobj.group('width'))
1787                         f['height'] = int(mobj.group('height'))
1788                 # Unified Streaming Platform
1789                 mobj = re.search(
1790                     r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
1791                 if mobj:
1792                     abr, vbr = mobj.groups()
1793                     abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
1794                     f.update({
1795                         'vbr': vbr,
1796                         'abr': abr,
1797                     })
1798                 codecs = parse_codecs(last_stream_inf.get('CODECS'))
1799                 f.update(codecs)
1800                 audio_group_id = last_stream_inf.get('AUDIO')
1801                 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
1802                 # references a rendition group MUST have a CODECS attribute.
1803                 # However, this is not always respected, for example, [2]
1804                 # contains EXT-X-STREAM-INF tag which references AUDIO
1805                 # rendition group but does not have CODECS and despite
1806                 # referencing an audio group it represents a complete
1807                 # (with audio and video) format. So, for such cases we will
1808                 # ignore references to rendition groups and treat them
1809                 # as complete formats.
1810                 if audio_group_id and codecs and f.get('vcodec') != 'none':
1811                     audio_group = groups.get(audio_group_id)
1812                     if audio_group and audio_group[0].get('URI'):
1813                         # TODO: update acodec for audio only formats with
1814                         # the same GROUP-ID
1815                         f['acodec'] = 'none'
1816                 formats.append(f)
1817
1818                 # for DailyMotion
1819                 progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
1820                 if progressive_uri:
1821                     http_f = f.copy()
1822                     del http_f['manifest_url']
1823                     http_f.update({
1824                         'format_id': f['format_id'].replace('hls-', 'http-'),
1825                         'protocol': 'http',
1826                         'url': progressive_uri,
1827                     })
1828                     formats.append(http_f)
1829
1830                 last_stream_inf = {}
1831         return formats
1832
1833     @staticmethod
1834     def _xpath_ns(path, namespace=None):
1835         if not namespace:
1836             return path
1837         out = []
1838         for c in path.split('/'):
1839             if not c or c == '.':
1840                 out.append(c)
1841             else:
1842                 out.append('{%s}%s' % (namespace, c))
1843         return '/'.join(out)
1844
1845     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1846         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1847
1848         if smil is False:
1849             assert not fatal
1850             return []
1851
1852         namespace = self._parse_smil_namespace(smil)
1853
1854         return self._parse_smil_formats(
1855             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1856
1857     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1858         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1859         if smil is False:
1860             return {}
1861         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1862
1863     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1864         return self._download_xml(
1865             smil_url, video_id, 'Downloading SMIL file',
1866             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1867
1868     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1869         namespace = self._parse_smil_namespace(smil)
1870
1871         formats = self._parse_smil_formats(
1872             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1873         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1874
1875         video_id = os.path.splitext(url_basename(smil_url))[0]
1876         title = None
1877         description = None
1878         upload_date = None
1879         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1880             name = meta.attrib.get('name')
1881             content = meta.attrib.get('content')
1882             if not name or not content:
1883                 continue
1884             if not title and name == 'title':
1885                 title = content
1886             elif not description and name in ('description', 'abstract'):
1887                 description = content
1888             elif not upload_date and name == 'date':
1889                 upload_date = unified_strdate(content)
1890
1891         thumbnails = [{
1892             'id': image.get('type'),
1893             'url': image.get('src'),
1894             'width': int_or_none(image.get('width')),
1895             'height': int_or_none(image.get('height')),
1896         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1897
1898         return {
1899             'id': video_id,
1900             'title': title or video_id,
1901             'description': description,
1902             'upload_date': upload_date,
1903             'thumbnails': thumbnails,
1904             'formats': formats,
1905             'subtitles': subtitles,
1906         }
1907
1908     def _parse_smil_namespace(self, smil):
1909         return self._search_regex(
1910             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1911
1912     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1913         base = smil_url
1914         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1915             b = meta.get('base') or meta.get('httpBase')
1916             if b:
1917                 base = b
1918                 break
1919
1920         formats = []
1921         rtmp_count = 0
1922         http_count = 0
1923         m3u8_count = 0
1924
1925         srcs = []
1926         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1927         for medium in media:
1928             src = medium.get('src')
1929             if not src or src in srcs:
1930                 continue
1931             srcs.append(src)
1932
1933             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1934             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1935             width = int_or_none(medium.get('width'))
1936             height = int_or_none(medium.get('height'))
1937             proto = medium.get('proto')
1938             ext = medium.get('ext')
1939             src_ext = determine_ext(src)
1940             streamer = medium.get('streamer') or base
1941
1942             if proto == 'rtmp' or streamer.startswith('rtmp'):
1943                 rtmp_count += 1
1944                 formats.append({
1945                     'url': streamer,
1946                     'play_path': src,
1947                     'ext': 'flv',
1948                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1949                     'tbr': bitrate,
1950                     'filesize': filesize,
1951                     'width': width,
1952                     'height': height,
1953                 })
1954                 if transform_rtmp_url:
1955                     streamer, src = transform_rtmp_url(streamer, src)
1956                     formats[-1].update({
1957                         'url': streamer,
1958                         'play_path': src,
1959                     })
1960                 continue
1961
1962             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1963             src_url = src_url.strip()
1964
1965             if proto == 'm3u8' or src_ext == 'm3u8':
1966                 m3u8_formats = self._extract_m3u8_formats(
1967                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1968                 if len(m3u8_formats) == 1:
1969                     m3u8_count += 1
1970                     m3u8_formats[0].update({
1971                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1972                         'tbr': bitrate,
1973                         'width': width,
1974                         'height': height,
1975                     })
1976                 formats.extend(m3u8_formats)
1977             elif src_ext == 'f4m':
1978                 f4m_url = src_url
1979                 if not f4m_params:
1980                     f4m_params = {
1981                         'hdcore': '3.2.0',
1982                         'plugin': 'flowplayer-3.2.0.1',
1983                     }
1984                 f4m_url += '&' if '?' in f4m_url else '?'
1985                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1986                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1987             elif src_ext == 'mpd':
1988                 formats.extend(self._extract_mpd_formats(
1989                     src_url, video_id, mpd_id='dash', fatal=False))
1990             elif re.search(r'\.ism/[Mm]anifest', src_url):
1991                 formats.extend(self._extract_ism_formats(
1992                     src_url, video_id, ism_id='mss', fatal=False))
1993             elif src_url.startswith('http') and self._is_valid_url(src, video_id):
1994                 http_count += 1
1995                 formats.append({
1996                     'url': src_url,
1997                     'ext': ext or src_ext or 'flv',
1998                     'format_id': 'http-%d' % (bitrate or http_count),
1999                     'tbr': bitrate,
2000                     'filesize': filesize,
2001                     'width': width,
2002                     'height': height,
2003                 })
2004
2005         return formats
2006
2007     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
2008         urls = []
2009         subtitles = {}
2010         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
2011             src = textstream.get('src')
2012             if not src or src in urls:
2013                 continue
2014             urls.append(src)
2015             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
2016             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
2017             subtitles.setdefault(lang, []).append({
2018                 'url': src,
2019                 'ext': ext,
2020             })
2021         return subtitles
2022
2023     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
2024         xspf = self._download_xml(
2025             xspf_url, playlist_id, 'Downloading xpsf playlist',
2026             'Unable to download xspf manifest', fatal=fatal)
2027         if xspf is False:
2028             return []
2029         return self._parse_xspf(
2030             xspf, playlist_id, xspf_url=xspf_url,
2031             xspf_base_url=base_url(xspf_url))
2032
2033     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
2034         NS_MAP = {
2035             'xspf': 'http://xspf.org/ns/0/',
2036             's1': 'http://static.streamone.nl/player/ns/0',
2037         }
2038
2039         entries = []
2040         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
2041             title = xpath_text(
2042                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
2043             description = xpath_text(
2044                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
2045             thumbnail = xpath_text(
2046                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
2047             duration = float_or_none(
2048                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
2049
2050             formats = []
2051             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
2052                 format_url = urljoin(xspf_base_url, location.text)
2053                 if not format_url:
2054                     continue
2055                 formats.append({
2056                     'url': format_url,
2057                     'manifest_url': xspf_url,
2058                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
2059                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
2060                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
2061                 })
2062             self._sort_formats(formats)
2063
2064             entries.append({
2065                 'id': playlist_id,
2066                 'title': title,
2067                 'description': description,
2068                 'thumbnail': thumbnail,
2069                 'duration': duration,
2070                 'formats': formats,
2071             })
2072         return entries
2073
2074     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2075         res = self._download_xml_handle(
2076             mpd_url, video_id,
2077             note=note or 'Downloading MPD manifest',
2078             errnote=errnote or 'Failed to download MPD manifest',
2079             fatal=fatal, data=data, headers=headers, query=query)
2080         if res is False:
2081             return []
2082         mpd_doc, urlh = res
2083         if mpd_doc is None:
2084             return []
2085         mpd_base_url = base_url(urlh.geturl())
2086
2087         return self._parse_mpd_formats(
2088             mpd_doc, mpd_id, mpd_base_url, mpd_url)
2089
2090     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
2091         """
2092         Parse formats from MPD manifest.
2093         References:
2094          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
2095             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
2096          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
2097         """
2098         if mpd_doc.get('type') == 'dynamic':
2099             return []
2100
2101         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
2102
2103         def _add_ns(path):
2104             return self._xpath_ns(path, namespace)
2105
2106         def is_drm_protected(element):
2107             return element.find(_add_ns('ContentProtection')) is not None
2108
2109         def extract_multisegment_info(element, ms_parent_info):
2110             ms_info = ms_parent_info.copy()
2111
2112             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
2113             # common attributes and elements.  We will only extract relevant
2114             # for us.
2115             def extract_common(source):
2116                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
2117                 if segment_timeline is not None:
2118                     s_e = segment_timeline.findall(_add_ns('S'))
2119                     if s_e:
2120                         ms_info['total_number'] = 0
2121                         ms_info['s'] = []
2122                         for s in s_e:
2123                             r = int(s.get('r', 0))
2124                             ms_info['total_number'] += 1 + r
2125                             ms_info['s'].append({
2126                                 't': int(s.get('t', 0)),
2127                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
2128                                 'd': int(s.attrib['d']),
2129                                 'r': r,
2130                             })
2131                 start_number = source.get('startNumber')
2132                 if start_number:
2133                     ms_info['start_number'] = int(start_number)
2134                 timescale = source.get('timescale')
2135                 if timescale:
2136                     ms_info['timescale'] = int(timescale)
2137                 segment_duration = source.get('duration')
2138                 if segment_duration:
2139                     ms_info['segment_duration'] = float(segment_duration)
2140
2141             def extract_Initialization(source):
2142                 initialization = source.find(_add_ns('Initialization'))
2143                 if initialization is not None:
2144                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
2145
2146             segment_list = element.find(_add_ns('SegmentList'))
2147             if segment_list is not None:
2148                 extract_common(segment_list)
2149                 extract_Initialization(segment_list)
2150                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
2151                 if segment_urls_e:
2152                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
2153             else:
2154                 segment_template = element.find(_add_ns('SegmentTemplate'))
2155                 if segment_template is not None:
2156                     extract_common(segment_template)
2157                     media = segment_template.get('media')
2158                     if media:
2159                         ms_info['media'] = media
2160                     initialization = segment_template.get('initialization')
2161                     if initialization:
2162                         ms_info['initialization'] = initialization
2163                     else:
2164                         extract_Initialization(segment_template)
2165             return ms_info
2166
2167         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
2168         formats = []
2169         for period in mpd_doc.findall(_add_ns('Period')):
2170             period_duration = parse_duration(period.get('duration')) or mpd_duration
2171             period_ms_info = extract_multisegment_info(period, {
2172                 'start_number': 1,
2173                 'timescale': 1,
2174             })
2175             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
2176                 if is_drm_protected(adaptation_set):
2177                     continue
2178                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
2179                 for representation in adaptation_set.findall(_add_ns('Representation')):
2180                     if is_drm_protected(representation):
2181                         continue
2182                     representation_attrib = adaptation_set.attrib.copy()
2183                     representation_attrib.update(representation.attrib)
2184                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
2185                     mime_type = representation_attrib['mimeType']
2186                     content_type = mime_type.split('/')[0]
2187                     if content_type == 'text':
2188                         # TODO implement WebVTT downloading
2189                         pass
2190                     elif content_type in ('video', 'audio'):
2191                         base_url = ''
2192                         for element in (representation, adaptation_set, period, mpd_doc):
2193                             base_url_e = element.find(_add_ns('BaseURL'))
2194                             if base_url_e is not None:
2195                                 base_url = base_url_e.text + base_url
2196                                 if re.match(r'^https?://', base_url):
2197                                     break
2198                         if mpd_base_url and not re.match(r'^https?://', base_url):
2199                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
2200                                 mpd_base_url += '/'
2201                             base_url = mpd_base_url + base_url
2202                         representation_id = representation_attrib.get('id')
2203                         lang = representation_attrib.get('lang')
2204                         url_el = representation.find(_add_ns('BaseURL'))
2205                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
2206                         bandwidth = int_or_none(representation_attrib.get('bandwidth'))
2207                         f = {
2208                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
2209                             'manifest_url': mpd_url,
2210                             'ext': mimetype2ext(mime_type),
2211                             'width': int_or_none(representation_attrib.get('width')),
2212                             'height': int_or_none(representation_attrib.get('height')),
2213                             'tbr': float_or_none(bandwidth, 1000),
2214                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2215                             'fps': int_or_none(representation_attrib.get('frameRate')),
2216                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2217                             'format_note': 'DASH %s' % content_type,
2218                             'filesize': filesize,
2219                             'container': mimetype2ext(mime_type) + '_dash',
2220                         }
2221                         f.update(parse_codecs(representation_attrib.get('codecs')))
2222                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2223
2224                         def prepare_template(template_name, identifiers):
2225                             tmpl = representation_ms_info[template_name]
2226                             # First of, % characters outside $...$ templates
2227                             # must be escaped by doubling for proper processing
2228                             # by % operator string formatting used further (see
2229                             # https://github.com/ytdl-org/youtube-dl/issues/16867).
2230                             t = ''
2231                             in_template = False
2232                             for c in tmpl:
2233                                 t += c
2234                                 if c == '$':
2235                                     in_template = not in_template
2236                                 elif c == '%' and not in_template:
2237                                     t += c
2238                             # Next, $...$ templates are translated to their
2239                             # %(...) counterparts to be used with % operator
2240                             t = t.replace('$RepresentationID$', representation_id)
2241                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2242                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2243                             t.replace('$$', '$')
2244                             return t
2245
2246                         # @initialization is a regular template like @media one
2247                         # so it should be handled just the same way (see
2248                         # https://github.com/ytdl-org/youtube-dl/issues/11605)
2249                         if 'initialization' in representation_ms_info:
2250                             initialization_template = prepare_template(
2251                                 'initialization',
2252                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2253                                 # $Time$ shall not be included for @initialization thus
2254                                 # only $Bandwidth$ remains
2255                                 ('Bandwidth', ))
2256                             representation_ms_info['initialization_url'] = initialization_template % {
2257                                 'Bandwidth': bandwidth,
2258                             }
2259
2260                         def location_key(location):
2261                             return 'url' if re.match(r'^https?://', location) else 'path'
2262
2263                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2264
2265                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2266                             media_location_key = location_key(media_template)
2267
2268                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2269                             # can't be used at the same time
2270                             if '%(Number' in media_template and 's' not in representation_ms_info:
2271                                 segment_duration = None
2272                                 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2273                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2274                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2275                                 representation_ms_info['fragments'] = [{
2276                                     media_location_key: media_template % {
2277                                         'Number': segment_number,
2278                                         'Bandwidth': bandwidth,
2279                                     },
2280                                     'duration': segment_duration,
2281                                 } for segment_number in range(
2282                                     representation_ms_info['start_number'],
2283                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2284                             else:
2285                                 # $Number*$ or $Time$ in media template with S list available
2286                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2287                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2288                                 representation_ms_info['fragments'] = []
2289                                 segment_time = 0
2290                                 segment_d = None
2291                                 segment_number = representation_ms_info['start_number']
2292
2293                                 def add_segment_url():
2294                                     segment_url = media_template % {
2295                                         'Time': segment_time,
2296                                         'Bandwidth': bandwidth,
2297                                         'Number': segment_number,
2298                                     }
2299                                     representation_ms_info['fragments'].append({
2300                                         media_location_key: segment_url,
2301                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2302                                     })
2303
2304                                 for num, s in enumerate(representation_ms_info['s']):
2305                                     segment_time = s.get('t') or segment_time
2306                                     segment_d = s['d']
2307                                     add_segment_url()
2308                                     segment_number += 1
2309                                     for r in range(s.get('r', 0)):
2310                                         segment_time += segment_d
2311                                         add_segment_url()
2312                                         segment_number += 1
2313                                     segment_time += segment_d
2314                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2315                             # No media template
2316                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2317                             # or any YouTube dashsegments video
2318                             fragments = []
2319                             segment_index = 0
2320                             timescale = representation_ms_info['timescale']
2321                             for s in representation_ms_info['s']:
2322                                 duration = float_or_none(s['d'], timescale)
2323                                 for r in range(s.get('r', 0) + 1):
2324                                     segment_uri = representation_ms_info['segment_urls'][segment_index]
2325                                     fragments.append({
2326                                         location_key(segment_uri): segment_uri,
2327                                         'duration': duration,
2328                                     })
2329                                     segment_index += 1
2330                             representation_ms_info['fragments'] = fragments
2331                         elif 'segment_urls' in representation_ms_info:
2332                             # Segment URLs with no SegmentTimeline
2333                             # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2334                             # https://github.com/ytdl-org/youtube-dl/pull/14844
2335                             fragments = []
2336                             segment_duration = float_or_none(
2337                                 representation_ms_info['segment_duration'],
2338                                 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2339                             for segment_url in representation_ms_info['segment_urls']:
2340                                 fragment = {
2341                                     location_key(segment_url): segment_url,
2342                                 }
2343                                 if segment_duration:
2344                                     fragment['duration'] = segment_duration
2345                                 fragments.append(fragment)
2346                             representation_ms_info['fragments'] = fragments
2347                         # If there is a fragments key available then we correctly recognized fragmented media.
2348                         # Otherwise we will assume unfragmented media with direct access. Technically, such
2349                         # assumption is not necessarily correct since we may simply have no support for
2350                         # some forms of fragmented media renditions yet, but for now we'll use this fallback.
2351                         if 'fragments' in representation_ms_info:
2352                             f.update({
2353                                 # NB: mpd_url may be empty when MPD manifest is parsed from a string
2354                                 'url': mpd_url or base_url,
2355                                 'fragment_base_url': base_url,
2356                                 'fragments': [],
2357                                 'protocol': 'http_dash_segments',
2358                             })
2359                             if 'initialization_url' in representation_ms_info:
2360                                 initialization_url = representation_ms_info['initialization_url']
2361                                 if not f.get('url'):
2362                                     f['url'] = initialization_url
2363                                 f['fragments'].append({location_key(initialization_url): initialization_url})
2364                             f['fragments'].extend(representation_ms_info['fragments'])
2365                         else:
2366                             # Assuming direct URL to unfragmented media.
2367                             f['url'] = base_url
2368                         formats.append(f)
2369                     else:
2370                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2371         return formats
2372
2373     def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
2374         res = self._download_xml_handle(
2375             ism_url, video_id,
2376             note=note or 'Downloading ISM manifest',
2377             errnote=errnote or 'Failed to download ISM manifest',
2378             fatal=fatal, data=data, headers=headers, query=query)
2379         if res is False:
2380             return []
2381         ism_doc, urlh = res
2382         if ism_doc is None:
2383             return []
2384
2385         return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
2386
2387     def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
2388         """
2389         Parse formats from ISM manifest.
2390         References:
2391          1. [MS-SSTR]: Smooth Streaming Protocol,
2392             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2393         """
2394         if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
2395             return []
2396
2397         duration = int(ism_doc.attrib['Duration'])
2398         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2399
2400         formats = []
2401         for stream in ism_doc.findall('StreamIndex'):
2402             stream_type = stream.get('Type')
2403             if stream_type not in ('video', 'audio'):
2404                 continue
2405             url_pattern = stream.attrib['Url']
2406             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2407             stream_name = stream.get('Name')
2408             for track in stream.findall('QualityLevel'):
2409                 fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
2410                 # TODO: add support for WVC1 and WMAP
2411                 if fourcc not in ('H264', 'AVC1', 'AACL'):
2412                     self.report_warning('%s is not a supported codec' % fourcc)
2413                     continue
2414                 tbr = int(track.attrib['Bitrate']) // 1000
2415                 # [1] does not mention Width and Height attributes. However,
2416                 # they're often present while MaxWidth and MaxHeight are
2417                 # missing, so should be used as fallbacks
2418                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2419                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2420                 sampling_rate = int_or_none(track.get('SamplingRate'))
2421
2422                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2423                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2424
2425                 fragments = []
2426                 fragment_ctx = {
2427                     'time': 0,
2428                 }
2429                 stream_fragments = stream.findall('c')
2430                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2431                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2432                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2433                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2434                     if not fragment_ctx['duration']:
2435                         try:
2436                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2437                         except IndexError:
2438                             next_fragment_time = duration
2439                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2440                     for _ in range(fragment_repeat):
2441                         fragments.append({
2442                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2443                             'duration': fragment_ctx['duration'] / stream_timescale,
2444                         })
2445                         fragment_ctx['time'] += fragment_ctx['duration']
2446
2447                 format_id = []
2448                 if ism_id:
2449                     format_id.append(ism_id)
2450                 if stream_name:
2451                     format_id.append(stream_name)
2452                 format_id.append(compat_str(tbr))
2453
2454                 formats.append({
2455                     'format_id': '-'.join(format_id),
2456                     'url': ism_url,
2457                     'manifest_url': ism_url,
2458                     'ext': 'ismv' if stream_type == 'video' else 'isma',
2459                     'width': width,
2460                     'height': height,
2461                     'tbr': tbr,
2462                     'asr': sampling_rate,
2463                     'vcodec': 'none' if stream_type == 'audio' else fourcc,
2464                     'acodec': 'none' if stream_type == 'video' else fourcc,
2465                     'protocol': 'ism',
2466                     'fragments': fragments,
2467                     '_download_params': {
2468                         'duration': duration,
2469                         'timescale': stream_timescale,
2470                         'width': width or 0,
2471                         'height': height or 0,
2472                         'fourcc': fourcc,
2473                         'codec_private_data': track.get('CodecPrivateData'),
2474                         'sampling_rate': sampling_rate,
2475                         'channels': int_or_none(track.get('Channels', 2)),
2476                         'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2477                         'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2478                     },
2479                 })
2480         return formats
2481
2482     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
2483         def absolute_url(item_url):
2484             return urljoin(base_url, item_url)
2485
2486         def parse_content_type(content_type):
2487             if not content_type:
2488                 return {}
2489             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2490             if ctr:
2491                 mimetype, codecs = ctr.groups()
2492                 f = parse_codecs(codecs)
2493                 f['ext'] = mimetype2ext(mimetype)
2494                 return f
2495             return {}
2496
2497         def _media_formats(src, cur_media_type, type_info={}):
2498             full_url = absolute_url(src)
2499             ext = type_info.get('ext') or determine_ext(full_url)
2500             if ext == 'm3u8':
2501                 is_plain_url = False
2502                 formats = self._extract_m3u8_formats(
2503                     full_url, video_id, ext='mp4',
2504                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
2505                     preference=preference, fatal=False)
2506             elif ext == 'mpd':
2507                 is_plain_url = False
2508                 formats = self._extract_mpd_formats(
2509                     full_url, video_id, mpd_id=mpd_id, fatal=False)
2510             else:
2511                 is_plain_url = True
2512                 formats = [{
2513                     'url': full_url,
2514                     'vcodec': 'none' if cur_media_type == 'audio' else None,
2515                 }]
2516             return is_plain_url, formats
2517
2518         entries = []
2519         # amp-video and amp-audio are very similar to their HTML5 counterparts
2520         # so we wll include them right here (see
2521         # https://www.ampproject.org/docs/reference/components/amp-video)
2522         # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
2523         _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
2524         media_tags = [(media_tag, media_tag_name, media_type, '')
2525                       for media_tag, media_tag_name, media_type
2526                       in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
2527         media_tags.extend(re.findall(
2528             # We only allow video|audio followed by a whitespace or '>'.
2529             # Allowing more characters may end up in significant slow down (see
2530             # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
2531             # http://www.porntrex.com/maps/videositemap.xml).
2532             r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
2533         for media_tag, _, media_type, media_content in media_tags:
2534             media_info = {
2535                 'formats': [],
2536                 'subtitles': {},
2537             }
2538             media_attributes = extract_attributes(media_tag)
2539             src = strip_or_none(media_attributes.get('src'))
2540             if src:
2541                 _, formats = _media_formats(src, media_type)
2542                 media_info['formats'].extend(formats)
2543             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
2544             if media_content:
2545                 for source_tag in re.findall(r'<source[^>]+>', media_content):
2546                     s_attr = extract_attributes(source_tag)
2547                     # data-video-src and data-src are non standard but seen
2548                     # several times in the wild
2549                     src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
2550                     if not src:
2551                         continue
2552                     f = parse_content_type(s_attr.get('type'))
2553                     is_plain_url, formats = _media_formats(src, media_type, f)
2554                     if is_plain_url:
2555                         # width, height, res, label and title attributes are
2556                         # all not standard but seen several times in the wild
2557                         labels = [
2558                             s_attr.get(lbl)
2559                             for lbl in ('label', 'title')
2560                             if str_or_none(s_attr.get(lbl))
2561                         ]
2562                         width = int_or_none(s_attr.get('width'))
2563                         height = (int_or_none(s_attr.get('height'))
2564                                   or int_or_none(s_attr.get('res')))
2565                         if not width or not height:
2566                             for lbl in labels:
2567                                 resolution = parse_resolution(lbl)
2568                                 if not resolution:
2569                                     continue
2570                                 width = width or resolution.get('width')
2571                                 height = height or resolution.get('height')
2572                         for lbl in labels:
2573                             tbr = parse_bitrate(lbl)
2574                             if tbr:
2575                                 break
2576                         else:
2577                             tbr = None
2578                         f.update({
2579                             'width': width,
2580                             'height': height,
2581                             'tbr': tbr,
2582                             'format_id': s_attr.get('label') or s_attr.get('title'),
2583                         })
2584                         f.update(formats[0])
2585                         media_info['formats'].append(f)
2586                     else:
2587                         media_info['formats'].extend(formats)
2588                 for track_tag in re.findall(r'<track[^>]+>', media_content):
2589                     track_attributes = extract_attributes(track_tag)
2590                     kind = track_attributes.get('kind')
2591                     if not kind or kind in ('subtitles', 'captions'):
2592                         src = strip_or_none(track_attributes.get('src'))
2593                         if not src:
2594                             continue
2595                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
2596                         media_info['subtitles'].setdefault(lang, []).append({
2597                             'url': absolute_url(src),
2598                         })
2599             for f in media_info['formats']:
2600                 f.setdefault('http_headers', {})['Referer'] = base_url
2601             if media_info['formats'] or media_info['subtitles']:
2602                 entries.append(media_info)
2603         return entries
2604
2605     def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2606         signed = 'hdnea=' in manifest_url
2607         if not signed:
2608             # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
2609             manifest_url = re.sub(
2610                 r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
2611                 '', manifest_url).strip('?')
2612
2613         formats = []
2614
2615         hdcore_sign = 'hdcore=3.7.0'
2616         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2617         hds_host = hosts.get('hds')
2618         if hds_host:
2619             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2620         if 'hdcore=' not in f4m_url:
2621             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2622         f4m_formats = self._extract_f4m_formats(
2623             f4m_url, video_id, f4m_id='hds', fatal=False)
2624         for entry in f4m_formats:
2625             entry.update({'extra_param_to_segment_url': hdcore_sign})
2626         formats.extend(f4m_formats)
2627
2628         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2629         hls_host = hosts.get('hls')
2630         if hls_host:
2631             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2632         m3u8_formats = self._extract_m3u8_formats(
2633             m3u8_url, video_id, 'mp4', 'm3u8_native',
2634             m3u8_id='hls', fatal=False)
2635         formats.extend(m3u8_formats)
2636
2637         http_host = hosts.get('http')
2638         if http_host and m3u8_formats and not signed:
2639             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
2640             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
2641             qualities_length = len(qualities)
2642             if len(m3u8_formats) in (qualities_length, qualities_length + 1):
2643                 i = 0
2644                 for f in m3u8_formats:
2645                     if f['vcodec'] != 'none':
2646                         for protocol in ('http', 'https'):
2647                             http_f = f.copy()
2648                             del http_f['manifest_url']
2649                             http_url = re.sub(
2650                                 REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
2651                             http_f.update({
2652                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
2653                                 'url': http_url,
2654                                 'protocol': protocol,
2655                             })
2656                             formats.append(http_f)
2657                         i += 1
2658
2659         return formats
2660
2661     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
2662         query = compat_urlparse.urlparse(url).query
2663         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
2664         mobj = re.search(
2665             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
2666         url_base = mobj.group('url')
2667         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
2668         formats = []
2669
2670         def manifest_url(manifest):
2671             m_url = '%s/%s' % (http_base_url, manifest)
2672             if query:
2673                 m_url += '?%s' % query
2674             return m_url
2675
2676         if 'm3u8' not in skip_protocols:
2677             formats.extend(self._extract_m3u8_formats(
2678                 manifest_url('playlist.m3u8'), video_id, 'mp4',
2679                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2680         if 'f4m' not in skip_protocols:
2681             formats.extend(self._extract_f4m_formats(
2682                 manifest_url('manifest.f4m'),
2683                 video_id, f4m_id='hds', fatal=False))
2684         if 'dash' not in skip_protocols:
2685             formats.extend(self._extract_mpd_formats(
2686                 manifest_url('manifest.mpd'),
2687                 video_id, mpd_id='dash', fatal=False))
2688         if re.search(r'(?:/smil:|\.smil)', url_base):
2689             if 'smil' not in skip_protocols:
2690                 rtmp_formats = self._extract_smil_formats(
2691                     manifest_url('jwplayer.smil'),
2692                     video_id, fatal=False)
2693                 for rtmp_format in rtmp_formats:
2694                     rtsp_format = rtmp_format.copy()
2695                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2696                     del rtsp_format['play_path']
2697                     del rtsp_format['ext']
2698                     rtsp_format.update({
2699                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2700                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2701                         'protocol': 'rtsp',
2702                     })
2703                     formats.extend([rtmp_format, rtsp_format])
2704         else:
2705             for protocol in ('rtmp', 'rtsp'):
2706                 if protocol not in skip_protocols:
2707                     formats.append({
2708                         'url': '%s:%s' % (protocol, url_base),
2709                         'format_id': protocol,
2710                         'protocol': protocol,
2711                     })
2712         return formats
2713
2714     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
2715         mobj = re.search(
2716             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
2717             webpage)
2718         if mobj:
2719             try:
2720                 jwplayer_data = self._parse_json(mobj.group('options'),
2721                                                  video_id=video_id,
2722                                                  transform_source=transform_source)
2723             except ExtractorError:
2724                 pass
2725             else:
2726                 if isinstance(jwplayer_data, dict):
2727                     return jwplayer_data
2728
2729     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2730         jwplayer_data = self._find_jwplayer_data(
2731             webpage, video_id, transform_source=js_to_json)
2732         return self._parse_jwplayer_data(
2733             jwplayer_data, video_id, *args, **kwargs)
2734
2735     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
2736                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2737         # JWPlayer backward compatibility: flattened playlists
2738         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
2739         if 'playlist' not in jwplayer_data:
2740             jwplayer_data = {'playlist': [jwplayer_data]}
2741
2742         entries = []
2743
2744         # JWPlayer backward compatibility: single playlist item
2745         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
2746         if not isinstance(jwplayer_data['playlist'], list):
2747             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
2748
2749         for video_data in jwplayer_data['playlist']:
2750             # JWPlayer backward compatibility: flattened sources
2751             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
2752             if 'sources' not in video_data:
2753                 video_data['sources'] = [video_data]
2754
2755             this_video_id = video_id or video_data['mediaid']
2756
2757             formats = self._parse_jwplayer_formats(
2758                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
2759                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
2760
2761             subtitles = {}
2762             tracks = video_data.get('tracks')
2763             if tracks and isinstance(tracks, list):
2764                 for track in tracks:
2765                     if not isinstance(track, dict):
2766                         continue
2767                     track_kind = track.get('kind')
2768                     if not track_kind or not isinstance(track_kind, compat_str):
2769                         continue
2770                     if track_kind.lower() not in ('captions', 'subtitles'):
2771                         continue
2772                     track_url = urljoin(base_url, track.get('file'))
2773                     if not track_url:
2774                         continue
2775                     subtitles.setdefault(track.get('label') or 'en', []).append({
2776                         'url': self._proto_relative_url(track_url)
2777                     })
2778
2779             entry = {
2780                 'id': this_video_id,
2781                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
2782                 'description': clean_html(video_data.get('description')),
2783                 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
2784                 'timestamp': int_or_none(video_data.get('pubdate')),
2785                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
2786                 'subtitles': subtitles,
2787             }
2788             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
2789             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
2790                 entry.update({
2791                     '_type': 'url_transparent',
2792                     'url': formats[0]['url'],
2793                 })
2794             else:
2795                 self._sort_formats(formats)
2796                 entry['formats'] = formats
2797             entries.append(entry)
2798         if len(entries) == 1:
2799             return entries[0]
2800         else:
2801             return self.playlist_result(entries)
2802
2803     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
2804                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2805         urls = []
2806         formats = []
2807         for source in jwplayer_sources_data:
2808             if not isinstance(source, dict):
2809                 continue
2810             source_url = urljoin(
2811                 base_url, self._proto_relative_url(source.get('file')))
2812             if not source_url or source_url in urls:
2813                 continue
2814             urls.append(source_url)
2815             source_type = source.get('type') or ''
2816             ext = mimetype2ext(source_type) or determine_ext(source_url)
2817             if source_type == 'hls' or ext == 'm3u8':
2818                 formats.extend(self._extract_m3u8_formats(
2819                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
2820                     m3u8_id=m3u8_id, fatal=False))
2821             elif source_type == 'dash' or ext == 'mpd':
2822                 formats.extend(self._extract_mpd_formats(
2823                     source_url, video_id, mpd_id=mpd_id, fatal=False))
2824             elif ext == 'smil':
2825                 formats.extend(self._extract_smil_formats(
2826                     source_url, video_id, fatal=False))
2827             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
2828             elif source_type.startswith('audio') or ext in (
2829                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
2830                 formats.append({
2831                     'url': source_url,
2832                     'vcodec': 'none',
2833                     'ext': ext,
2834                 })
2835             else:
2836                 height = int_or_none(source.get('height'))
2837                 if height is None:
2838                     # Often no height is provided but there is a label in
2839                     # format like "1080p", "720p SD", or 1080.
2840                     height = int_or_none(self._search_regex(
2841                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
2842                         'height', default=None))
2843                 a_format = {
2844                     'url': source_url,
2845                     'width': int_or_none(source.get('width')),
2846                     'height': height,
2847                     'tbr': int_or_none(source.get('bitrate')),
2848                     'ext': ext,
2849                 }
2850                 if source_url.startswith('rtmp'):
2851                     a_format['ext'] = 'flv'
2852                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
2853                     # of jwplayer.flash.swf
2854                     rtmp_url_parts = re.split(
2855                         r'((?:mp4|mp3|flv):)', source_url, 1)
2856                     if len(rtmp_url_parts) == 3:
2857                         rtmp_url, prefix, play_path = rtmp_url_parts
2858                         a_format.update({
2859                             'url': rtmp_url,
2860                             'play_path': prefix + play_path,
2861                         })
2862                     if rtmp_params:
2863                         a_format.update(rtmp_params)
2864                 formats.append(a_format)
2865         return formats
2866
2867     def _live_title(self, name):
2868         """ Generate the title for a live video """
2869         now = datetime.datetime.now()
2870         now_str = now.strftime('%Y-%m-%d %H:%M')
2871         return name + ' ' + now_str
2872
2873     def _int(self, v, name, fatal=False, **kwargs):
2874         res = int_or_none(v, **kwargs)
2875         if 'get_attr' in kwargs:
2876             print(getattr(v, kwargs['get_attr']))
2877         if res is None:
2878             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2879             if fatal:
2880                 raise ExtractorError(msg)
2881             else:
2882                 self._downloader.report_warning(msg)
2883         return res
2884
2885     def _float(self, v, name, fatal=False, **kwargs):
2886         res = float_or_none(v, **kwargs)
2887         if res is None:
2888             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2889             if fatal:
2890                 raise ExtractorError(msg)
2891             else:
2892                 self._downloader.report_warning(msg)
2893         return res
2894
2895     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
2896                     path='/', secure=False, discard=False, rest={}, **kwargs):
2897         cookie = compat_cookiejar_Cookie(
2898             0, name, value, port, port is not None, domain, True,
2899             domain.startswith('.'), path, True, secure, expire_time,
2900             discard, None, None, rest)
2901         self._downloader.cookiejar.set_cookie(cookie)
2902
2903     def _get_cookies(self, url):
2904         """ Return a compat_cookies_SimpleCookie with the cookies for the url """
2905         req = sanitized_Request(url)
2906         self._downloader.cookiejar.add_cookie_header(req)
2907         return compat_cookies_SimpleCookie(req.get_header('Cookie'))
2908
2909     def _apply_first_set_cookie_header(self, url_handle, cookie):
2910         """
2911         Apply first Set-Cookie header instead of the last. Experimental.
2912
2913         Some sites (e.g. [1-3]) may serve two cookies under the same name
2914         in Set-Cookie header and expect the first (old) one to be set rather
2915         than second (new). However, as of RFC6265 the newer one cookie
2916         should be set into cookie store what actually happens.
2917         We will workaround this issue by resetting the cookie to
2918         the first one manually.
2919         1. https://new.vk.com/
2920         2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
2921         3. https://learning.oreilly.com/
2922         """
2923         for header, cookies in url_handle.headers.items():
2924             if header.lower() != 'set-cookie':
2925                 continue
2926             if sys.version_info[0] >= 3:
2927                 cookies = cookies.encode('iso-8859-1')
2928             cookies = cookies.decode('utf-8')
2929             cookie_value = re.search(
2930                 r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
2931             if cookie_value:
2932                 value, domain = cookie_value.groups()
2933                 self._set_cookie(domain, cookie, value)
2934                 break
2935
2936     def get_testcases(self, include_onlymatching=False):
2937         t = getattr(self, '_TEST', None)
2938         if t:
2939             assert not hasattr(self, '_TESTS'), \
2940                 '%s has _TEST and _TESTS' % type(self).__name__
2941             tests = [t]
2942         else:
2943             tests = getattr(self, '_TESTS', [])
2944         for t in tests:
2945             if not include_onlymatching and t.get('only_matching', False):
2946                 continue
2947             t['name'] = type(self).__name__[:-len('IE')]
2948             yield t
2949
2950     def is_suitable(self, age_limit):
2951         """ Test whether the extractor is generally suitable for the given
2952         age limit (i.e. pornographic sites are not, all others usually are) """
2953
2954         any_restricted = False
2955         for tc in self.get_testcases(include_onlymatching=False):
2956             if tc.get('playlist', []):
2957                 tc = tc['playlist'][0]
2958             is_restricted = age_restricted(
2959                 tc.get('info_dict', {}).get('age_limit'), age_limit)
2960             if not is_restricted:
2961                 return True
2962             any_restricted = any_restricted or is_restricted
2963         return not any_restricted
2964
2965     def extract_subtitles(self, *args, **kwargs):
2966         if (self._downloader.params.get('writesubtitles', False)
2967                 or self._downloader.params.get('listsubtitles')):
2968             return self._get_subtitles(*args, **kwargs)
2969         return {}
2970
2971     def _get_subtitles(self, *args, **kwargs):
2972         raise NotImplementedError('This method must be implemented by subclasses')
2973
2974     @staticmethod
2975     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2976         """ Merge subtitle items for one language. Items with duplicated URLs
2977         will be dropped. """
2978         list1_urls = set([item['url'] for item in subtitle_list1])
2979         ret = list(subtitle_list1)
2980         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2981         return ret
2982
2983     @classmethod
2984     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
2985         """ Merge two subtitle dictionaries, language by language. """
2986         ret = dict(subtitle_dict1)
2987         for lang in subtitle_dict2:
2988             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2989         return ret
2990
2991     def extract_automatic_captions(self, *args, **kwargs):
2992         if (self._downloader.params.get('writeautomaticsub', False)
2993                 or self._downloader.params.get('listsubtitles')):
2994             return self._get_automatic_captions(*args, **kwargs)
2995         return {}
2996
2997     def _get_automatic_captions(self, *args, **kwargs):
2998         raise NotImplementedError('This method must be implemented by subclasses')
2999
3000     def mark_watched(self, *args, **kwargs):
3001         if (self._downloader.params.get('mark_watched', False)
3002                 and (self._get_login_info()[0] is not None
3003                      or self._downloader.params.get('cookiefile') is not None)):
3004             self._mark_watched(*args, **kwargs)
3005
3006     def _mark_watched(self, *args, **kwargs):
3007         raise NotImplementedError('This method must be implemented by subclasses')
3008
3009     def geo_verification_headers(self):
3010         headers = {}
3011         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
3012         if geo_verification_proxy:
3013             headers['Ytdl-request-proxy'] = geo_verification_proxy
3014         return headers
3015
3016     def _generic_id(self, url):
3017         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3018
3019     def _generic_title(self, url):
3020         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
3021
3022
3023 class SearchInfoExtractor(InfoExtractor):
3024     """
3025     Base class for paged search queries extractors.
3026     They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
3027     Instances should define _SEARCH_KEY and _MAX_RESULTS.
3028     """
3029
3030     @classmethod
3031     def _make_valid_url(cls):
3032         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
3033
3034     @classmethod
3035     def suitable(cls, url):
3036         return re.match(cls._make_valid_url(), url) is not None
3037
3038     def _real_extract(self, query):
3039         mobj = re.match(self._make_valid_url(), query)
3040         if mobj is None:
3041             raise ExtractorError('Invalid search query "%s"' % query)
3042
3043         prefix = mobj.group('prefix')
3044         query = mobj.group('query')
3045         if prefix == '':
3046             return self._get_n_results(query, 1)
3047         elif prefix == 'all':
3048             return self._get_n_results(query, self._MAX_RESULTS)
3049         else:
3050             n = int(prefix)
3051             if n <= 0:
3052                 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
3053             elif n > self._MAX_RESULTS:
3054                 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
3055                 n = self._MAX_RESULTS
3056             return self._get_n_results(query, n)
3057
3058     def _get_n_results(self, query, n):
3059         """Get a specified number of results for a query"""
3060         raise NotImplementedError('This method must be implemented by subclasses')
3061
3062     @property
3063     def SEARCH_KEY(self):
3064         return self._SEARCH_KEY