youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import unicode_literals
   5
   6 import base64
   7 import binascii
   8 import calendar
   9 import codecs
  10 import contextlib
  11 import ctypes
  12 import datetime
  13 import email.utils
  14 import errno
  15 import functools
  16 import gzip
  17 import io
  18 import itertools
  19 import json
  20 import locale
  21 import math
  22 import operator
  23 import os
  24 import pipes
  25 import platform
  26 import re
  27 import socket
  28 import ssl
  29 import subprocess
  30 import sys
  31 import tempfile
  32 import traceback
  33 import xml.etree.ElementTree
  34 import zlib
  35
  36 from .compat import (
  37     compat_HTMLParser,
  38     compat_basestring,
  39     compat_chr,
  40     compat_etree_fromstring,
  41     compat_html_entities,
  42     compat_html_entities_html5,
  43     compat_http_client,
  44     compat_kwargs,
  45     compat_parse_qs,
  46     compat_shlex_quote,
  47     compat_socket_create_connection,
  48     compat_str,
  49     compat_struct_pack,
  50     compat_struct_unpack,
  51     compat_urllib_error,
  52     compat_urllib_parse,
  53     compat_urllib_parse_urlencode,
  54     compat_urllib_parse_urlparse,
  55     compat_urllib_parse_unquote_plus,
  56     compat_urllib_request,
  57     compat_urlparse,
  58     compat_xpath,
  59 )
  60
  61 from .socks import (
  62     ProxyType,
  63     sockssocket,
  64 )
  65
  66
  67 def register_socks_protocols():
  68     # "Register" SOCKS protocols
  69     # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
  70     # URLs with protocols not in urlparse.uses_netloc are not handled correctly
  71     for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
  72         if scheme not in compat_urlparse.uses_netloc:
  73             compat_urlparse.uses_netloc.append(scheme)
  74
  75
# This is not clearly defined otherwise
# (the type object of a compiled regular expression, obtained by compiling
# an empty pattern)
compiled_regex_type = type(re.compile(''))

# Default HTTP request headers. YoutubeDLHandler.http_request adds each of
# these to a request that does not already carry the header.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
  86
  87
# Unique sentinel meaning "no default value was supplied"; compared by
# identity (see the `default` parameters of the xpath_* helpers), which
# lets None itself be passed as a legitimate default.
NO_DEFAULT = object()

# Full English month names, January first.
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names keyed by two-letter language code, each list ordered
# January..December.
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}
 100
# File extensions (without the leading dot) that are recognised as media or
# manifest formats.
# NOTE(review): the consumers of this tuple are not visible in this part of
# the file - confirm the exact usage there.
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
# Maps each accented character to its ASCII replacement. The replacements are
# produced positionally by the chain below; the list items ('AE', 'OE', 'ss',
# 'ae', 'oe') are the multi-character replacements.
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
 120
# Date/time layouts written with %-style directives (%Y, %m, %d, ...).
# These are the layouts shared by both the day-first and the month-first
# lists defined below.
# NOTE(review): presumably fed to strptime by a date-parsing helper that is
# not visible in this part of the file - confirm.
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%b %d %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
)

# The shared layouts plus numeric layouts that put the day before the month.
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

# The shared layouts plus numeric layouts that put the month before the day.
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])
 164
 165
 166 def preferredencoding():
 167     """Get preferred encoding.
 168
 169     Returns the best encoding scheme for the system, based on
 170     locale.getpreferredencoding() and some further tweaks.
 171     """
 172     try:
 173         pref = locale.getpreferredencoding()
 174         'TEST'.encode(pref)
 175     except Exception:
 176         pref = 'UTF-8'
 177
 178     return pref
 179
 180
 181 def write_json_file(obj, fn):
 182     """ Encode obj as JSON and write it to fn, atomically if possible """
 183
 184     fn = encodeFilename(fn)
 185     if sys.version_info < (3, 0) and sys.platform != 'win32':
 186         encoding = get_filesystem_encoding()
 187         # os.path.basename returns a bytes object, but NamedTemporaryFile
 188         # will fail if the filename contains non ascii characters unless we
 189         # use a unicode object
 190         path_basename = lambda f: os.path.basename(fn).decode(encoding)
 191         # the same for os.path.dirname
 192         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
 193     else:
 194         path_basename = os.path.basename
 195         path_dirname = os.path.dirname
 196
 197     args = {
 198         'suffix': '.tmp',
 199         'prefix': path_basename(fn) + '.',
 200         'dir': path_dirname(fn),
 201         'delete': False,
 202     }
 203
 204     # In Python 2.x, json.dump expects a bytestream.
 205     # In Python 3.x, it writes to a character stream
 206     if sys.version_info < (3, 0):
 207         args['mode'] = 'wb'
 208     else:
 209         args.update({
 210             'mode': 'w',
 211             'encoding': 'utf-8',
 212         })
 213
 214     tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
 215
 216     try:
 217         with tf:
 218             json.dump(obj, tf)
 219         if sys.platform == 'win32':
 220             # Need to remove existing file on Windows, else os.rename raises
 221             # WindowsError or FileExistsError.
 222             try:
 223                 os.unlink(fn)
 224             except OSError:
 225                 pass
 226         os.rename(tf.name, fn)
 227     except Exception:
 228         try:
 229             os.remove(tf.name)
 230         except OSError:
 231             pass
 232         raise
 233
 234
 235 if sys.version_info >= (2, 7):
 236     def find_xpath_attr(node, xpath, key, val=None):
 237         """ Find the xpath xpath[@key=val] """
 238         assert re.match(r'^[a-zA-Z_-]+$', key)
 239         expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
 240         return node.find(expr)
 241 else:
 242     def find_xpath_attr(node, xpath, key, val=None):
 243         for f in node.findall(compat_xpath(xpath)):
 244             if key not in f.attrib:
 245                 continue
 246             if val is None or f.attrib.get(key) == val:
 247                 return f
 248         return None
 249
 250 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 251 # the namespace parameter
 252
 253
 254 def xpath_with_ns(path, ns_map):
 255     components = [c.split(':') for c in path.split('/')]
 256     replaced = []
 257     for c in components:
 258         if len(c) == 1:
 259             replaced.append(c[0])
 260         else:
 261             ns, tag = c
 262             replaced.append('{%s}%s' % (ns_map[ns], tag))
 263     return '/'.join(replaced)
 264
 265
 266 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 267     def _find_xpath(xpath):
 268         return node.find(compat_xpath(xpath))
 269
 270     if isinstance(xpath, (str, compat_str)):
 271         n = _find_xpath(xpath)
 272     else:
 273         for xp in xpath:
 274             n = _find_xpath(xp)
 275             if n is not None:
 276                 break
 277
 278     if n is None:
 279         if default is not NO_DEFAULT:
 280             return default
 281         elif fatal:
 282             name = xpath if name is None else name
 283             raise ExtractorError('Could not find XML element %s' % name)
 284         else:
 285             return None
 286     return n
 287
 288
 289 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 290     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 291     if n is None or n == default:
 292         return n
 293     if n.text is None:
 294         if default is not NO_DEFAULT:
 295             return default
 296         elif fatal:
 297             name = xpath if name is None else name
 298             raise ExtractorError('Could not find XML element\'s text %s' % name)
 299         else:
 300             return None
 301     return n.text
 302
 303
 304 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 305     n = find_xpath_attr(node, xpath, key)
 306     if n is None:
 307         if default is not NO_DEFAULT:
 308             return default
 309         elif fatal:
 310             name = '%s[@%s]' % (xpath, key) if name is None else name
 311             raise ExtractorError('Could not find XML attribute %s' % name)
 312         else:
 313             return None
 314     return n.attrib[key]
 315
 316
 317 def get_element_by_id(id, html):
 318     """Return the content of the tag with the specified ID in the passed HTML document"""
 319     return get_element_by_attribute('id', id, html)
 320
 321
 322 def get_element_by_class(class_name, html):
 323     return get_element_by_attribute(
 324         'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
 325         html, escape_value=False)
 326
 327
 328 def get_element_by_attribute(attribute, value, html, escape_value=True):
 329     """Return the content of the tag with the specified attribute in the passed HTML document"""
 330
 331     value = re.escape(value) if escape_value else value
 332
 333     m = re.search(r'''(?xs)
 334         <([a-zA-Z0-9:._-]+)
 335          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
 336          \s+%s=['"]?%s['"]?
 337          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
 338         \s*>
 339         (?P<content>.*?)
 340         </\1>
 341     ''' % (re.escape(attribute), value), html)
 342
 343     if not m:
 344         return None
 345     res = m.group('content')
 346
 347     if res.startswith('"') or res.startswith("'"):
 348         res = res[1:-1]
 349
 350     return unescapeHTML(res)
 351
 352
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element

    After feeding markup, `attrs` holds the attributes of the most recently
    seen start tag as a dict (empty if no start tag was seen).
    """
    def __init__(self):
        # Attributes of the last start tag handled so far
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        # attrs arrives as a sequence of (name, value) pairs; each start tag
        # replaces whatever was collected before
        self.attrs = dict(attrs)
 361
 362
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&#98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    parser.feed(html_element)
    parser.close()
    # The parser keeps the attributes of the last start tag it saw
    return parser.attrs
 383
 384
 385 def clean_html(html):
 386     """Clean an HTML snippet into a readable string"""
 387
 388     if html is None:  # Convenience for sanitizing descriptions etc.
 389         return html
 390
 391     # Newline vs <br />
 392     html = html.replace('\n', ' ')
 393     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 394     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 395     # Strip html tags
 396     html = re.sub('<.*?>', '', html)
 397     # Replace html entities
 398     html = unescapeHTML(html)
 399     return html.strip()
 400
 401
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    The special filename '-' selects standard output (its binary buffer
    when one is available) instead of opening a file.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                # Switch stdout to binary mode on Windows
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A permission error is not retried with another name
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            # Sanitizing changed nothing, so a retry would fail the same way
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
 432
 433
 434 def timeconvert(timestr):
 435     """Convert RFC 2822 defined time string into system timestamp"""
 436     timestamp = None
 437     timetuple = email.utils.parsedate_tz(timestr)
 438     if timetuple is not None:
 439         timestamp = email.utils.mktime_tz(timetuple)
 440     return timestamp
 441
 442
 443 def sanitize_filename(s, restricted=False, is_id=False):
 444     """Sanitizes a string so it could be used as part of a filename.
 445     If restricted is set, use a stricter subset of allowed characters.
 446     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 447     """
 448     def replace_insane(char):
 449         if restricted and char in ACCENT_CHARS:
 450             return ACCENT_CHARS[char]
 451         if char == '?' or ord(char) < 32 or ord(char) == 127:
 452             return ''
 453         elif char == '"':
 454             return '' if restricted else '\''
 455         elif char == ':':
 456             return '_-' if restricted else ' -'
 457         elif char in '\\/|*<>':
 458             return '_'
 459         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 460             return '_'
 461         if restricted and ord(char) > 127:
 462             return '_'
 463         return char
 464
 465     # Handle timestamps
 466     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 467     result = ''.join(map(replace_insane, s))
 468     if not is_id:
 469         while '__' in result:
 470             result = result.replace('__', '_')
 471         result = result.strip('_')
 472         # Common case of "Foreign band name - English song title"
 473         if restricted and result.startswith('-_'):
 474             result = result[2:]
 475         if result.startswith('-'):
 476             result = '_' + result[len('-'):]
 477         result = result.lstrip('.')
 478         if not result:
 479             result = '_'
 480     return result
 481
 482
 483 def sanitize_path(s):
 484     """Sanitizes and normalizes path on Windows"""
 485     if sys.platform != 'win32':
 486         return s
 487     drive_or_unc, _ = os.path.splitdrive(s)
 488     if sys.version_info < (2, 7) and not drive_or_unc:
 489         drive_or_unc, _ = os.path.splitunc(s)
 490     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 491     if drive_or_unc:
 492         norm_path.pop(0)
 493     sanitized_path = [
 494         path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
 495         for path_part in norm_path]
 496     if drive_or_unc:
 497         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 498     return os.path.join(*sanitized_path)
 499
 500
 501 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
 502 # unwanted failures due to missing protocol
 503 def sanitize_url(url):
 504     return 'http:%s' % url if url.startswith('//') else url
 505
 506
 507 def sanitized_Request(url, *args, **kwargs):
 508     return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
 509
 510
 511 def orderedSet(iterable):
 512     """ Remove all duplicates from the input iterable """
 513     res = []
 514     for el in iterable:
 515         if el not in res:
 516             res.append(el)
 517     return res
 518
 519
 520 def _htmlentity_transform(entity_with_semicolon):
 521     """Transforms an HTML entity to a character."""
 522     entity = entity_with_semicolon[:-1]
 523
 524     # Known non-numeric HTML entity
 525     if entity in compat_html_entities.name2codepoint:
 526         return compat_chr(compat_html_entities.name2codepoint[entity])
 527
 528     # TODO: HTML5 allows entities without a semicolon. For example,
 529     # '&Eacuteric' should be decoded as 'Éric'.
 530     if entity_with_semicolon in compat_html_entities_html5:
 531         return compat_html_entities_html5[entity_with_semicolon]
 532
 533     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 534     if mobj is not None:
 535         numstr = mobj.group(1)
 536         if numstr.startswith('x'):
 537             base = 16
 538             numstr = '0%s' % numstr
 539         else:
 540             base = 10
 541         # See https://github.com/rg3/youtube-dl/issues/7518
 542         try:
 543             return compat_chr(int(numstr, base))
 544         except ValueError:
 545             pass
 546
 547     # Unknown entity in name, return its literal representation
 548     return '&%s;' % entity
 549
 550
 551 def unescapeHTML(s):
 552     if s is None:
 553         return None
 554     assert type(s) == compat_str
 555
 556     return re.sub(
 557         r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 558
 559
 560 def get_subprocess_encoding():
 561     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 562         # For subprocess calls, encode with locale encoding
 563         # Refer to http://stackoverflow.com/a/9951851/35070
 564         encoding = preferredencoding()
 565     else:
 566         encoding = sys.getfilesystemencoding()
 567     if encoding is None:
 568         encoding = 'utf-8'
 569     return encoding
 570
 571
 572 def encodeFilename(s, for_subprocess=False):
 573     """
 574     @param s The name of the file
 575     """
 576
 577     assert type(s) == compat_str
 578
 579     # Python 3 has a Unicode API
 580     if sys.version_info >= (3, 0):
 581         return s
 582
 583     # Pass '' directly to use Unicode APIs on Windows 2000 and up
 584     # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 585     # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 586     if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 587         return s
 588
 589     # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
 590     if sys.platform.startswith('java'):
 591         return s
 592
 593     return s.encode(get_subprocess_encoding(), 'ignore')
 594
 595
 596 def decodeFilename(b, for_subprocess=False):
 597
 598     if sys.version_info >= (3, 0):
 599         return b
 600
 601     if not isinstance(b, bytes):
 602         return b
 603
 604     return b.decode(get_subprocess_encoding(), 'ignore')
 605
 606
 607 def encodeArgument(s):
 608     if not isinstance(s, compat_str):
 609         # Legacy code that uses byte strings
 610         # Uncomment the following line after fixing all post processors
 611         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 612         s = s.decode('ascii')
 613     return encodeFilename(s, True)
 614
 615
 616 def decodeArgument(b):
 617     return decodeFilename(b, True)
 618
 619
 620 def decodeOption(optval):
 621     if optval is None:
 622         return optval
 623     if isinstance(optval, bytes):
 624         optval = optval.decode(preferredencoding())
 625
 626     assert isinstance(optval, compat_str)
 627     return optval
 628
 629
 630 def formatSeconds(secs):
 631     if secs > 3600:
 632         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 633     elif secs > 60:
 634         return '%d:%02d' % (secs // 60, secs % 60)
 635     else:
 636         return '%d' % secs
 637
 638
def make_HTTPS_handler(params, **kwargs):
    """Create a YoutubeDLHTTPSHandler suited to the running Python version.

    params is the options dict; its 'nocheckcertificate' entry (default
    False) disables certificate verification. Extra keyword arguments are
    forwarded to YoutubeDLHTTPSHandler.
    """
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            # check_hostname is switched off before verify_mode is relaxed
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            # Fall through to the version-based construction below
            pass

    if sys.version_info < (3, 2):
        # No SSL context is passed on these versions
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 662
 663
 664 def bug_reports_message():
 665     if ytdl_is_updateable():
 666         update_cmd = 'type  youtube-dl -U  to update'
 667     else:
 668         update_cmd = 'see  https://yt-dl.org/update  on how to update'
 669     msg = '; please report this issue on https://yt-dl.org/bug .'
 670     msg += ' Make sure you are using the latest version; %s.' % update_cmd
 671     msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 672     return msg
 673
 674
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        cause, if given, is appended to the message as ' (caused by %r)'.
        video_id, if given, is prepended to the message as '<video_id>: '.
        """

        # When constructed while one of these exceptions is being handled,
        # the error is treated as expected regardless of the argument
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            # Unexpected errors carry the bug reporting instructions
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as a string, or None if there is none."""
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))
 702
 703
 704 class UnsupportedError(ExtractorError):
 705     def __init__(self, url):
 706         super(UnsupportedError, self).__init__(
 707             'Unsupported URL: %s' % url, expected=True)
 708         self.url = url
 709
 710
 711 class RegexNotFoundError(ExtractorError):
 712     """Error when a regex didn't match"""
 713     pass
 714
 715
 716 class DownloadError(Exception):
 717     """Download Error exception.
 718
 719     This exception may be thrown by FileDownloader objects if they are not
 720     configured to continue on errors. They will contain the appropriate
 721     error message.
 722     """
 723
 724     def __init__(self, msg, exc_info=None):
 725         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 726         super(DownloadError, self).__init__(msg)
 727         self.exc_info = exc_info
 728
 729
 730 class SameFileError(Exception):
 731     """Same File exception.
 732
 733     This exception will be thrown by FileDownloader objects if they detect
 734     multiple files would have to be downloaded to the same file on disk.
 735     """
 736     pass
 737
 738
 739 class PostProcessingError(Exception):
 740     """Post Processing exception.
 741
 742     This exception may be raised by PostProcessor's .run() method to
 743     indicate an error in the postprocessing task.
 744     """
 745
 746     def __init__(self, msg):
 747         self.msg = msg
 748
 749
 750 class MaxDownloadsReached(Exception):
 751     """ --max-downloads limit has been reached. """
 752     pass
 753
 754
 755 class UnavailableVideoError(Exception):
 756     """Unavailable Format exception.
 757
 758     This exception will be thrown when a video is requested
 759     in a format that is not available for that video.
 760     """
 761     pass
 762
 763
 764 class ContentTooShortError(Exception):
 765     """Content Too Short exception.
 766
 767     This exception may be raised by FileDownloader objects when a file they
 768     download is too small for what the server announced first, indicating
 769     the connection was probably interrupted.
 770     """
 771
 772     def __init__(self, downloaded, expected):
 773         # Both in bytes
 774         self.downloaded = downloaded
 775         self.expected = expected
 776
 777
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate http_class and bind it to the configured source address.

    ydl_handler is the handler whose _params dict is consulted for
    'source_address'; http_class is the connection class to instantiate
    with *args/**kwargs; is_https selects TLS wrapping of the socket in the
    Python 2.6 code path. Returns the connection object.
    """
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        # Port 0 is used together with the requested local address
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            # The connection object has no source_address attribute here, so
            # its connect() is replaced by one that creates the socket with
            # the source address itself
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc
 803
 804
 805 def handle_youtubedl_headers(headers):
 806     filtered_headers = headers
 807
 808     if 'Youtubedl-no-compression' in filtered_headers:
 809         filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
 810         del filtered_headers['Youtubedl-no-compression']
 811
 812     return filtered_headers
 813
 814
 815 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 816     """Handler for HTTP requests and responses.
 817
 818     This class, when installed with an OpenerDirector, automatically adds
 819     the standard headers to every HTTP request and handles gzipped and
 820     deflated responses from web servers. If compression is to be avoided in
 821     a particular request, the original request in the program code only has
 822     to include the HTTP header "Youtubedl-no-compression", which will be
 823     removed before making the real request.
 824
 825     Part of this code was copied from:
 826
 827     http://techknack.net/python-urllib2-handlers/
 828
 829     Andrew Rowls, the author of that code, agreed to release it to the
 830     public domain.
 831     """
 832
    def __init__(self, params, *args, **kwargs):
        """Store params as self._params; the remaining arguments go to HTTPHandler.__init__."""
        # The base class initializer is called explicitly on the class
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params
 836
 837     def http_open(self, req):
 838         conn_class = compat_http_client.HTTPConnection
 839
 840         socks_proxy = req.headers.get('Ytdl-socks-proxy')
 841         if socks_proxy:
 842             conn_class = make_socks_conn_class(conn_class, socks_proxy)
 843             del req.headers['Ytdl-socks-proxy']
 844
 845         return self.do_open(functools.partial(
 846             _create_http_connection, self, conn_class, False),
 847             req)
 848
 849     @staticmethod
 850     def deflate(data):
 851         try:
 852             return zlib.decompress(data, -zlib.MAX_WBITS)
 853         except zlib.error:
 854             return zlib.decompress(data)
 855
 856     @staticmethod
 857     def addinfourl_wrapper(stream, headers, url, code):
 858         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 859             return compat_urllib_request.addinfourl(stream, headers, url, code)
 860         ret = compat_urllib_request.addinfourl(stream, headers, url)
 861         ret.code = code
 862         return ret
 863
    def http_request(self, req):
        """Prepare an outgoing request and return it.

        The URL is percent-escaped, the entries of std_headers are added
        where the request does not already have them, and the headers are
        passed through handle_youtubedl_headers(). The returned request may
        be a new object when the URL had to be changed.
        """
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        # Honour and strip the internal 'Youtubedl-no-compression' marker
        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            # Cut the fragment off the name-mangled private attributes
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req
 894
    def http_response(self, req, resp):
        """Post-process a response and return the (possibly replaced) response.

        A gzip- or deflate-encoded body is decompressed and wrapped in a new
        response object built with addinfourl_wrapper(), with the
        Content-encoding header removed. For 3xx responses the Location
        header is percent-encoded when escape_url() changes it.
        """
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off; the first length
                # that decompresses wins
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    # No truncation helped: report the original failure
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    # On Python 2 the header is stored back as UTF-8 bytes
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp
 941
    # HTTPS traffic goes through exactly the same request/response processing
    https_request = http_request
    https_response = http_response
 944
 945
 946 def make_socks_conn_class(base_class, socks_proxy):
 947     assert issubclass(base_class, (
 948         compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
 949
 950     url_components = compat_urlparse.urlparse(socks_proxy)
 951     if url_components.scheme.lower() == 'socks5':
 952         socks_type = ProxyType.SOCKS5
 953     elif url_components.scheme.lower() in ('socks', 'socks4'):
 954         socks_type = ProxyType.SOCKS4
 955     elif url_components.scheme.lower() == 'socks4a':
 956         socks_type = ProxyType.SOCKS4A
 957
 958     def unquote_if_non_empty(s):
 959         if not s:
 960             return s
 961         return compat_urllib_parse_unquote_plus(s)
 962
 963     proxy_args = (
 964         socks_type,
 965         url_components.hostname, url_components.port or 1080,
 966         True,  # Remote DNS
 967         unquote_if_non_empty(url_components.username),
 968         unquote_if_non_empty(url_components.password),
 969     )
 970
 971     class SocksConnection(base_class):
 972         def connect(self):
 973             self.sock = sockssocket()
 974             self.sock.setproxy(*proxy_args)
 975             if type(self.timeout) in (int, float):
 976                 self.sock.settimeout(self.timeout)
 977             self.sock.connect((self.host, self.port))
 978
 979             if isinstance(self, compat_http_client.HTTPSConnection):
 980                 if hasattr(self, '_context'):  # Python > 2.6
 981                     self.sock = self._context.wrap_socket(
 982                         self.sock, server_hostname=self.host)
 983                 else:
 984                     self.sock = ssl.wrap_socket(self.sock)
 985
 986     return SocksConnection
 987
 988
 989 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
 990     def __init__(self, params, https_conn_class=None, *args, **kwargs):
 991         compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
 992         self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
 993         self._params = params
 994
 995     def https_open(self, req):
 996         kwargs = {}
 997         conn_class = self._https_conn_class
 998
 999         if hasattr(self, '_context'):  # python > 2.6
1000             kwargs['context'] = self._context
1001         if hasattr(self, '_check_hostname'):  # python 3.x
1002             kwargs['check_hostname'] = self._check_hostname
1003
1004         socks_proxy = req.headers.get('Ytdl-socks-proxy')
1005         if socks_proxy:
1006             conn_class = make_socks_conn_class(conn_class, socks_proxy)
1007             del req.headers['Ytdl-socks-proxy']
1008
1009         return self.do_open(functools.partial(
1010             _create_http_connection, self, conn_class, True),
1011             req, **kwargs)
1012
1013
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that currently delegates everything to HTTPCookieProcessor.

    The Set-Cookie percent-encoding workaround described in http_response()
    is commented out, so responses are passed to the base class unchanged.
    """

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    # HTTPS uses the same cookie handling as HTTP
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1036
1037
def extract_timezone(date_str):
    """Split a trailing UTC offset off a date string.

    Returns a (timezone, date_str) tuple. timezone is a datetime.timedelta
    (zero when there is no offset or the suffix is 'Z') and date_str is the
    input with the recognized suffix removed.
    """
    tz_match = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if not tz_match:
        return datetime.timedelta(), date_str

    date_str = date_str[:-len(tz_match.group('tz'))]
    sign_str = tz_match.group('sign')
    if not sign_str:
        # A bare 'Z' suffix means UTC
        return datetime.timedelta(), date_str

    multiplier = 1 if sign_str == '+' else -1
    offset = datetime.timedelta(
        hours=multiplier * int(tz_match.group('hours')),
        minutes=multiplier * int(tz_match.group('minutes')))
    return offset, date_str
1054
1055
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    delimiter separates the date from the time. timezone is a
    datetime.timedelta offset to subtract; when it is None the offset is
    extracted from date_str. Returns None for None or unparsable input.
    """
    if date_str is None:
        return None

    # Fractional seconds are dropped
    stripped = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, stripped = extract_timezone(stripped)

    try:
        fmt = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        utc_dt = datetime.datetime.strptime(stripped, fmt) - timezone
        return calendar.timegm(utc_dt.timetuple())
    except ValueError:
        return None
1073
1074
def date_formats(day_first=True):
    """Return the day-first or the month-first collection of date formats."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1077
1078
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    Returns None when date_str is None or cannot be parsed.
    """
    if date_str is None:
        return None

    # Commas become spaces
    cleaned = date_str.replace(',', ' ')
    # Drop AM/PM together with a following timezone name
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', cleaned)
    cleaned = extract_timezone(cleaned)[1]

    result = None
    # Every format is tried; the last one that parses determines the result
    for fmt in date_formats(day_first):
        try:
            result = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if result is None:
        # Fall back to the email date parser
        timetuple = email.utils.parsedate_tz(cleaned)
        if timetuple:
            try:
                result = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    return None if result is None else compat_str(result)
1105
1106
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from date_str, or None.

    The known date formats are tried first, then the email date parser.
    Twelve hours are added whenever the string contains "PM".
    """
    if date_str is None:
        return None

    text = date_str.replace(',', ' ')

    pm_hours = 0
    if re.search(r'(?i)PM', text):
        pm_hours = 12
    pm_offset = datetime.timedelta(hours=pm_hours)
    timezone, text = extract_timezone(text)

    # Remove AM/PM + timezone
    text = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', text)

    for fmt in date_formats(day_first):
        try:
            moment = datetime.datetime.strptime(text, fmt) - timezone + pm_offset
            return calendar.timegm(moment.timetuple())
        except ValueError:
            continue

    timetuple = email.utils.parsedate_tz(text)
    if timetuple:
        return calendar.timegm(timetuple) + pm_hours * 3600
1128
1129
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from url, returning default_ext on failure."""
    if url is None:
        return default_ext

    # Text after the last dot of the part preceding the query string
    without_query = url.partition('?')[0]
    candidate = without_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
1141
1142
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with its last extension replaced by '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return '.'.join((stem, sub_lang, sub_format))
1145
1146
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD,
    'now', 'today', 'yesterday' or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months and years are approximated as 30 and 365 days respectively.
    Raises ValueError (from strptime) when the string matches none of the
    supported forms."""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # Raw string: '\d' in a normal string literal is an invalid escape sequence
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        amount = int(match.group('time'))
        if match.group('sign') == '-':
            amount = -amount
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            amount *= 30
        elif unit == 'year':
            unit = 'day'
            amount *= 365
        # timedelta takes plural keyword names (days, weeks)
        unit += 's'
        delta = datetime.timedelta(**{unit: amount})
        return today + delta
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()
1174
1175
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.
    Strings in any other format are returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
1184
1185
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by
        date_from_str; a missing bound defaults to the earliest/latest
        representable date. Raises ValueError if start is after end."""
        self.start = (
            datetime.datetime.min.date() if start is None
            else date_from_str(start))
        self.end = (
            datetime.datetime.max.date() if end is None
            else date_from_str(end))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        # Anything that is not already a date is parsed first
        day = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= day <= self.end

    def __str__(self):
        return ' - '.join((self.start.isoformat(), self.end.isoformat()))
1215
1216
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Byte strings are decoded with the preferred encoding
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
1225
1226
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    The string is written with the Win32 WriteConsoleW call when out is
    file descriptor 1 or 2 and is attached to a real console. Raises
    OSError if WriteConsoleW reports failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Maps file descriptors to the ids expected by GetStdHandle
    # (-11 is STD_OUTPUT_HANDLE, -12 is STD_ERROR_HANDLE)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    # Receives the number of characters written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True for invalid handles, for handles whose file type (ignoring the
        # remote flag) is not a character device, and when GetConsoleMode fails
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if there is none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters at a time, stopping before any
        # non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character, which is
        # written on its own as 2 units
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
1300
1301
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it.

    On Windows, when no encoding is given, the console API is tried first.
    Otherwise the text is encoded for binary streams (and always on
    Python 2), written to out.buffer when that exists, or written as-is.
    """
    stream = sys.stderr if out is None else out
    assert type(s) == compat_str

    try_console = (
        sys.platform == 'win32' and encoding is None and
        hasattr(stream, 'fileno'))
    if try_console and _windows_write_string(s, stream):
        return

    # Python 2 lies about mode of sys.stderr
    wants_bytes = 'b' in getattr(stream, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        stream.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(stream, 'buffer'):
        target_encoding = (
            encoding or getattr(stream, 'encoding', None) or
            preferredencoding())
        stream.buffer.write(s.encode(target_encoding, 'ignore'))
    else:
        stream.write(s)
    stream.flush()
1322
1323
def bytes_to_intlist(bs):
    """Convert a byte string to a list of integer byte values."""
    if not bs:
        return []
    if not isinstance(bs[0], int):
        # Python 2: indexing yields one-character strings
        return [ord(char) for char in bs]
    # Python 3: indexing already yields ints
    return list(bs)
1331
1332
def intlist_to_bytes(xs):
    """Pack a sequence of integer byte values into a byte string."""
    if xs:
        return compat_struct_pack('%dB' % len(xs), *xs)
    return b''
1337
1338
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) using LockFileEx on
# Windows, fcntl.flock where fcntl is importable, and stubs that raise
# IOError everywhere else.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # ctypes layout of the structure handed to LockFileEx/UnlockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low and high halves of the byte count passed to (Un)LockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock f with LockFileEx; raises OSError on failure."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object so _unlock_file can pass the same structure
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # Flag 0x2 is LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            """Take an exclusive or shared flock on f."""
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            """Release the flock held on f."""
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            """Always raises IOError: locking is unavailable here."""
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            """Always raises IOError: locking is unavailable here."""
            raise IOError(UNSUPPORTED_MSG)
1412
1413
class locked_file(object):
    """Text file wrapper that holds a file lock while used as a context manager.

    The file is opened on construction. Entering the context takes a shared
    lock for mode 'r' and an exclusive lock for 'a' and 'w'; leaving it
    unlocks and closes the file.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ('r', 'a', 'w')
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        # Readers share the lock, writers need it exclusively
        needs_exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, needs_exclusive)
        except IOError:
            # Do not leak the open file when locking fails
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1443
1444
def get_filesystem_encoding():
    """Return the filesystem encoding, or 'utf-8' when Python reports none."""
    reported = sys.getfilesystemencoding()
    if reported is None:
        return 'utf-8'
    return reported
1448
1449
def shell_quote(args):
    """Return the arguments in args as one shell-quoted, space-separated string.

    Byte string arguments are decoded with the filesystem encoding first.
    """
    # pipes.quote is undocumented and the pipes module is deprecated (removed
    # in Python 3.13); shlex.quote is the documented equivalent. Python 2 has
    # no shlex.quote, so fall back to pipes there.
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote

    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(get_filesystem_encoding())
        quoted_args.append(quote(a))
    return ' '.join(quoted_args)
1459
1460
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    Data already smuggled into url is merged into data (data is updated in
    place) and the result is appended to the URL as a fragment. """

    url, already_smuggled = unsmuggle_url(url, {})
    data.update(already_smuggled)
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    return '#'.join((url, compat_urllib_parse_urlencode(payload)))
1469
1470
def unsmuggle_url(smug_url, default=None):
    """Return a (url, data) tuple for a URL produced by smuggle_url.

    URLs without smuggled data are returned unchanged together with default.
    """
    if '#__youtubedl_smuggle' in smug_url:
        url, _, fragment = smug_url.rpartition('#')
        encoded = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
        return url, json.loads(encoded)
    return smug_url, default
1478
1479
def format_bytes(bytes):
    """Format a byte count as a string with a binary unit suffix, e.g. '1.50KiB'.

    Returns 'N/A' for None. A str argument is converted to float first.
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    # log() is undefined for zero, which is simply shown in bytes
    exponent = 0 if bytes == 0.0 else int(math.log(bytes, 1024.0))
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    scaled = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (scaled, suffixes[exponent])
1492
1493
def lookup_unit_table(unit_table, s):
    """Parse a leading '<number> <unit>' from s and scale it by the unit.

    unit_table maps unit names to multipliers. A comma in the number is
    treated as the decimal separator. Returns the product truncated to int,
    or None if s does not start with a number followed by a known unit.
    """
    alternatives = [re.escape(unit) for unit in unit_table]
    pattern = re.compile(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % '|'.join(alternatives))
    found = pattern.match(s)
    if found is None:
        return None
    number = float(found.group('num').replace(',', '.'))
    return int(number * unit_table[found.group('unit')])
1503
1504
def parse_filesize(s):
    """Parse a file size string with a unit into a number of bytes.

    Returns None when s is None; otherwise returns the result of
    lookup_unit_table() for the unit table below.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    # In this table 'XiB', the lower-case-prefix 'xB' and the '...bibytes'
    # names are powers of 1024; 'XB', 'Xb', 'xb' and the '...bytes' names
    # are powers of 1000.
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'kb': 1000,
        'kilobytes': 1000,
        'kibibytes': 1024,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'mb': 1000 ** 2,
        'megabytes': 1000 ** 2,
        'mebibytes': 1024 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'gb': 1000 ** 3,
        'gigabytes': 1000 ** 3,
        'gibibytes': 1024 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'tb': 1000 ** 4,
        'terabytes': 1000 ** 4,
        'tebibytes': 1024 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'pb': 1000 ** 5,
        'petabytes': 1000 ** 5,
        'pebibytes': 1024 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'eb': 1000 ** 6,
        'exabytes': 1000 ** 6,
        'exbibytes': 1024 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'zb': 1000 ** 7,
        'zettabytes': 1000 ** 7,
        'zebibytes': 1024 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
        'yb': 1000 ** 8,
        'yottabytes': 1000 ** 8,
        'yobibytes': 1024 ** 8,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
1574
1575
def parse_count(s):
    """Parse a count such as '1,234' or '1.5k' into an int.

    Returns None when s is None. Strings made only of digits, commas and
    dots go through str_to_int(); anything else through lookup_unit_table().
    """
    if s is None:
        return None

    text = s.strip()

    if re.match(r'^[\d,.]+$', text):
        return str_to_int(text)

    multipliers = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
    }
    return lookup_unit_table(multipliers, text)
1595
1596
def month_by_name(name, lang='en'):
    """ Return the number of a month from its name in the given language.

    The English names are used when lang has no entry in MONTH_NAMES.
    Returns None for an unknown name. """

    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])

    try:
        position = month_names.index(name)
    except ValueError:
        return None
    return position + 1
1606
1607
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations, i.e. the first three letters of the month name.
        Returns None for an unknown abbreviation. """

    for number, month in enumerate(ENGLISH_MONTH_NAMES, 1):
        if month[:3] == abbrev:
            return number
    return None
1616
1617
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start a known entity or a numeric
    character reference by '&amp;' in XML"""
    bare_ampersand = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return bare_ampersand.sub('&amp;', xml_str)
1624
1625
def setproctitle(title):
    """Set the process name to title via prctl(PR_SET_NAME) from libc.so.6.

    Does nothing on Jython, when libc.so.6 cannot be loaded, or when the
    loaded library has no prctl.
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes makes ctypes allocate one extra
    # byte for the terminating NUL that prctl expects. A buffer of exactly
    # len(title_bytes) bytes would leave the string unterminated.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
1645
1646
def remove_start(s, start):
    """Return s without the prefix start; s is returned unchanged if it is
    None or does not begin with start."""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
1649
1650
def remove_end(s, end):
    """Return s without the suffix end; s is returned unchanged if it is
    None or does not finish with end."""
    if s is not None and s.endswith(end):
        # s[:-len(end)] would yield '' for an empty end because -0 == 0,
        # so the cut position is computed from the front instead
        return s[:len(s) - len(end)]
    return s
1653
1654
def remove_quotes(s):
    """Strip one pair of matching single or double quotes surrounding s.

    None, strings shorter than two characters and strings that are not
    enclosed in matching quotes are returned unchanged.
    """
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
1662
1663
def url_basename(url):
    """Return the last component of the URL's path."""
    trimmed_path = compat_urlparse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
1667
1668
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always HEAD."""

    def get_method(self):
        return 'HEAD'
1672
1673
class PUTRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always PUT."""

    def get_method(self):
        return 'PUT'
1677
1678
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to int, returning default when that is not possible.

    If get_attr is given, the attribute of that name is read from v first.
    The result is int(v) * invscale // scale. None, the empty string and
    values int() cannot convert all yield default.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        # TypeError: unsupported types such as list or dict
        # OverflowError: float infinity
        return default
1691
1692
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1695
1696
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    Commas, dots and plus signs are removed before conversion. """
    if int_str is None:
        return None
    return int(re.sub(r'[,\.\+]', '', int_str))
1703
1704
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to float, returning default when that is not possible.

    The result is float(v) * invscale / scale. None and values float()
    cannot convert yield default.
    """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError, OverflowError):
        # TypeError: unsupported types such as list or dict
        # OverflowError: integers too large for a float
        return default
1712
1713
def strip_or_none(v):
    """Return v.strip(), or None when v is None."""
    if v is None:
        return None
    return v.strip()
1716
1717
def parse_duration(s):
    """Parse a duration string into a number of seconds.

    Three notations are tried in order: colon separated
    ([[[DD:]HH:]MM:]SS[.ms]), unit suffixed (e.g. '1h 2m 3.5s', optionally
    prefixed by 'PT'), and a lone number of hours or minutes. Returns None
    for non-string input or when nothing matches.
    """
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    days = hours = mins = secs = ms = None

    # The first two patterns capture the same five groups in the same order
    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
    if m is None:
        m = re.match(
            r'''(?ix)(?:P?T)?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?$''', s)
    if m is not None:
        days, hours, mins, secs, ms = m.groups()
    else:
        m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
        if m is None:
            return None
        hours, mins = m.groups()

    total = 0
    if secs:
        total += float(secs)
    if mins:
        total += float(mins) * 60
    if hours:
        total += float(hours) * 60 * 60
    if days:
        total += float(days) * 24 * 60 * 60
    if ms:
        # ms is captured with its leading dot, i.e. a fraction of a second
        total += float(ms)
    return total
1764
1765
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext in front of the real extension of filename.

    When expected_real_ext is given and differs from the real extension,
    ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
1772
1773
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension of filename with ext.

    When expected_real_ext is given and the current extension differs from
    it, nothing is removed and ext is appended to the complete filename.
    """
    root, current_ext = os.path.splitext(filename)
    keep_whole_name = expected_real_ext and current_ext[1:] != expected_real_ext
    base = filename if keep_whole_name else root
    return '{0}.{1}'.format(base, ext)
1779
1780
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        # Wait for the process to finish; its output is discarded
        process.communicate()
    except OSError:
        return False
    else:
        return exe
1789
1790
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        process = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        # stderr is merged into stdout, so only the first item is of interest
        output = process.communicate()[0]
    except OSError:
        return False
    if isinstance(output, bytes):
        output = output.decode('ascii', 'ignore')
    return detect_exe_version(output, version_re, unrecognized)
1804
1805
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from the output of an executable.

    version_re must contain one capturing group holding the version; it
    defaults to a pattern matching "version <something>". Returns the
    captured text, or unrecognized when the pattern is not found.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
1815
1816
class PagedList(object):
    """Base class for paged lists; subclasses are expected to provide getslice()."""

    def __len__(self):
        # This is only useful for tests
        all_entries = self.getslice()
        return len(all_entries)
1821
1822
class OnDemandPagedList(PagedList):
    """Paged list that requests pages one by one, only as far as needed.

    pagefunc(pagenum) must return an iterable with the entries of the
    zero-based page pagenum; pagesize is the number of entries on a full
    page. With use_cache=True every fetched page is stored in a dict keyed
    by page number and reused by later getslice() calls.
    """

    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list.

        With end=None pages are requested until one comes back shorter than
        pagesize.
        """
        res = []
        # Begin with the page containing start and walk forward
        for pagenum in itertools.count(start // self._pagesize):
            # Global indices of the first entry of this page and of the next
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            # Skip pages that end before start
            if start >= nextfirstid:
                continue

            page_results = None
            if self._use_cache:
                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results

            # Offset of start inside this page (0 unless start falls in it)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Cut-off inside this page when end falls in it, otherwise None
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if the current page is not "full", i.e.
            # does not contain pagesize entries, we can assume that this page
            # is the last one - there are no more ids on further pages -
            # so there is no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1873
1874
class InAdvancePagedList(PagedList):
    """Paged list whose number of pages is known in advance.

    pagefunc(pagenum) must return an iterable with the entries of the
    zero-based page pagenum, pagecount is the number of available pages and
    pagesize the number of entries on a full page.
    """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list.

        With end=None everything from start up to the last page is returned.
        """
        res = []
        start_page = start // self._pagesize
        # Never ask pagefunc for a page beyond the announced page count, even
        # if end points past the last page
        end_page = (
            self._pagecount if end is None
            else min(self._pagecount, end // self._pagesize + 1))
        # Entries to drop from the front of the first requested page
        skip_elems = start - start_page * self._pagesize
        # Number of entries still wanted (None means no upper limit)
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # This page satisfies the rest of the request
                    res.extend(page[:only_more])
                    break
            res.extend(page)
        return res
1902
1903
def uppercase_escape(s):
    """Decode every \\UXXXXXXXX escape sequence in s; other text is left as is."""
    decode = codecs.getdecoder('unicode_escape')

    def _unescape(match):
        # The decoder returns (decoded text, consumed length)
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _unescape, s)
1910
1911
def lowercase_escape(s):
    """Decode every \\uXXXX escape sequence in s; other text is left as is."""
    decode = codecs.getdecoder('unicode_escape')

    def _unescape(match):
        # The decoder returns (decoded text, consumed length)
        return decode(match.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', _unescape, s)
1918
1919
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    on_python2 = sys.version_info < (3, 0)
    if on_python2 and isinstance(s, compat_str):
        # On Python 2 the text is passed to quote() as UTF-8 bytes
        s = s.encode('utf-8')
    # Characters that are left unescaped
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
1925
1926
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # The host name is IDNA-encoded; all other components are percent-escaped
    idna_netloc = parts.netloc.encode('idna').decode('ascii')
    escaped = parts._replace(
        netloc=idna_netloc,
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment))
    return escaped.geturl()
1937
1938
def read_batch_urls(batch_fd):
    """Read the URLs contained in a batch file.

    batch_fd is an iterable of lines (text or UTF-8 encoded bytes) that is
    closed once it has been read. A leading UTF-8 byte order mark is removed
    from a line, surrounding whitespace is stripped, and empty lines as well
    as lines starting with '#', ';' or ']' are dropped.
    Returns the remaining URLs as a list.
    """
    # A UTF-8 BOM appears as U+FEFF when the input was decoded as UTF-8, and
    # as the three characters \xef\xbb\xbf when its bytes were decoded one
    # byte per character (e.g. as latin-1)
    BOMS = ('\xef\xbb\xbf', '\ufeff')

    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
                break
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1953
1954
def urlencode_postdata(*args, **kargs):
    """URL-encode the given data and return the result as ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
1957
1958
def update_url_query(url, query):
    """Return url with the parameters of the query mapping merged into its query string.

    Parameters in query replace those of the same name already present in
    url. url is returned unchanged when query is empty.
    """
    if not query:
        return url
    parts = compat_urlparse.urlparse(url)
    merged = compat_parse_qs(parts.query)
    merged.update(query)
    new_query = compat_urllib_parse_urlencode(merged, True)
    return compat_urlparse.urlunparse(parts._replace(query=new_query))
1967
1968
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Build a new request derived from req.

    url and data replace the original values when truthy, headers are merged
    over the original headers and query is merged into the URL. HEAD and PUT
    requests keep their request class, and the timeout attribute is carried
    over when req has one.
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers)
    payload = data or req.data
    target_url = update_url_query(url or req.get_full_url(), query)
    method = req.get_method()
    # Keep the request class that corresponds to the HTTP method
    request_class = {
        'HEAD': HEADRequest,
        'PUT': PUTRequest,
    }.get(method, compat_urllib_request.Request)
    new_req = request_class(
        target_url, data=payload, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
1987
1988
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key, or the first usable key out of several, in d.

    For a list or tuple of keys the value of the first key that is present
    and not None (and, unless skip_false_values is False, not falsy) is
    returned, or default if there is none. A single key behaves like
    d.get(key, default).
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
1997
1998
def try_get(src, getter, expected_type=None):
    """Return getter(src), or None if the lookup fails.

    AttributeError, KeyError, TypeError and IndexError raised by getter are
    swallowed. When expected_type is given, a value that is not an instance
    of it yields None as well.
    """
    try:
        value = getter(src)
    except (AttributeError, KeyError, TypeError, IndexError):
        return None
    if expected_type is not None and not isinstance(value, expected_type):
        return None
    return value
2007
2008
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return string as compat_str, decoding it with encoding/errors when it is not one already."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
2011
2012
# Maps US rating labels to the age limit that parse_age_limit() returns for
# them
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
2020
2021
# Maps TV Parental Guidelines labels to the age limit that parse_age_limit()
# returns for them
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
2030
2031
def parse_age_limit(s):
    """Convert an age rating to an age limit in years.

    Integers between 0 and 21 are returned as they are. Strings may be a
    one or two digit number with an optional trailing "+", or a label found
    in US_RATINGS or TV_PARENTAL_GUIDELINES. Anything else yields None.
    """
    # Exact type check: bool (a subclass of int) does not take this branch
    if type(s) is int:
        if 0 <= s <= 21:
            return s
        return None
    if not isinstance(s, compat_basestring):
        return None
    age_match = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if age_match:
        return int(age_match.group('age'))
    try:
        return US_RATINGS[s]
    except KeyError:
        return TV_PARENTAL_GUIDELINES.get(s)
2043
2044
def strip_jsonp(code):
    """Remove a JSONP wrapper "callback(...);" (and trailing // comments) from code.

    Input that does not look like a JSONP call is returned unchanged.
    """
    jsonp_call = r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_call, r'\1', code)
2048
2049
def js_to_json(code):
    """Convert a JavaScript object/array literal to JSON text.

    Handles single-quoted strings, unquoted identifier keys, /* */ comments,
    trailing commas before ] or }, and hexadecimal/octal integer literals.
    Quoted strings keep their contents (apart from escape normalization)
    even when they look like numbers.
    """
    # Rewrites applied inside string literals so that they are valid JSON
    STRING_ESCAPES = {
        '"': '\\"',
        "\\'": "'",
        '\\\n': '',
        '\\x': '\\u00',
    }
    # (pattern, base) pairs for integer literals JSON cannot represent
    INTEGER_TABLE = (
        (r'^(0[xX][0-9a-fA-F]+)\s*:?$', 16),
        (r'^(0+[0-7]+)\s*:?$', 8),
    )

    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v.startswith('/*') or v == ',':
            # Comments and trailing commas are dropped
            return ""

        if v[0] in ("'", '"'):
            v = re.sub(
                r'(?s)\\.|"',
                lambda em: STRING_ESCAPES.get(em.group(0), em.group(0)),
                v[1:-1])
            # A quoted string stays a string: do not reinterpret contents
            # such as "0x10" or "007" as integer literals
            return '"%s"' % v

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                # A trailing colon marks an object key, which must be quoted
                return '"%d":' % i if v.endswith(':') else '%d' % i

        return '"%s"' % v

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        /\*.*?\*/|,(?=\s*[\]}])|
        [a-zA-Z_][.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
        [0-9]+(?=\s*:)
        ''', fix_kv, code)
2087
2088
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # The position in quality_ids is the quality; unknown ids rank lowest
        try:
            rank = quality_ids.index(qid)
        except ValueError:
            rank = -1
        return rank
    return q
2097
2098
# Default output template: a %-style format string with the named fields
# title, id and ext
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
2100
2101
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    if len(s) <= length:
        return s
    ELLIPSES = '...'
    # Make room for the ellipses so that the result is length characters long
    return s[:length - len(ELLIPSES)] + ELLIPSES
2110
2111
def version_tuple(v):
    """Split a version string on "." and "-" and return its parts as a tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
2114
2115
def is_outdated_version(version, limit, assume_new=True):
    """Tell whether version is older than limit.

    When version is empty or one of the versions cannot be parsed, the
    answer follows assume_new: not outdated if it is true, outdated
    otherwise.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        current = version_tuple(version)
        minimum = version_tuple(limit)
    except ValueError:
        return fallback
    return current < minimum
2123
2124
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Either running as a frozen executable ...
    if hasattr(sys, 'frozen'):
        return True
    # ... or this module was loaded from a zip archive
    return isinstance(globals().get('__loader__'), zipimporter)
2130
2131
def args_to_str(args):
    """Return a short, shell-quoted string representation of a subprocess command."""
    quoted = [compat_shlex_quote(a) for a in args]
    return ' '.join(quoted)
2135
2136
def error_to_compat_str(err):
    """Return the message of err as text."""
    message = str(err)
    if sys.version_info[0] >= 3:
        return message
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return message.decode(preferredencoding())
2144
2145
def mimetype2ext(mt):
    """Map a MIME type to a file extension.

    Parameters after ";" are ignored and matching is case-insensitive. A few
    complete types have a dedicated extension; otherwise the subtype is
    translated through a table and, if it is not listed there, returned
    as it is. Returns None when mt is None.
    """
    if mt is None:
        return None

    # Drop parameters such as ";charset=utf-8" or ";codecs=..." and normalize
    # case up front so that both lookups below see the bare type
    mt = mt.split(';')[0].strip().lower()

    ext = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    }.get(mt)
    if ext is not None:
        return ext

    res = mt.rpartition('/')[2].strip()

    return {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'srt': 'srt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'vtt': 'vtt',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m': 'f4m',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
    }.get(res, res)
2182
2183
def parse_codecs(codecs_str):
    """Split a codecs string into video and audio codec.

    codecs_str is a comma separated list of codecs, see
    http://tools.ietf.org/html/rfc6381
    Returns a dict with the keys 'vcodec' and 'acodec' ('none' marks a
    missing stream when at least one codec was recognised), or an empty dict
    when nothing can be determined. A warning is written to stderr for every
    unrecognised codec.
    """
    if not codecs_str:
        return {}
    stripped = (part.strip() for part in codecs_str.strip().strip(',').split(','))
    splited_codecs = [part for part in stripped if part]
    vcodec, acodec = None, None
    for full_codec in splited_codecs:
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
            if not vcodec:
                vcodec = full_codec
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3'):
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr)
    if vcodec or acodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    # Nothing was recognised, so vcodec and acodec are both None here and the
    # parsed codec strings themselves have to be used
    if len(splited_codecs) == 2:
        return {
            'vcodec': splited_codecs[0],
            'acodec': splited_codecs[1],
        }
    elif len(splited_codecs) == 1:
        # NOTE(review): the previous code returned the (always None) vcodec
        # variable as acodec here; the resulting value is kept unchanged.
        # Confirm whether the single unknown codec was meant instead.
        return {
            'vcodec': 'none',
            'acodec': None,
        }
    return {}
2218
2219
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from the headers of url_handle.

    The filename of an attachment Content-Disposition header is preferred;
    otherwise the extension is derived from the Content-Type header.
    """
    headers = url_handle.headers

    disposition = headers.get('Content-Disposition')
    if disposition:
        name_match = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if name_match:
            ext = determine_ext(name_match.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(headers.get('Content-Type'))
2232
2233
def encode_data_uri(data, mime_type):
    """Return a base64 "data:" URI holding the bytes data with the given MIME type."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
2236
2237
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # Nothing is blocked when no limit is set or when the content is
    # available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
2246
2247
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """

    # The UTF-32 marks must be tested before the UTF-16 ones: the UTF-32-LE
    # mark begins with the UTF-16-LE mark
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    # Without a byte order mark the data is treated as UTF-8
    encoding, offset = 'utf-8', 0
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            encoding, offset = enc, len(bom)
            break
    text = first_bytes[offset:].decode(encoding, 'replace')

    return re.match(r'^\s*<', text)
2266
2267
def determine_protocol(info_dict):
    """Return the download protocol for info_dict.

    An explicit 'protocol' entry wins. Otherwise the protocol is derived
    from info_dict['url']: its rtmp/mms/rtsp prefix, then its m3u8/f4m
    extension, and finally its URL scheme.
    """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
2288
2289
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Width of the widest cell of every column
    widths = [
        max(len(compat_str(cell)) for cell in column)
        for column in zip(*rows)]
    # All columns but the last are left-aligned and padded
    padded_columns = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    row_format = ' '.join(padded_columns) + '%s'
    return '\n'.join(row_format % tuple(row) for row in rows)
2296
2297
def _match_one(filter_part, dct):
    """Evaluate one filter expression against dct.

    Two forms are understood:
      * "key OP value" with OP one of <, <=, >, >=, =, !=. A "?" directly
        after OP makes the part pass when key is missing from dct. Strings
        can only be compared with = and !=.
      * "key" / "!key", testing whether key is present / absent.

    Raises ValueError for anything else.
    """
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    comparison_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(re.escape(symbol) for symbol in COMPARISON_OPERATORS.keys()))
    comparison = comparison_rex.search(filter_part)
    if comparison:
        symbol = comparison.group('op')
        compare = COMPARISON_OPERATORS[symbol]
        str_operand = comparison.group('strval')
        if str_operand is not None:
            if symbol not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % symbol)
            expected = str_operand
        else:
            int_operand = comparison.group('intval')
            try:
                expected = int(int_operand)
            except ValueError:
                # Not a plain integer: try it as a file size, with and
                # without an explicit trailing "B"
                expected = parse_filesize(int_operand)
                if expected is None:
                    expected = parse_filesize(int_operand + 'B')
                if expected is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            int_operand, filter_part))
        actual = dct.get(comparison.group('key'))
        if actual is None:
            # Truthy only when the operator was followed by "?"
            return comparison.group('none_inclusive')
        return compare(actual, expected)

    UNARY_OPERATORS = {
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    }
    unary_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(re.escape(symbol) for symbol in UNARY_OPERATORS.keys()))
    unary = unary_rex.search(filter_part)
    if unary:
        check = UNARY_OPERATORS[unary.group('op')]
        return check(dct.get(unary.group('key')))

    raise ValueError('Invalid filter part %r' % filter_part)
2355
2356
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """

    # The parts are separated by "&" and every one of them has to pass
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
2362
2363
def match_filter_func(filter_str):
    """Build a function that checks an info dict against filter_str.

    The returned function yields None when the info dict passes the filter
    and a message explaining the skip otherwise.
    """
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        # Name the video by its title, then its id, then a generic word
        fallback_name = info_dict.get('id', 'video')
        video_title = info_dict.get('title', fallback_name)
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
2372
2373
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression and return it in seconds.

    Accepts an offset such as "12.5" or "12.5s", or a clock time
    "H:MM:SS", optionally followed by "." or ":" and more digits.
    Returns None for empty or unrecognised input.
    """
    if not time_expr:
        return None

    offset = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock:
        hours, minutes, seconds = clock.groups()
        # The trailing digits may be attached with ":" instead of "."
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))

    return None
2385
2386
def srt_subtitles_timecode(seconds):
    """Format a number of seconds as an SRT timecode HH:MM:SS,mmm."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    # %d drops the fractional part of every component
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
2389
2390
def dfxp2srt(dfxp_data):
    """Convert a DFXP/TTML subtitle document (text) to SRT text.

    Paragraphs without a begin time, or with neither an end time nor a
    duration, are left out. Raises ValueError when the document contains no
    paragraphs at all.
    """
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
    })

    class TTMLPElementParser(object):
        """Parser target collecting the text of a paragraph, with <br> as newline."""

        def __init__(self):
            self.out = ''

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                self.out += '\n'

        def end(self, tag):
            pass

        def data(self, data):
            self.out += data

        def close(self):
            return self.out.strip()

    def parse_node(node):
        # Serialize the node again and feed it through the text collector
        collector = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=collector)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))

    # Try the known namespaces in turn, then fall back to un-namespaced <p>
    for namespaced_path in ('.//ttml:p', './/ttaf1:p', './/ttaf1_0604:p'):
        paras = dfxp.findall(_x(namespaced_path))
        if paras:
            break
    else:
        paras = dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    cues = []
    # Skipped paragraphs still use up their index
    for index, para in enumerate(paras, 1):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        cues.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(cues)
2444
2445
def cli_option(params, command_option, param):
    """Build the command line arguments for an option that takes a value.

    Returns [command_option, value] with the value of params[param], or an
    empty list when that value is missing or None. Truthy values are
    converted to text.
    """
    value = params.get(param)
    if value is None:
        return []
    if value:
        value = compat_str(value)
    return [command_option, value]
2451
2452
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build the command line arguments for a boolean option.

    params[param] must be a bool. Returns [command_option, value], or the
    single argument command_option + separator + value when separator is
    given, where value is true_value or false_value.
    """
    flag = params.get(param)
    assert isinstance(flag, bool)
    value = true_value if flag else false_value
    if separator:
        return [command_option + separator + value]
    return [command_option, value]
2459
2460
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when params[param] equals expected_value, else an empty list."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
2464
2465
def cli_configuration_args(params, param, default=[]):
    """Return the list of extra arguments stored in params[param].

    default is returned when the entry is missing or None; any other value
    must be a list.
    """
    ex_args = params.get(param)
    if ex_args is not None:
        assert isinstance(ex_args, list)
        return ex_args
    return default
2472
2473
class ISO639Utils(object):
    """Conversion between two-letter and three-letter language codes."""

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # Keys are the two-letter codes, values the three-letter ones
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters of code are looked up
        short_code = code[:2]
        return cls._lang_map.get(short_code)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        matches = (
            short_name
            for short_name, long_name in cls._lang_map.items()
            if long_name == code)
        # None when no entry maps to code
        return next(matches, None)
2674
2675
class ISO3166Utils(object):
    """Lookup helpers for two-letter (ISO 3166-1 alpha-2) country codes"""

    # Maps upper-case two-letter country codes to English country names
    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-1 alpha-2 country code to the corresponding full name

        The code is upper-cased before the lookup, so matching is
        case-insensitive. Returns None when the code is not in the table.
        """
        return cls._country_map.get(code.upper())
2934
2935
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """Proxy handler whose proxy can be overridden for a single request

    A request carrying a 'Ytdl-request-proxy' header uses that value instead
    of the proxy this handler was configured with; the header is removed
    from the request before it goes any further.
    """

    def __init__(self, proxies=None):
        # Default http/https openers that go through proxy_open() with the
        # '__noproxy__' marker; installed before the base class initialiser
        # runs with the configured proxies
        for scheme in ('http', 'https'):
            setattr(
                self, '%s_open' % scheme,
                lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                    meth(r, proxy, type))
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        """Pick the proxy for req, honouring a per-request override

        Returns None when no proxy is used or when a SOCKS proxy has been
        recorded on the request; otherwise returns whatever the base class
        proxy_open() returns.
        """
        override = req.headers.get('Ytdl-request-proxy')
        if override is not None:
            # The override is consumed here, it must not be sent out
            del req.headers['Ytdl-request-proxy']
            proxy = override

        if proxy == '__noproxy__':
            return None  # No Proxy

        proxy_scheme = compat_urlparse.urlparse(proxy).scheme.lower()
        if proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            # Only record the SOCKS proxy on the request: youtube-dl's
            # http/https handlers take care of wrapping the socket
            req.add_header('Ytdl-socks-proxy', proxy)
            return None

        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
2959
2960
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Encrypt a single block with OHDave's RSA scheme, see
    http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    The byte order of data is reversed before it is turned into an integer
    and no padding is added.

    Limitation: supports one block encryption only
    '''

    reversed_hex = binascii.hexlify(data[::-1])
    plain_number = int(reversed_hex, 16)
    cipher_number = pow(plain_number, exponent, modulus)
    return '%x' % cipher_number
2976
2977
def encode_base_n(num, n, table=None):
    """Encode a non-negative integer as a string in base n

    Input:
        num: the integer to encode, must not be negative
        n: the base, must be at least 2
        table: optional string of digit characters, table[i] being the
            character used for digit value i; defaults to the first n
            characters of 0-9, a-z, A-Z (so at most base 62)
    Output: the encoded string, most significant digit first

    Raises ValueError when the base is smaller than 2 or larger than the
    digit table, or when num is negative.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))
    # A base below 2 can never consume the number: base 1 leaves it unchanged
    # on every division and base 0 divides by zero
    if n < 2:
        raise ValueError('base %d is too small, must be at least 2' % n)
    # Floor division never brings a negative number to zero (-1 // n == -1),
    # so the digit loop below would not terminate
    if num < 0:
        raise ValueError('cannot encode negative number %d' % num)

    if num == 0:
        return table[0]

    digits = []
    while num:
        num, digit = divmod(num, n)
        digits.append(table[digit])
    # Digits were produced least significant first
    return ''.join(reversed(digits))
2994
2995
def decode_packed_codes(code):
    """Expand packed JavaScript back into its source text

    code must contain a call tail of the form
        }('payload',base,count,'sym|bols'.split('|')
    Every word of the payload is the base-`base` encoding of an index into
    the symbol list and is replaced by the symbol stored there. An empty
    symbol means the word stands for itself.

    Returns the payload with all words substituted.
    """
    match = re.search(
        r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
        code)
    packed_source, radix, total, words = match.groups()
    radix, total = int(radix), int(total)
    words = words.split('|')

    # Encoded index -> replacement word, built from the highest index down
    replacements = {}
    for index in reversed(range(total)):
        encoded_index = encode_base_n(index, radix)
        replacements[encoded_index] = words[index] or encoded_index

    def _expand(word_match):
        return replacements[word_match.group(0)]

    return re.sub(r'\b(\w+)\b', _expand, packed_source)
3014
3015
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list into a dict of strings

    Attribute names consist of upper-case letters, digits and hyphens.
    A value is either a double-quoted string, returned without its quotes,
    or an unquoted run of characters containing neither a comma nor a
    double quote. When a name occurs more than once the last value wins.
    """
    attributes = {}
    pairs = re.finditer(
        r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib)
    for pair in pairs:
        name, value = pair.group('key', 'val')
        # Only the quoted alternative can start with a double quote, so the
        # last character is the closing quote
        attributes[name] = value[1:-1] if value.startswith('"') else value
    return attributes
3023
3024
def urshift(val, n):
    """Shift val right by n bits, treating a negative val as unsigned 32-bit

    A negative val has 2**32 added to it before the shift; a non-negative
    val is shifted as is.
    """
    if val < 0:
        val += 0x100000000
    return val >> n
3027
3028
3029 # Based on png2str() written by @gdkchan and improved by @yokrysty
3030 # Originally posted at https://github.com/rg3/youtube-dl/issues/9706
def decode_png(png_data):
    """
    Decode a PNG image into rows of unfiltered sample values

    Input:
        png_data: the complete PNG file, bytes-like object
    Output: tuple (width, height, pixels); pixels holds one list per
        scanline, each made of width * 3 integers in the range 0-255

    Limitation: every scanline is read as width * 3 bytes with neighbouring
    pixels 3 bytes apart, i.e. as 8-bit RGB samples; the bit depth, colour
    type and interlace method stored in IHDR are not checked

    Raises IOError when the PNG signature or the leading IHDR chunk is
    missing, or when the file carries no IDAT data
    """
    # Reference: https://www.w3.org/TR/PNG/
    signature_size = 8

    if (png_data[:signature_size] != b'\x89PNG\x0d\x0a\x1a\x0a'
            or png_data[signature_size + 4:signature_size + 8] != b'IHDR'):
        raise IOError('Not a valid PNG file.')

    struct_formats = {1: '>B', 2: '>H', 4: '>I'}

    def _read_uint(raw):
        # Big-endian unsigned integer of 1, 2 or 4 bytes
        return compat_struct_unpack(struct_formats[len(raw)], raw)[0]

    def _paeth_predictor(a, b, c):
        # Whichever of left (a), up (b) and upper-left (c) is closest to
        # a + b - c, ties being resolved in that order
        estimate = a + b - c
        dist_a = abs(estimate - a)
        dist_b = abs(estimate - b)
        dist_c = abs(estimate - c)
        if dist_a <= dist_b and dist_a <= dist_c:
            return a
        if dist_b <= dist_c:
            return b
        return c

    # Walk the chunk sequence: 4 bytes length, 4 bytes type, payload, 4 bytes CRC
    ihdr = None
    idat_parts = []
    offset = signature_size
    end = len(png_data)
    while offset < end:
        payload_size = _read_uint(png_data[offset:offset + 4])
        chunk_type = png_data[offset + 4:offset + 8]
        payload_start = offset + 8
        payload = png_data[payload_start:payload_start + payload_size]
        offset = payload_start + payload_size + 4  # Skip CRC

        if ihdr is None:
            # The first chunk was verified to be IHDR above
            ihdr = payload
        if chunk_type == b'IDAT':
            idat_parts.append(payload)

    width = _read_uint(ihdr[:4])
    height = _read_uint(ihdr[4:8])

    idat = b''.join(idat_parts)
    if not idat:
        raise IOError('Unable to read PNG data.')

    scanlines = bytearray(zlib.decompress(idat))

    bytes_per_pixel = 3
    stride = width * bytes_per_pixel
    pixels = []
    previous_row = None

    for y in range(height):
        # Each scanline is one filter type byte followed by stride data bytes
        row_start = y * (1 + stride)
        filter_type = scanlines[row_start]

        current_row = []
        pixels.append(current_row)

        for x in range(stride):
            color = scanlines[1 + row_start + x]
            has_left = x >= bytes_per_pixel
            has_up = y > 0

            # Neighbours outside the image count as 0
            left = current_row[x - bytes_per_pixel] if has_left else 0
            up = previous_row[x] if has_up else 0

            predictor = 0  # Filter type 0 and unknown types leave the byte as is
            if filter_type == 1:  # Sub
                predictor = left
            elif filter_type == 2:  # Up
                predictor = up
            elif filter_type == 3:  # Average
                predictor = (left + up) >> 1
            elif filter_type == 4:  # Paeth
                upper_left = 0
                if has_left and has_up:
                    upper_left = previous_row[x - bytes_per_pixel]
                predictor = _paeth_predictor(left, up, upper_left)

            current_row.append((color + predictor) & 0xff)

        previous_row = current_row

    return width, height, pixels