4 from __future__ import unicode_literals
36 import xml.etree.ElementTree
40 compat_HTMLParseError,
46 compat_ctypes_WINFUNCTYPE,
47 compat_etree_fromstring,
50 compat_html_entities_html5,
62 compat_urllib_parse_urlencode,
63 compat_urllib_parse_urlparse,
64 compat_urllib_parse_unquote_plus,
65 compat_urllib_request,
def register_socks_protocols():
    """Teach urlparse that SOCKS URL schemes carry a network location.

    Works around https://bugs.python.org/issue7904 (Python < 2.6.5), where
    urlsplit() mishandles URLs whose scheme is not listed in
    urlparse.uses_netloc.
    """
    for socks_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if socks_scheme in compat_urlparse.uses_netloc:
            continue
        compat_urlparse.uses_netloc.append(socks_scheme)
# Type of a compiled regular expression pattern. The re module exposes no
# public name for it, so this is not clearly defined otherwise; used for
# isinstance() checks on caller-supplied patterns.
compiled_regex_type = type(re.compile(''))
89 def random_user_agent():
90 _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
1669 return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
1673 'User-Agent': random_user_agent(),
1674 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
1675 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
1676 'Accept-Encoding': 'gzip, deflate',
1677 'Accept-Language': 'en-us,en;q=0.5',
1682 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
1686 NO_DEFAULT = object()
# Full English month names, index 0 == January; used for date parsing.
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
1693 'en': ENGLISH_MONTH_NAMES,
1695 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
1696 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
1699 KNOWN_EXTENSIONS = (
1700 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
1701 'flv', 'f4v', 'f4a', 'f4b',
1702 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
1703 'mkv', 'mka', 'mk3d',
1706 'asf', 'wmv', 'wma',
1712 'f4f', 'f4m', 'm3u8', 'smil')
# needed for sanitizing filenames in restricted mode
# Maps each accented Latin character to an ASCII transliteration; some map to
# multi-character sequences ('AE', 'OE', 'TH', 'ss'), hence the mix of plain
# strings (iterated char by char) and single-item lists in the chain below.
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
1740 '%Y/%m/%d %H:%M:%S',
1742 '%Y-%m-%d %H:%M:%S',
1743 '%Y-%m-%d %H:%M:%S.%f',
1746 '%Y-%m-%dT%H:%M:%SZ',
1747 '%Y-%m-%dT%H:%M:%S.%fZ',
1748 '%Y-%m-%dT%H:%M:%S.%f0Z',
1749 '%Y-%m-%dT%H:%M:%S',
1750 '%Y-%m-%dT%H:%M:%S.%f',
1752 '%b %d %Y at %H:%M',
1753 '%b %d %Y at %H:%M:%S',
1754 '%B %d %Y at %H:%M',
1755 '%B %d %Y at %H:%M:%S',
1758 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
1759 DATE_FORMATS_DAY_FIRST.extend([
1765 '%d/%m/%Y %H:%M:%S',
1768 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
1769 DATE_FORMATS_MONTH_FIRST.extend([
1774 '%m/%d/%Y %H:%M:%S',
# Captures the argument list of "P.A.C.K.E.R."-style packed (obfuscated)
# JavaScript: payload, radix, count, and the '|'-separated symbol table.
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Extracts the body of a <script type="application/ld+json"> element
# (JSON-LD structured data) from an HTML document.
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
1781 def preferredencoding():
1782 """Get preferred encoding.
1784 Returns the best encoding scheme for the system, based on
1785 locale.getpreferredencoding() and some further tweaks.
1788 pref = locale.getpreferredencoding()
1796 def write_json_file(obj, fn):
1797 """ Encode obj as JSON and write it to fn, atomically if possible """
1799 fn = encodeFilename(fn)
1800 if sys.version_info < (3, 0) and sys.platform != 'win32':
1801 encoding = get_filesystem_encoding()
1802 # os.path.basename returns a bytes object, but NamedTemporaryFile
1803 # will fail if the filename contains non ascii characters unless we
1804 # use a unicode object
1805 path_basename = lambda f: os.path.basename(fn).decode(encoding)
1806 # the same for os.path.dirname
1807 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
1809 path_basename = os.path.basename
1810 path_dirname = os.path.dirname
1814 'prefix': path_basename(fn) + '.',
1815 'dir': path_dirname(fn),
1819 # In Python 2.x, json.dump expects a bytestream.
1820 # In Python 3.x, it writes to a character stream
1821 if sys.version_info < (3, 0):
1826 'encoding': 'utf-8',
1829 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
1834 if sys.platform == 'win32':
1835 # Need to remove existing file on Windows, else os.rename raises
1836 # WindowsError or FileExistsError.
1844 os.chmod(tf.name, 0o666 & ~mask)
1847 os.rename(tf.name, fn)
1856 if sys.version_info >= (2, 7):
1857 def find_xpath_attr(node, xpath, key, val=None):
1858 """ Find the xpath xpath[@key=val] """
1859 assert re.match(r'^[a-zA-Z_-]+$', key)
1860 expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
1861 return node.find(expr)
1863 def find_xpath_attr(node, xpath, key, val=None):
1864 for f in node.findall(compat_xpath(xpath)):
1865 if key not in f.attrib:
1867 if val is None or f.attrib.get(key) == val:
1871 # On python2.6 the xml.etree.ElementTree.Element methods don't support
1872 # the namespace parameter
1875 def xpath_with_ns(path, ns_map):
1876 components = [c.split(':') for c in path.split('/')]
1878 for c in components:
1880 replaced.append(c[0])
1883 replaced.append('{%s}%s' % (ns_map[ns], tag))
1884 return '/'.join(replaced)
1887 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
1888 def _find_xpath(xpath):
1889 return node.find(compat_xpath(xpath))
1891 if isinstance(xpath, (str, compat_str)):
1892 n = _find_xpath(xpath)
1900 if default is not NO_DEFAULT:
1903 name = xpath if name is None else name
1904 raise ExtractorError('Could not find XML element %s' % name)
1910 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
1911 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
1912 if n is None or n == default:
1915 if default is not NO_DEFAULT:
1918 name = xpath if name is None else name
1919 raise ExtractorError('Could not find XML element\'s text %s' % name)
1925 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
1926 n = find_xpath_attr(node, xpath, key)
1928 if default is not NO_DEFAULT:
1931 name = '%s[@%s]' % (xpath, key) if name is None else name
1932 raise ExtractorError('Could not find XML attribute %s' % name)
1935 return n.attrib[key]
def get_element_by_id(id, html):
    """Return the inner content of the first tag whose id attribute is *id*, or None."""
    return get_element_by_attribute('id', id, html)
def get_element_by_class(class_name, html):
    """Return the inner content of the first tag carrying *class_name*, or None."""
    matches = get_elements_by_class(class_name, html)
    if matches:
        return matches[0]
    return None
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the inner content of the first tag whose *attribute* matches *value*, or None."""
    matches = get_elements_by_attribute(attribute, value, html, escape_value)
    if matches:
        return matches[0]
    return None
def get_elements_by_class(class_name, html):
    """Return the inner contents of all tags carrying *class_name*, as a list."""
    # Match class_name as a whole word anywhere inside the quoted class list;
    # the pattern is pre-built, so escaping by the callee must be disabled.
    class_pattern = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute('class', class_pattern, html, escape_value=False)
1961 def get_elements_by_attribute(attribute, value, html, escape_value=True):
1962 """Return the content of the tag with the specified attribute in the passed HTML document"""
1964 value = re.escape(value) if escape_value else value
1967 for m in re.finditer(r'''(?xs)
1969 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
1971 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
1975 ''' % (re.escape(attribute), value), html):
1976 res = m.group('content')
1978 if res.startswith('"') or res.startswith("'"):
1981 retlist.append(unescapeHTML(res))
1986 class HTMLAttributeParser(compat_HTMLParser):
1987 """Trivial HTML parser to gather the attributes for a single element"""
1990 compat_HTMLParser.__init__(self)
1992 def handle_starttag(self, tag, attrs):
1993 self.attrs = dict(attrs)
1996 def extract_attributes(html_element):
1997 """Given a string for an HTML element such as
1999 a="foo" B="bar" c="&98;az" d=boz
2000 empty= noval entity="&"
2003 Decode and return a dictionary of attributes.
2005 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
2006 'empty': '', 'noval': None, 'entity': '&',
2007 'sq': '"', 'dq': '\''
2009 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
2010 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
2012 parser = HTMLAttributeParser()
2014 parser.feed(html_element)
2016 # Older Python may throw HTMLParseError in case of malformed HTML
2017 except compat_HTMLParseError:
2022 def clean_html(html):
2023 """Clean an HTML snippet into a readable string"""
2025 if html is None: # Convenience for sanitizing descriptions etc.
2029 html = html.replace('\n', ' ')
2030 html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
2031 html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
2033 html = re.sub('<.*?>', '', html)
2034 # Replace html entities
2035 html = unescapeHTML(html)
2039 def sanitize_open(filename, open_mode):
2040 """Try to open the given filename, and slightly tweak it if this fails.
2042 Attempts to open the given filename. If this fails, it tries to change
2043 the filename slightly, step by step, until it's either able to open it
2044 or it fails and raises a final exception, like the standard open()
2047 It returns the tuple (stream, definitive_file_name).
2051 if sys.platform == 'win32':
2053 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
2054 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
2055 stream = open(encodeFilename(filename), open_mode)
2056 return (stream, filename)
2057 except (IOError, OSError) as err:
2058 if err.errno in (errno.EACCES,):
2061 # In case of error, try to remove win32 forbidden chars
2062 alt_filename = sanitize_path(filename)
2063 if alt_filename == filename:
2066 # An exception here should be caught in the caller
2067 stream = open(encodeFilename(alt_filename), open_mode)
2068 return (stream, alt_filename)
2071 def timeconvert(timestr):
2072 """Convert RFC 2822 defined time string into system timestamp"""
2074 timetuple = email.utils.parsedate_tz(timestr)
2075 if timetuple is not None:
2076 timestamp = email.utils.mktime_tz(timetuple)
2080 def sanitize_filename(s, restricted=False, is_id=False):
2081 """Sanitizes a string so it could be used as part of a filename.
2082 If restricted is set, use a stricter subset of allowed characters.
2083 Set is_id if this is not an arbitrary string, but an ID that should be kept
2086 def replace_insane(char):
2087 if restricted and char in ACCENT_CHARS:
2088 return ACCENT_CHARS[char]
2089 if char == '?' or ord(char) < 32 or ord(char) == 127:
2092 return '' if restricted else '\''
2094 return '_-' if restricted else ' -'
2095 elif char in '\\/|*<>':
2097 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
2099 if restricted and ord(char) > 127:
2104 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
2105 result = ''.join(map(replace_insane, s))
2107 while '__' in result:
2108 result = result.replace('__', '_')
2109 result = result.strip('_')
2110 # Common case of "Foreign band name - English song title"
2111 if restricted and result.startswith('-_'):
2113 if result.startswith('-'):
2114 result = '_' + result[len('-'):]
2115 result = result.lstrip('.')
2121 def sanitize_path(s):
2122 """Sanitizes and normalizes path on Windows"""
2123 if sys.platform != 'win32':
2125 drive_or_unc, _ = os.path.splitdrive(s)
2126 if sys.version_info < (2, 7) and not drive_or_unc:
2127 drive_or_unc, _ = os.path.splitunc(s)
2128 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
2132 path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
2133 for path_part in norm_path]
2135 sanitized_path.insert(0, drive_or_unc + os.path.sep)
2136 return os.path.join(*sanitized_path)
2139 def sanitize_url(url):
2140 # Prepend protocol-less URLs with `http:` scheme in order to mitigate
2141 # the number of unwanted failures due to missing protocol
2142 if url.startswith('//'):
2143 return 'http:%s' % url
2144 # Fix some common typos seen so far
2146 # https://github.com/ytdl-org/youtube-dl/issues/15649
2147 (r'^httpss://', r'https://'),
2148 # https://bx1.be/lives/direct-tv/
2149 (r'^rmtp([es]?)://', r'rtmp\1://'),
2151 for mistake, fixup in COMMON_TYPOS:
2152 if re.match(mistake, url):
2153 return re.sub(mistake, fixup, url)
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request after running *url* through sanitize_url()."""
    cleaned_url = sanitize_url(url)
    return compat_urllib_request.Request(cleaned_url, *args, **kwargs)
2162 """Expand shell variables and ~"""
2163 return os.path.expandvars(compat_expanduser(s))
2166 def orderedSet(iterable):
2167 """ Remove all duplicates from the input iterable """
2175 def _htmlentity_transform(entity_with_semicolon):
2176 """Transforms an HTML entity to a character."""
2177 entity = entity_with_semicolon[:-1]
2179 # Known non-numeric HTML entity
2180 if entity in compat_html_entities.name2codepoint:
2181 return compat_chr(compat_html_entities.name2codepoint[entity])
2183 # TODO: HTML5 allows entities without a semicolon. For example,
2184 # 'Éric' should be decoded as 'Éric'.
2185 if entity_with_semicolon in compat_html_entities_html5:
2186 return compat_html_entities_html5[entity_with_semicolon]
2188 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
2189 if mobj is not None:
2190 numstr = mobj.group(1)
2191 if numstr.startswith('x'):
2193 numstr = '0%s' % numstr
2196 # See https://github.com/ytdl-org/youtube-dl/issues/7518
2198 return compat_chr(int(numstr, base))
2202 # Unknown entity in name, return its literal representation
2203 return '&%s;' % entity
2206 def unescapeHTML(s):
2209 assert type(s) == compat_str
2212 r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
2215 def get_subprocess_encoding():
2216 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
2217 # For subprocess calls, encode with locale encoding
2218 # Refer to http://stackoverflow.com/a/9951851/35070
2219 encoding = preferredencoding()
2221 encoding = sys.getfilesystemencoding()
2222 if encoding is None:
2227 def encodeFilename(s, for_subprocess=False):
2229 @param s The name of the file
2232 assert type(s) == compat_str
2234 # Python 3 has a Unicode API
2235 if sys.version_info >= (3, 0):
2238 # Pass '' directly to use Unicode APIs on Windows 2000 and up
2239 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
2240 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
2241 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
2244 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
2245 if sys.platform.startswith('java'):
2248 return s.encode(get_subprocess_encoding(), 'ignore')
2251 def decodeFilename(b, for_subprocess=False):
2253 if sys.version_info >= (3, 0):
2256 if not isinstance(b, bytes):
2259 return b.decode(get_subprocess_encoding(), 'ignore')
2262 def encodeArgument(s):
2263 if not isinstance(s, compat_str):
2264 # Legacy code that uses byte strings
2265 # Uncomment the following line after fixing all post processors
2266 # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
2267 s = s.decode('ascii')
2268 return encodeFilename(s, True)
def decodeArgument(b):
    """Decode a command-line argument via decodeFilename in subprocess mode."""
    return decodeFilename(b, True)
2275 def decodeOption(optval):
2278 if isinstance(optval, bytes):
2279 optval = optval.decode(preferredencoding())
2281 assert isinstance(optval, compat_str)
2285 def formatSeconds(secs):
2287 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
2289 return '%d:%02d' % (secs // 60, secs % 60)
2294 def make_HTTPS_handler(params, **kwargs):
2295 opts_no_check_certificate = params.get('nocheckcertificate', False)
2296 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
2297 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
2298 if opts_no_check_certificate:
2299 context.check_hostname = False
2300 context.verify_mode = ssl.CERT_NONE
2302 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
2305 # (create_default_context present but HTTPSHandler has no context=)
2308 if sys.version_info < (3, 2):
2309 return YoutubeDLHTTPSHandler(params, **kwargs)
2310 else: # Python < 3.4
2311 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
2312 context.verify_mode = (ssl.CERT_NONE
2313 if opts_no_check_certificate
2314 else ssl.CERT_REQUIRED)
2315 context.set_default_verify_paths()
2316 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
2319 def bug_reports_message():
2320 if ytdl_is_updateable():
2321 update_cmd = 'type youtube-dl -U to update'
2323 update_cmd = 'see https://yt-dl.org/update on how to update'
2324 msg = '; please report this issue on https://yt-dl.org/bug .'
2325 msg += ' Make sure you are using the latest version; %s.' % update_cmd
2326 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
2330 class YoutubeDLError(Exception):
2331 """Base exception for YoutubeDL errors."""
2335 class ExtractorError(YoutubeDLError):
2336 """Error during info extraction."""
2338 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
2339 """ tb, if given, is the original traceback (so that it can be printed out).
2340 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
2343 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
2345 if video_id is not None:
2346 msg = video_id + ': ' + msg
2348 msg += ' (caused by %r)' % cause
2350 msg += bug_reports_message()
2351 super(ExtractorError, self).__init__(msg)
2354 self.exc_info = sys.exc_info() # preserve original exception
2356 self.video_id = video_id
2358 def format_traceback(self):
2359 if self.traceback is None:
2361 return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised when no extractor is able to handle the given URL."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        # expected=True: this is a normal condition, not a youtube-dl bug.
        super(UnsupportedError, self).__init__(message, expected=True)
2371 class RegexNotFoundError(ExtractorError):
2372 """Error when a regex didn't match"""
2376 class GeoRestrictedError(ExtractorError):
2377 """Geographic restriction Error exception.
2379 This exception may be thrown when a video is not available from your
2380 geographic location due to geographic restrictions imposed by a website.
2382 def __init__(self, msg, countries=None):
2383 super(GeoRestrictedError, self).__init__(msg, expected=True)
2385 self.countries = countries
2388 class DownloadError(YoutubeDLError):
2389 """Download Error exception.
2391 This exception may be thrown by FileDownloader objects if they are not
2392 configured to continue on errors. They will contain the appropriate
2396 def __init__(self, msg, exc_info=None):
2397 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
2398 super(DownloadError, self).__init__(msg)
2399 self.exc_info = exc_info
2402 class SameFileError(YoutubeDLError):
2403 """Same File exception.
2405 This exception will be thrown by FileDownloader objects if they detect
2406 multiple files would have to be downloaded to the same file on disk.
2411 class PostProcessingError(YoutubeDLError):
2412 """Post Processing exception.
2414 This exception may be raised by PostProcessor's .run() method to
2415 indicate an error in the postprocessing task.
2418 def __init__(self, msg):
2419 super(PostProcessingError, self).__init__(msg)
2423 class MaxDownloadsReached(YoutubeDLError):
2424 """ --max-downloads limit has been reached. """
2428 class UnavailableVideoError(YoutubeDLError):
2429 """Unavailable Format exception.
2431 This exception will be thrown when a video is requested
2432 in a format that is not available for that video.
2437 class ContentTooShortError(YoutubeDLError):
2438 """Content Too Short exception.
2440 This exception may be raised by FileDownloader objects when a file they
2441 download is too small for what the server announced first, indicating
2442 the connection was probably interrupted.
2445 def __init__(self, downloaded, expected):
2446 super(ContentTooShortError, self).__init__(
2447 'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
2450 self.downloaded = downloaded
2451 self.expected = expected
2454 class XAttrMetadataError(YoutubeDLError):
2455 def __init__(self, code=None, msg='Unknown error'):
2456 super(XAttrMetadataError, self).__init__(msg)
2460 # Parsing code and msg
2461 if (self.code in (errno.ENOSPC, errno.EDQUOT)
2462 or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
2463 self.reason = 'NO_SPACE'
2464 elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
2465 self.reason = 'VALUE_TOO_LONG'
2467 self.reason = 'NOT_SUPPORTED'
2470 class XAttrUnavailableError(YoutubeDLError):
2474 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
2475 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
2476 # expected HTTP responses to meet HTTP/1.0 or later (see also
2477 # https://github.com/ytdl-org/youtube-dl/issues/6727)
2478 if sys.version_info < (3, 0):
2479 kwargs['strict'] = True
2480 hc = http_class(*args, **compat_kwargs(kwargs))
2481 source_address = ydl_handler._params.get('source_address')
2483 if source_address is not None:
2484 # This is to workaround _create_connection() from socket where it will try all
2485 # address data from getaddrinfo() including IPv6. This filters the result from
2486 # getaddrinfo() based on the source_address value.
2487 # This is based on the cpython socket.create_connection() function.
2488 # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
2489 def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
2490 host, port = address
2492 addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
2493 af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
2494 ip_addrs = [addr for addr in addrs if addr[0] == af]
2495 if addrs and not ip_addrs:
2496 ip_version = 'v4' if af == socket.AF_INET else 'v6'
2498 "No remote IP%s addresses available for connect, can't use '%s' as source address"
2499 % (ip_version, source_address[0]))
2500 for res in ip_addrs:
2501 af, socktype, proto, canonname, sa = res
2504 sock = socket.socket(af, socktype, proto)
2505 if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
2506 sock.settimeout(timeout)
2507 sock.bind(source_address)
2509 err = None # Explicitly break reference cycle
2511 except socket.error as _:
2513 if sock is not None:
2518 raise socket.error('getaddrinfo returns an empty list')
2519 if hasattr(hc, '_create_connection'):
2520 hc._create_connection = _create_connection
2521 sa = (source_address, 0)
2522 if hasattr(hc, 'source_address'): # Python 2.7+
2523 hc.source_address = sa
2525 def _hc_connect(self, *args, **kwargs):
2526 sock = _create_connection(
2527 (self.host, self.port), self.timeout, sa)
2529 self.sock = ssl.wrap_socket(
2530 sock, self.key_file, self.cert_file,
2531 ssl_version=ssl.PROTOCOL_TLSv1)
2534 hc.connect = functools.partial(_hc_connect, hc)
def handle_youtubedl_headers(headers):
    """Strip the internal Youtubedl-no-compression marker from *headers*.

    When the marker is present, a new mapping is returned with both the
    marker and any Accept-Encoding header (case-insensitive) removed;
    otherwise the original mapping is returned unchanged.
    """
    result = headers
    if 'Youtubedl-no-compression' in result:
        # The marker itself survives this filter (its lowercase form is not
        # 'accept-encoding') and is dropped explicitly afterwards.
        result = {key: val for key, val in result.items() if key.lower() != 'accept-encoding'}
        del result['Youtubedl-no-compression']
    return result
2549 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
2550 """Handler for HTTP requests and responses.
2552 This class, when installed with an OpenerDirector, automatically adds
2553 the standard headers to every HTTP request and handles gzipped and
2554 deflated responses from web servers. If compression is to be avoided in
2555 a particular request, the original request in the program code only has
2556 to include the HTTP header "Youtubedl-no-compression", which will be
2557 removed before making the real request.
2559 Part of this code was copied from:
2561 http://techknack.net/python-urllib2-handlers/
2563 Andrew Rowls, the author of that code, agreed to release it to the
2567 def __init__(self, params, *args, **kwargs):
2568 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
2569 self._params = params
2571 def http_open(self, req):
2572 conn_class = compat_http_client.HTTPConnection
2574 socks_proxy = req.headers.get('Ytdl-socks-proxy')
2576 conn_class = make_socks_conn_class(conn_class, socks_proxy)
2577 del req.headers['Ytdl-socks-proxy']
2579 return self.do_open(functools.partial(
2580 _create_http_connection, self, conn_class, False),
2586 return zlib.decompress(data, -zlib.MAX_WBITS)
2588 return zlib.decompress(data)
2590 def http_request(self, req):
2591 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
2592 # always respected by websites, some tend to give out URLs with non percent-encoded
2593 # non-ASCII characters (see telemb.py, ard.py [#3412])
2594 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
2595 # To work around aforementioned issue we will replace request's original URL with
2596 # percent-encoded one
2597 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
2598 # the code of this workaround has been moved here from YoutubeDL.urlopen()
2599 url = req.get_full_url()
2600 url_escaped = escape_url(url)
2602 # Substitute URL if any change after escaping
2603 if url != url_escaped:
2604 req = update_Request(req, url=url_escaped)
2606 for h, v in std_headers.items():
2607 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
2608 # The dict keys are capitalized because of this bug by urllib
2609 if h.capitalize() not in req.headers:
2610 req.add_header(h, v)
2612 req.headers = handle_youtubedl_headers(req.headers)
2614 if sys.version_info < (2, 7) and '#' in req.get_full_url():
2615 # Python 2.6 is brain-dead when it comes to fragments
2616 req._Request__original = req._Request__original.partition('#')[0]
2617 req._Request__r_type = req._Request__r_type.partition('#')[0]
2621 def http_response(self, req, resp):
2624 if resp.headers.get('Content-encoding', '') == 'gzip':
2625 content = resp.read()
2626 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
2628 uncompressed = io.BytesIO(gz.read())
2629 except IOError as original_ioerror:
2630 # There may be junk add the end of the file
2631 # See http://stackoverflow.com/q/4928560/35070 for details
2632 for i in range(1, 1024):
2634 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
2635 uncompressed = io.BytesIO(gz.read())
2640 raise original_ioerror
2641 resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
2642 resp.msg = old_resp.msg
2643 del resp.headers['Content-encoding']
2645 if resp.headers.get('Content-encoding', '') == 'deflate':
2646 gz = io.BytesIO(self.deflate(resp.read()))
2647 resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
2648 resp.msg = old_resp.msg
2649 del resp.headers['Content-encoding']
2650 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
2651 # https://github.com/ytdl-org/youtube-dl/issues/6457).
2652 if 300 <= resp.code < 400:
2653 location = resp.headers.get('Location')
2655 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
2656 if sys.version_info >= (3, 0):
2657 location = location.encode('iso-8859-1').decode('utf-8')
2659 location = location.decode('utf-8')
2660 location_escaped = escape_url(location)
2661 if location != location_escaped:
2662 del resp.headers['Location']
2663 if sys.version_info < (3, 0):
2664 location_escaped = location_escaped.encode('utf-8')
2665 resp.headers['Location'] = location_escaped
2668 https_request = http_request
2669 https_response = http_response
2672 def make_socks_conn_class(base_class, socks_proxy):
2673 assert issubclass(base_class, (
2674 compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
2676 url_components = compat_urlparse.urlparse(socks_proxy)
2677 if url_components.scheme.lower() == 'socks5':
2678 socks_type = ProxyType.SOCKS5
2679 elif url_components.scheme.lower() in ('socks', 'socks4'):
2680 socks_type = ProxyType.SOCKS4
2681 elif url_components.scheme.lower() == 'socks4a':
2682 socks_type = ProxyType.SOCKS4A
2684 def unquote_if_non_empty(s):
2687 return compat_urllib_parse_unquote_plus(s)
2691 url_components.hostname, url_components.port or 1080,
2693 unquote_if_non_empty(url_components.username),
2694 unquote_if_non_empty(url_components.password),
2697 class SocksConnection(base_class):
2699 self.sock = sockssocket()
2700 self.sock.setproxy(*proxy_args)
2701 if type(self.timeout) in (int, float):
2702 self.sock.settimeout(self.timeout)
2703 self.sock.connect((self.host, self.port))
2705 if isinstance(self, compat_http_client.HTTPSConnection):
2706 if hasattr(self, '_context'): # Python > 2.6
2707 self.sock = self._context.wrap_socket(
2708 self.sock, server_hostname=self.host)
2710 self.sock = ssl.wrap_socket(self.sock)
2712 return SocksConnection
2715 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
2716 def __init__(self, params, https_conn_class=None, *args, **kwargs):
2717 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
2718 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
2719 self._params = params
2721 def https_open(self, req):
2723 conn_class = self._https_conn_class
2725 if hasattr(self, '_context'): # python > 2.6
2726 kwargs['context'] = self._context
2727 if hasattr(self, '_check_hostname'): # python 3.x
2728 kwargs['check_hostname'] = self._check_hostname
2730 socks_proxy = req.headers.get('Ytdl-socks-proxy')
2732 conn_class = make_socks_conn_class(conn_class, socks_proxy)
2733 del req.headers['Ytdl-socks-proxy']
2735 return self.do_open(functools.partial(
2736 _create_http_connection, self, conn_class, True),
class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    # Netscape cookies.txt jar with UTF-8 support on both Python 2 and 3.
    # NOTE(review): a number of original lines (docstring and _HEADER string
    # delimiters, else-branches, loop headers, try statements) are missing
    # from this excerpt; comments below annotate only the visible code.
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html

    # Prefix marking HttpOnly cookies in Netscape cookie files.
    _HTTPONLY_PREFIX = '#HttpOnly_'

    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by youtube-dl. Do not edit.

    # One parsed tab-separated line of a cookies.txt file.
    _CookieFileEntry = collections.namedtuple(
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        Save cookies to a file.

        Most of the code is taken from CPython 3.8 and slightly adapted
        to support cookie files with UTF-8 in both python 2 and 3.
        if filename is None:
            if self.filename is not None:
                filename = self.filename
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty
            if cookie.expires is None:

        with io.open(filename, 'w', encoding='utf-8') as f:
            f.write(self._HEADER)
                # Honor the caller's discard/expiry filters.
                if not ignore_discard and cookie.discard:
                if not ignore_expires and cookie.is_expired(now):
                if cookie.domain.startswith('.'):
                    initial_dot = 'TRUE'
                    initial_dot = 'FALSE'
                if cookie.expires is not None:
                    expires = compat_str(cookie.expires)
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    value = cookie.value
                '\t'.join([cookie.domain, initial_dot, cookie.path,
                           secure, expires, name, value]) + '\n')

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            # Strip the #HttpOnly_ prefix so the entry parses as a cookie.
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)

        with io.open(filename, encoding='utf-8') as f:
                    cf.write(prepare_line(line))
                except compat_cookiejar.LoadError as e:
                        'WARNING: skipping cookie file entry due to %s: %r\n'
                        % (e, line), sys.stderr)

        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """HTTPCookieProcessor with HTTPS wired to the same cookie handling.

    Historically this class percent-encoded non-ASCII Set-Cookie headers to
    keep Python 2 from choking on the next request (see
    https://github.com/ytdl-org/youtube-dl/issues/6769); that workaround is
    no longer active and http_response is now a plain pass-through.
    """

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        return compat_urllib_request.HTTPCookieProcessor.http_response(
            self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768

    # All the other 3xx codes reuse the stock 302 handler.
    http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received. If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect. Otherwise, raise HTTPError if no-one
        else should try to handle this url. Return None if you can't
        but another Handler might.
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise compat_HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # On python 2 urlh.geturl() may sometimes return redirect URL
        # as byte string instead of unicode. This workaround allows
        # to force it always return unicode.
        if sys.version_info[0] < 3:
            newurl = compat_str(newurl)

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
        # NB: don't use dict comprehension for python 2.6 compatibility
        newheaders = dict((k, v) for k, v in req.headers.items()
                          if k.lower() not in CONTENT_HEADERS)
        # NOTE(review): the closing arguments of this Request() call are
        # missing from this excerpt.
        return compat_urllib_request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
def extract_timezone(date_str):
    """Split a trailing UTC-offset designator off *date_str*.

    Returns (timezone, date_str): timezone is a datetime.timedelta (zero for
    'Z', a signless offset or no designator at all), and date_str has any
    recognized designator removed.
    """
    m = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if not m:
        return datetime.timedelta(), date_str
    date_str = date_str[:-len(m.group('tz'))]
    if not m.group('sign'):
        return datetime.timedelta(), date_str
    direction = 1 if m.group('sign') == '+' else -1
    offset = datetime.timedelta(
        hours=direction * int(m.group('hours')),
        minutes=direction * int(m.group('minutes')))
    return offset, date_str
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    # strptime cannot digest fractional seconds here; drop them.
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        dt = datetime.datetime.strptime(
            date_str, '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)) - timezone
    except ValueError:
        return None
    return calendar.timegm(dt.timetuple())
def date_formats(day_first=True):
    """Return the strptime expressions to try, ordered by day/month preference."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None

    result = None
    # Commas, AM/PM markers and timezone designators only confuse strptime.
    date_str = date_str.replace(',', ' ')
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    for fmt in date_formats(day_first):
        try:
            result = datetime.datetime.strptime(date_str, fmt).strftime('%Y%m%d')
        except ValueError:
            pass
    if result is None:
        # Fall back to RFC 2822 style dates.
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            try:
                result = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass
    if result is not None:
        return compat_str(result)
def unified_timestamp(date_str, day_first=True):
    """Parse a free-form date/time string into a UNIX timestamp (or None)."""
    if date_str is None:
        return None

    date_str = re.sub(r'[,|]', '', date_str)

    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    mobj = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if mobj:
        date_str = date_str[:-len(mobj.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    mobj = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if mobj:
        date_str = mobj.group(1)

    for fmt in date_formats(day_first):
        try:
            parsed = datetime.datetime.strptime(date_str, fmt) - timezone + datetime.timedelta(hours=pm_delta)
        except ValueError:
            continue
        return calendar.timegm(parsed.timetuple())

    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600
def determine_ext(url, default_ext='unknown_video'):
    """Guess a media file extension from *url*, falling back to *default_ext*."""
    if url is None or '.' not in url:
        return default_ext
    candidate = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    if candidate.rstrip('/') in KNOWN_EXTENSIONS:
        return candidate.rstrip('/')
    return default_ext
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Build the output filename for a subtitle track (e.g. video.en.vtt)."""
    new_ext = sub_lang + '.' + sub_format
    return replace_extension(filename, new_ext, expected_real_ext)
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is None:
        # Plain absolute date.
        return datetime.datetime.strptime(date_str, '%Y%m%d').date()
    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount
    unit = match.group('unit')
    # timedelta has no month/year units; approximate them in days.
    if unit == 'month':
        unit, amount = 'day', amount * 30
    elif unit == 'year':
        unit, amount = 'day', amount * 365
    return today + datetime.timedelta(**{unit + 's': amount})
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is None:
        # Anything that is not a compact date passes through unchanged.
        return date_str
    return '-'.join(match.groups())
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        self.start = date_from_str(start) if start is not None else datetime.datetime.min.date()
        self.end = date_from_str(end) if end is not None else datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Python 2 may return bytes here; decode with the locale encoding.
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070
    import ctypes.wintypes

    # NOTE(review): several original lines (the WIN_OUTPUT_IDS mapping, try
    # statements, early returns and the write-loop header) are missing from
    # this excerpt; only the visible code is annotated.
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
    if fileno not in WIN_OUTPUT_IDS:

    # Resolve the Win32 console handle for this file descriptor.
    GetStdHandle = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ('GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ('GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # Only real console handles (character devices that answer
        # GetConsoleMode) support WriteConsoleW.
        if handle == INVALID_HANDLE_VALUE or handle is None:
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane.
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:

        # WriteConsoleW counts UTF-16 code units: BMP characters go out in
        # batches (up to 1024), a non-BMP character as one surrogate pair.
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            assert written.value > 0
            s = s[written.value:]
def write_string(s, out=None, encoding=None):
    """Write the unicode string *s* to *out* (default sys.stderr), coping
    with Windows consoles, byte streams and text streams transparently."""
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '')
            or sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(enc, 'ignore'))
    else:
        out.write(s)

    out.flush()
def bytes_to_intlist(bs):
    """Turn a bytes (py3) or str (py2) value into a list of byte values."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    return [ord(c) for c in bs]
def intlist_to_bytes(xs):
    """Pack a list of byte values back into a bytes object."""
    if not xs:
        return b''
    return compat_struct_pack('%dB' % len(xs), *xs)
# Cross-platform file locking
# NOTE(review): this excerpt is missing several structural lines (the
# OVERLAPPED _fields_ header, list closers, the else/try/except ImportError
# scaffolding around the fcntl fallback).
if sys.platform == 'win32':
    import ctypes.wintypes

    # OVERLAPPED structure passed to LockFileEx/UnlockFileEx (offset + event).
    class OVERLAPPED(ctypes.Structure):
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,  # hFile
        ctypes.wintypes.DWORD,  # dwFlags
        ctypes.wintypes.DWORD,  # dwReserved
        ctypes.wintypes.DWORD,  # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,  # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,  # hFile
        ctypes.wintypes.DWORD,  # dwReserved
        ctypes.wintypes.DWORD,  # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,  # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Byte range covering the whole file.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # Flag 0x2 requests an exclusive (write) lock.
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

    # Some platforms, such as Jython, is missing fcntl
    def _lock_file(f, exclusive):
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        fcntl.flock(f, fcntl.LOCK_UN)

    UNSUPPORTED_MSG = 'file locking is not supported on this platform'

    def _lock_file(f, exclusive):
        raise IOError(UNSUPPORTED_MSG)

    def _unlock_file(f):
        raise IOError(UNSUPPORTED_MSG)
class locked_file(object):
    # Context manager combining io.open() with _lock_file()/_unlock_file().
    # NOTE(review): several original lines (the self.mode assignment,
    # try/except around locking and unlocking, __iter__) are missing from
    # this excerpt.
    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)

    def __enter__(self):
        # Writers ('a'/'w') take an exclusive lock, readers a shared one.
        exclusive = self.mode != 'r'
            _lock_file(self.f, exclusive)

    def __exit__(self, etype, value, traceback):
            _unlock_file(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
def get_filesystem_encoding():
    """Filesystem encoding, defaulting to utf-8 when Python reports None."""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
def shell_quote(args):
    """Quote a sequence of arguments for display as a shell command line."""
    encoding = get_filesystem_encoding()
    quoted = []
    for arg in args:
        if isinstance(arg, bytes):
            # We may get a filename encoded with 'encodeFilename'
            arg = arg.decode(encoding)
        quoted.append(compat_shlex_quote(arg))
    return ' '.join(quoted)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    # Merge with any data already smuggled into the URL.
    url, existing = unsmuggle_url(url, {})
    data.update(existing)
    payload = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + payload
def unsmuggle_url(smug_url, default=None):
    """Inverse of smuggle_url(): returns (url, data) or (url, default)."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    return url, json.loads(jsond)
def format_bytes(bytes):
    """Human-readable size: 1536 -> '1.50KiB'; None -> 'N/A'."""
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    units = ('B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB')
    value = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (value, units[exponent])
def lookup_unit_table(unit_table, s):
    """Parse '<number> <unit>' using *unit_table* ({unit: multiplier}) -> int or None."""
    units_re = '|'.join(re.escape(u) for u in unit_table)
    matched = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if not matched:
        return None
    # Accept ',' as a decimal separator too.
    amount = float(matched.group('num').replace(',', '.'))
    factor = unit_table[matched.group('unit')]
    return int(amount * factor)
def parse_filesize(s):
    # Parse a human file size like '5 MiB' into a byte count (int) or None.
    # NOTE(review): the None-guard, the _UNIT_TABLE opener/closer, most of
    # its entries and the 'def parse_count(s):' header are missing from this
    # excerpt.

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
        'megabytes': 1000 ** 2,
        'mebibytes': 1024 ** 2,
        'gigabytes': 1000 ** 3,
        'gibibytes': 1024 ** 3,
        'terabytes': 1000 ** 4,
        'tebibytes': 1024 ** 4,
        'petabytes': 1000 ** 5,
        'pebibytes': 1024 ** 5,
        'exabytes': 1000 ** 6,
        'exbibytes': 1024 ** 6,
        'zettabytes': 1000 ** 7,
        'zebibytes': 1024 ** 7,
        'yottabytes': 1000 ** 8,
        'yobibytes': 1024 ** 8,

    return lookup_unit_table(_UNIT_TABLE, s)


    # Tail of parse_count(s): plain digit strings go through str_to_int(),
    # suffixed counts through a separate _UNIT_TABLE (also missing here).
    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    return lookup_unit_table(_UNIT_TABLE, s)
def parse_resolution(s):
    """Extract {'width': ..., 'height': ...} from strings like '1920x1080',
    '720p' or '4k'; returns {} when nothing is recognized."""
    if s is None:
        return {}

    mobj = re.search(r'\b(?P<w>\d+)\s*[xX×]\s*(?P<h>\d+)\b', s)
    if mobj:
        return {
            'width': int(mobj.group('w')),
            'height': int(mobj.group('h')),
        }

    mobj = re.search(r'\b(\d+)[pPiI]\b', s)
    if mobj:
        return {'height': int(mobj.group(1))}

    mobj = re.search(r'\b([48])[kK]\b', s)
    if mobj:
        # 4k -> 2160, 8k -> 4320.
        return {'height': int(mobj.group(1)) * 540}

    return {}
def parse_bitrate(s):
    """Extract an integer kbps value from strings like '1000 kbps'."""
    if not isinstance(s, compat_str):
        return
    mobj = re.search(r'\b(\d+)\s*kbps', s)
    if mobj:
        return int(mobj.group(1))
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """

    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])

    try:
        return month_names.index(name) + 1
    except ValueError:
        return None
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations """

    abbreviations = [s[:3] for s in ENGLISH_MONTH_NAMES]
    try:
        return abbreviations.index(abbrev) + 1
    except ValueError:
        return None
def fix_xml_ampersands(xml_str):
    """Escape stray '&' characters as '&amp;', leaving existing entities alone."""
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)
def setproctitle(title):
    # Set the process name (visible in ps/top) via prctl(PR_SET_NAME).
    # NOTE(review): the try/except scaffolding around LoadLibrary and prctl,
    # and several return statements, are missing from this excerpt.
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):

        libc = ctypes.cdll.LoadLibrary('libc.so.6')
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.

    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
        # 15 == PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
def remove_start(s, start):
    """Strip *start* from the beginning of *s* when present (None passes through)."""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
def remove_end(s, end):
    """Strip *end* from the end of *s* when present (None passes through)."""
    if s is not None and s.endswith(end):
        return s[:-len(end)]
    return s
def remove_quotes(s):
    """Strip one matching pair of surrounding single or double quotes."""
    if s is None or len(s) < 2:
        return s
    for quote in ('"', "'", ):
        if s[0] == quote and s[-1] == quote:
            return s[1:-1]
    return s
def url_basename(url):
    """Return the last path component of *url*, e.g. 'bar.mp4' for
    'http://example.com/foo/bar.mp4?x=1'."""
    parsed_path = compat_urlparse.urlparse(url).path
    return parsed_path.strip('/').split('/')[-1]
    # NOTE(review): tail of base_url(url); the 'def base_url(url):' header is
    # missing from this excerpt. Returns everything up to and including the
    # last '/' before any query/fragment; raises AttributeError (NoneType has
    # no .group) when the URL does not match.
    return re.match(r'https?://[^?#&]+/', url).group()
def urljoin(base, path):
    """Join *base* and *path* like compat_urlparse.urljoin, but return None
    for unusable inputs and pass already-absolute paths straight through."""
    if isinstance(path, bytes):
        path = path.decode('utf-8')
    if not isinstance(path, compat_str) or not path:
        return None
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path
    if isinstance(base, bytes):
        base = base.decode('utf-8')
    if not isinstance(base, compat_str) or not re.match(
            r'^(?:https?:)?//', base):
        return None
    return compat_urlparse.urljoin(base, path)
class HEADRequest(compat_urllib_request.Request):
    """Request subclass that always issues a HEAD."""
    def get_method(self):
        return 'HEAD'
class PUTRequest(compat_urllib_request.Request):
    """Request subclass that always issues a PUT."""
    def get_method(self):
        return 'PUT'
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Coerce *v* (optionally via attribute *get_attr*) to an int scaled by
    invscale/scale; return *default* for None/'' or conversion failure."""
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError):
        return default
def str_or_none(v, default=None):
    """Stringify *v*, or return *default* when it is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if isinstance(int_str, compat_integer_types):
        return int_str
    if isinstance(int_str, compat_str):
        # Drop thousands separators and a leading '+'.
        int_str = re.sub(r'[,\.\+]', '', int_str)
    return int_or_none(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Coerce *v* to a float scaled by invscale/scale; *default* on failure."""
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        return default
def bool_or_none(v, default=None):
    """Return *v* only when it is an actual bool, otherwise *default*."""
    if isinstance(v, bool):
        return v
    return default
def strip_or_none(v, default=None):
    """Return v.strip() for string input, *default* for anything else."""
    if isinstance(v, compat_str):
        return v.strip()
    return default
def url_or_none(url):
    """Return the stripped *url* when it looks like a supported URL scheme
    (http(s), rtmp family, rtsp, mms, ftp(s) or scheme-relative), else None."""
    if not url or not isinstance(url, compat_str):
        return None
    url = url.strip()
    return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
def parse_duration(s):
    # Parse duration strings ('1:23:45', '3 min', '5.5 hours', '1d 2h') into
    # seconds or None.
    # NOTE(review): several original lines (early return, s.strip(), the
    # verbose-regex scaffolding, 'duration = 0' and the per-field guards)
    # are missing from this excerpt.
    if not isinstance(s, compat_basestring):

    days, hours, mins, secs, ms = [None] * 5
    # Colon-separated [[[DD:]HH:]MM:]SS[.ms] form.
    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
    if m:
        days, hours, mins, secs, ms = m.groups()
        [0-9]+\s*y(?:ears?)?\s*
        [0-9]+\s*m(?:onths?)?\s*
        [0-9]+\s*w(?:eeks?)?\s*
        (?P<days>[0-9]+)\s*d(?:ays?)?\s*
        (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
        (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
        (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
        days, hours, mins, secs, ms = m.groups()
    # Decimal '1.5 hours' / '3 mins' form.
    m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
    if m:
        hours, mins = m.groups()

    duration += float(secs)
    duration += float(mins) * 60
    duration += float(hours) * 60 * 60
    duration += float(days) * 24 * 60 * 60
    duration += float(ms)
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert *ext* before the real extension (a.mp4 -> a.temp.mp4); when the
    real extension differs from *expected_real_ext*, append instead."""
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
def replace_extension(filename, ext, expected_real_ext=None):
    """Swap the extension of *filename* for *ext*; keep the old extension in
    the stem when it differs from *expected_real_ext*."""
    name, real_ext = os.path.splitext(filename)
    stem = name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename
    return '{0}.{1}'.format(stem, ext)
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if youtube-dl is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        out, _ = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Pull a version string out of `--version` style *output*."""
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    if m:
        return m.group(1)
    return unrecognized
class PagedList(object):
    # Abstract base for lazily-evaluated paged result lists.
    # NOTE(review): the 'def __len__(self):' header (and the original
    # getslice() stub, if any) are missing from this excerpt; the visible
    # body delegates to getslice(), which subclasses implement.
        # This is only useful for tests
        return len(self.getslice())
class OnDemandPagedList(PagedList):
    # Paged list that fetches each page on demand via pagefunc(pagenum).
    # NOTE(review): several original lines (cache initialization, the result
    # accumulator, break/continue statements, the startv/endv assignments)
    # are missing from this excerpt.
    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache

    def getslice(self, start=0, end=None):
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:

            # Serve the page from the cache when possible.
            page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
                self._cache[pagenum] = page_results

                start % self._pagesize
                if firstid <= start < nextfirstid

                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
class InAdvancePagedList(PagedList):
    # Paged list whose total page count is known up front.
    # NOTE(review): several original lines (result accumulator, the
    # 'end_page = (' opener, skip_elems reset, else-branch and the final
    # return) are missing from this excerpt.
    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        start_page = start // self._pagesize
            self._pagecount if end is None else (end // self._pagesize + 1))
        # Elements to drop from the first page / total elements still wanted.
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
                page = page[skip_elems:]
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                    page = page[:only_more]
def uppercase_escape(s):
    """Decode literal \\UXXXXXXXX escape sequences embedded in *s*."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: decode(m.group(0))[0],
        s)
def lowercase_escape(s):
    """Decode literal \\uXXXX escape sequences embedded in *s*."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: decode(m.group(0))[0],
        s)
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        # Python 2's quote() wants a byte string.
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parsed = compat_urllib_parse_urlparse(url)
    return parsed._replace(
        # IDNA-encode the host, percent-escape every other component.
        netloc=parsed.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(parsed.path),
        params=escape_rfc3986(parsed.params),
        query=escape_rfc3986(parsed.query),
        fragment=escape_rfc3986(parsed.fragment)
    ).geturl()
def read_batch_urls(batch_fd):
    """Read URLs from a batch-file object (closed on exit), skipping BOMs,
    blank lines and '#'/';'/']' comment lines."""
    def _clean(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(_clean, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
def update_url_query(url, query):
    """Return *url* with the items of *query* merged into its query string."""
    if not query:
        return url
    parsed_url = compat_urlparse.urlparse(url)
    qs = compat_parse_qs(parsed_url.query)
    qs.update(query)
    return compat_urlparse.urlunparse(parsed_url._replace(
        query=compat_urllib_parse_urlencode(qs, True)))
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Clone *req* with optionally replaced url/data and merged headers/query,
    preserving the HTTP method (HEAD/PUT via the dedicated Request classes)."""
    req_headers = req.headers.copy()
    req_headers.update(headers)
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    # Pick a Request subclass that reproduces the original method.
    req_type = {
        'HEAD': HEADRequest,
        'PUT': PUTRequest,
    }.get(req.get_method(), compat_urllib_request.Request)
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
3980 def _multipart_encode_impl(data, boundary):
3981 content_type = 'multipart/form-data; boundary=%s' % boundary
3984 for k, v in data.items():
3985 out += b'--' + boundary.encode('ascii') + b'\r\n'
3986 if isinstance(k, compat_str):
3987 k = k.encode('utf-8')
3988 if isinstance(v, compat_str):
3989 v = v.encode('utf-8')
3990 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3991 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3992 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
3993 if boundary.encode('ascii') in content:
3994 raise ValueError('Boundary overlaps with data')
3997 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3999 return out, content_type
def multipart_encode(data, boundary=None):
    """
    Encode a dict to RFC 7578-compliant form-data.

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Reference: https://tools.ietf.org/html/rfc7578
    """
    has_specified_boundary = boundary is not None

    # Retry with fresh random boundaries until one doesn't collide with the
    # payload; a user-supplied boundary that collides is an error instead.
    while True:
        if boundary is None:
            boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            out, content_type = _multipart_encode_impl(data, boundary)
            break
        except ValueError:
            if has_specified_boundary:
                raise
            boundary = None

    return out, content_type
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Return d[key] for the first key in *key_or_keys* with a usable value.

    A value is unusable when it is None or, with skip_false_values, falsy.
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key not in d or d[key] is None or skip_false_values and not d[key]:
            continue
        return d[key]
    return default
def try_get(src, getter, expected_type=None):
    """Apply each getter to *src*; return the first result that neither
    raises (AttributeError/KeyError/TypeError/IndexError) nor fails the
    optional *expected_type* check. Returns None otherwise."""
    getters = getter if isinstance(getter, (list, tuple)) else [getter]
    for get in getters:
        try:
            v = get(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            continue
        if expected_type is None or isinstance(v, expected_type):
            return v
def merge_dicts(*dicts):
    """Merge dicts left to right: the first non-None value for a key wins,
    except that a non-empty string may replace an earlier empty string."""
    merged = {}
    for source in dicts:
        for k, v in source.items():
            if v is None:
                continue
            if (k not in merged
                    or (isinstance(v, compat_str) and v
                        and isinstance(merged[k], compat_str)
                        and not merged[k])):
                merged[k] = v
    return merged
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Ensure *string* is a compat_str, decoding byte input with *encoding*."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
# US TV parental guideline ratings mapped to minimum viewer age.
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}


def parse_age_limit(s):
    """Normalize an age limit (int, '18', '18+', MPAA or TV rating) to an int.

    Returns None when the value cannot be interpreted or is out of the
    0-21 range for integer input.
    """
    if type(s) == int:
        return s if 0 <= s <= 21 else None
    if not isinstance(s, compat_basestring):
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    if s in US_RATINGS:
        return US_RATINGS[s]
    # accept 'TV-MA', 'TV_MA' and 'TVMA' spellings
    m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
    if m:
        return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
    return None
def strip_jsonp(code):
    """Strip a JSONP wrapper (callback name, parentheses, trailing ';'
    and line comments), returning the bare JSON payload."""
    return re.sub(
        r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$''',
        r'\g<callback_data>', code)
def js_to_json(code):
    # Translate a JavaScript value/object literal into valid JSON text.
    # NOTE(review): several lines of this function are elided in this
    # chunk of the file; the comments below annotate only visible code.
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
    # optional whitespace/comment run used between tokens
    SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
    # (pattern, numeric base) pairs matching hex and octal integer literals,
    # optionally used as object keys (trailing ':')
    (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
    (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
    # keywords that are already valid JSON pass through
    if v in ('true', 'false', 'null'):
    # comments, lone '!' and trailing commas produce no JSON output
    elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
    if v[0] in ("'", '"'):
        # re-escape the quoted string body for JSON double quoting
        v = re.sub(r'(?s)\\.|"', lambda m: {
        }.get(m.group(0), m.group(0)), v[1:-1])
    # rewrite hex/octal literals as decimal (JSON supports neither)
    for regex, base in INTEGER_TABLE:
        im = re.match(regex, v)
        i = int(im.group(1), base)
        # a trailing ':' means the literal was used as an object key
        return '"%d":' % i if v.endswith(':') else '%d' % i
    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # position in the list is the quality rank; unknown ids rank lowest
        try:
            return quality_ids.index(qid)
        except ValueError:
            return -1
    return q
# Default output filename template: "<title>-<id>.<ext>".
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        # truncated text plus ellipses fits exactly in *length* chars
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    pieces = re.split(r'[-.]', v)
    return tuple(map(int, pieces))
def is_outdated_version(version, limit, assume_new=True):
    """Return True when *version* is older than *limit*.

    Unparseable or missing versions are treated per *assume_new*:
    assumed up-to-date (False) by default.
    """
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # updatable when running from a zip bundle or a frozen executable
    loader = globals().get('__loader__')
    return isinstance(loader, zipimporter) or hasattr(sys, 'frozen')
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [compat_shlex_quote(a) for a in args]
    return ' '.join(quoted)
def error_to_compat_str(err):
    """Return a text representation of an exception, safe on Python 2."""
    err_str = str(err)
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    if sys.version_info[0] < 3:
        err_str = err_str.decode(preferredencoding())
    return err_str
def mimetype2ext(mt):
    # Map a MIME type string to a file extension.
    # NOTE(review): several lines, including parts of the lookup tables,
    # are elided in this chunk; comments annotate only visible code.
    # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
    # it's the most popular one
    'audio/mpeg': 'mp3',
    # keep only the subtype, dropping any ';'-separated parameters
    _, _, res = mt.rpartition('/')
    res = res.split(';')[0].strip().lower()
    'smptett+xml': 'tt',
    'x-mp4-fragmented': 'mp4',
    'x-ms-sami': 'sami',
    'x-mpegurl': 'm3u8',
    'vnd.apple.mpegurl': 'm3u8',
    'vnd.ms-sstr+xml': 'ism',
def parse_codecs(codecs_str):
    # http://tools.ietf.org/html/rfc6381
    # Split an RFC 6381 codecs string into {'vcodec': ..., 'acodec': ...}.
    # NOTE(review): a few lines of this function are elided in this chunk.
    # split on commas, dropping empty entries and surrounding whitespace
    split_codecs = list(filter(None, map(
        lambda str: str.strip(), codecs_str.strip().strip(',').split(','))))
    vcodec, acodec = None, None
    for full_codec in split_codecs:
        # the leading fourcc identifies the codec family
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1', 'av01', 'theora'):
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
        write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
    if not vcodec and not acodec:
        if len(split_codecs) == 2:
            # exactly two unknown codecs: assume video+audio ordering
            'vcodec': split_codecs[0],
            'acodec': split_codecs[1],
    # 'none' marks an explicitly absent stream
    'vcodec': vcodec or 'none',
    'acodec': acodec or 'none',
def urlhandle_detect_ext(url_handle):
    # Guess a file extension for a response, preferring the filename from
    # Content-Disposition over the Content-Type mapping.
    # NOTE(review): some guard lines are elided in this chunk.
    getheader = url_handle.headers.get
    cd = getheader('Content-Disposition')
    m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
    e = determine_ext(m.group('filename'), default_ext=None)
    # fall back to mapping the MIME type
    return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Build a base64 'data:' URI for *data* with the given MIME type."""
    encoded = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, encoded)
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None:  # No limit set
        return False
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # Unicode byte-order marks, longest first so UTF-32 wins over UTF-16.
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
            break
    else:
        s = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', s)
def determine_protocol(info_dict):
    """Work out the download protocol for a format dict.

    Uses the explicit 'protocol' field when present, then the URL scheme
    prefix, then the extension (m3u8/f4m), finally the raw URL scheme.
    """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    if url.startswith('rtmp'):
        return 'rtmp'
    elif url.startswith('mms'):
        return 'mms'
    elif url.startswith('rtsp'):
        return 'rtsp'

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8'
    elif ext == 'f4m':
        return 'f4m'

    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # widest cell per column determines that column's padding
    widths = [max(len(compat_str(cell)) for cell in column) for column in zip(*rows)]
    # left-align all columns but the last, one extra space of padding each
    fmt = ' '.join('%-' + compat_str(w + 1) + 's' for w in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in rows)
def _match_one(filter_part, dct):
    # Evaluate one '&'-separated clause of a --match-filter expression
    # against dict *dct*.
    # NOTE(review): several lines (among them the operator table entries
    # and some raise/conversion scaffolding) are elided in this chunk;
    # comments below annotate only the visible statements.
    COMPARISON_OPERATORS = {
    operator_rex = re.compile(r'''(?x)\s*
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
        (?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)|
        (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    op = COMPARISON_OPERATORS[m.group('op')]
    actual_value = dct.get(m.group('key'))
    # string comparison when either side is explicitly a string, or when
    # the stored field is a string even though the filter value is numeric
    if (m.group('quotedstrval') is not None
            or m.group('strval') is not None
            # If the original field is a string and matching comparisonvalue is
            # a number we should respect the origin of the original field
            # and process comparison value as a string (see
            # https://github.com/ytdl-org/youtube-dl/issues/11082).
            or actual_value is not None and m.group('intval') is not None
            and isinstance(actual_value, compat_str)):
        if m.group('op') not in ('=', '!='):
            'Operator %s does not support string values!' % m.group('op'))
        comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
        quote = m.group('quote')
        if quote is not None:
            # un-escape the quote character inside quoted values
            comparison_value = comparison_value.replace(r'\%s' % quote, quote)
        comparison_value = int(m.group('intval'))
        # not a plain integer: retry as a human-readable filesize ('1.2MiB')
        comparison_value = parse_filesize(m.group('intval'))
        if comparison_value is None:
            comparison_value = parse_filesize(m.group('intval') + 'B')
        if comparison_value is None:
            'Invalid integer value %r in filter part %r' % (
                m.group('intval'), filter_part))
    if actual_value is None:
        # missing field passes only for '?'-suffixed (none-inclusive) ops
        return m.group('none_inclusive')
    return op(actual_value, comparison_value)
    # unary operators: '' tests presence/truth, '!' tests absence/falsehood
    '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
    '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    op = UNARY_OPERATORS[m.group('op')]
    actual_value = dct.get(m.group('key'))
    return op(actual_value)
    raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # every '&'-separated clause must match
    return all(
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
def match_filter_func(filter_str):
    """Build a --match-filter callback: returns None when the video passes,
    or a human-readable skip message otherwise."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        else:
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
def parse_dfxp_time_expr(time_expr):
    """Parse a TTML/DFXP time expression ('12.3', '12.3s' or
    'HH:MM:SS[.mmm]'/'HH:MM:SS:fff') into seconds; None when unparseable."""
    if not time_expr:
        return

    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if mobj:
        # a ':' before the fraction is treated like a decimal point
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a duration in seconds as an SRT timecode (HH:MM:SS,mmm)."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    # %d truncates the float components toward zero
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
def dfxp2srt(dfxp_data):
    """
    Convert DFXP/TTML subtitles to SRT.

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    """
    # NOTE(review): a number of lines of this function are elided in this
    # chunk of the file; comments below annotate only the visible code.
    # map each modern TTML namespace to its legacy aliases
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
    # tts:* styling attributes translated into <font>/<b>/<i>/<u> tags
    SUPPORTED_STYLING = [
    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',

    # SAX-style parser target rendering one <p> element into SRT markup
    class TTMLPElementParser(object):
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
            unclosed_elements = []
            # effective style: defaults, then the element's style id,
            # then inline tts:* attributes
            element_style_id = attrib.get('style')
            style.update(default_style)
            if element_style_id:
                style.update(styles.get(element_style_id, {}))
            for prop in SUPPORTED_STYLING:
                prop_val = attrib.get(_x('tts:' + prop))
                style[prop] = prop_val
            # emit only styles that differ from the inherited ones
            for k, v in sorted(style.items()):
                if self._applied_styles and self._applied_styles[-1].get(k) == v:
                font += ' color="%s"' % v
                elif k == 'fontSize':
                    font += ' size="%s"' % v
                elif k == 'fontFamily':
                    font += ' face="%s"' % v
                elif k == 'fontWeight' and v == 'bold':
                    unclosed_elements.append('b')
                elif k == 'fontStyle' and v == 'italic':
                    unclosed_elements.append('i')
                elif k == 'textDecoration' and v == 'underline':
                    unclosed_elements.append('u')
            self._out += '<font' + font + '>'
            unclosed_elements.append('font')
            if self._applied_styles:
                applied_style.update(self._applied_styles[-1])
            applied_style.update(style)
            self._applied_styles.append(applied_style)
            self._unclosed_elements.append(unclosed_elements)

            # NOTE(review): the 'def end(self, tag):' line is elided here;
            # the following lines close the elements opened in start().
            if tag not in (_x('ttml:br'), 'br'):
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            # NOTE(review): this return line belongs to close(); the
            # intervening lines are elided in this chunk.
            return self._out.strip()

    # serialise one DOM node and replay it through TTMLPElementParser
    def parse_node(node):
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    # normalise legacy namespaces to the modern ones before parsing
    for k, v in LEGACY_NAMESPACES:
        dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
    raise ValueError('Invalid dfxp/TTML subtitle')

    # collect style definitions, resolving single-level inheritance
    for style in dfxp.findall(_x('.//ttml:style')):
        style_id = style.get('id') or style.get(_x('xml:id'))
        parent_style_id = style.get('style')
        if parent_style_id not in styles:
        styles[style_id] = styles[parent_style_id].copy()
        for prop in SUPPORTED_STYLING:
            prop_val = style.get(_x('tts:' + prop))
            styles.setdefault(style_id, {})[prop] = prop_val

    # body/div level styles become the default style
    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        style = styles.get(ele.get('style'))
        default_style.update(style)

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
        end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
def cli_option(params, command_option, param):
    """Return [command_option, value] when params[param] is set, else []."""
    param = params.get(param)
    if param:
        param = compat_str(param)
    return [command_option, param] if param is not None else []
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render a boolean param as CLI arguments.

    Returns [] when unset; ['opt', 'true'/'false'] normally, or a single
    'opt<separator>value' string when *separator* is given.
    """
    param = params.get(param)
    if param is None:
        return []
    assert isinstance(param, bool)
    if separator:
        return [command_option + separator + (true_value if param else false_value)]
    return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit *command_option* alone when the param equals the expected value."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
def cli_configuration_args(params, param, default=[]):
    """Return the list of extra CLI args stored under *param*, or *default*.

    The mutable default is safe here: the list is returned as-is and
    never mutated by this function.
    """
    ex_args = params.get(param)
    if ex_args is None:
        return default
    assert isinstance(ex_args, list)
    return ex_args
class ISO639Utils(object):
    # Bidirectional mapping between ISO 639-1 (2-letter) and
    # ISO 639-2/T (3-letter) language codes.
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # NOTE(review): the bulk of _lang_map (and the @classmethod
    # decorators) is elided in this chunk of the file.
    'iw': 'heb',  # Replaced by he in 1989 revision
    'in': 'ind',  # Replaced by id in 1989 revision
    'ji': 'yid',  # Replaced by yi in 1989 revision

    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        return cls._lang_map.get(code[:2])

    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # reverse lookup over the forward map
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
class ISO3166Utils(object):
    # Maps ISO 3166-1 alpha-2 country codes to full country names.
    # From http://data.okfn.org/data/core/country-list
    # NOTE(review): the '_country_map = {' opener, many entries and the
    # @classmethod decorator are elided in this chunk of the file.
    'AF': 'Afghanistan',
    'AX': 'Åland Islands',
    'AS': 'American Samoa',
    'AG': 'Antigua and Barbuda',
    'BO': 'Bolivia, Plurinational State of',
    'BQ': 'Bonaire, Sint Eustatius and Saba',
    'BA': 'Bosnia and Herzegovina',
    'BV': 'Bouvet Island',
    'IO': 'British Indian Ocean Territory',
    'BN': 'Brunei Darussalam',
    'BF': 'Burkina Faso',
    'KY': 'Cayman Islands',
    'CF': 'Central African Republic',
    'CX': 'Christmas Island',
    'CC': 'Cocos (Keeling) Islands',
    'CD': 'Congo, the Democratic Republic of the',
    'CK': 'Cook Islands',
    'CI': 'Côte d\'Ivoire',
    'CZ': 'Czech Republic',
    'DO': 'Dominican Republic',
    'SV': 'El Salvador',
    'GQ': 'Equatorial Guinea',
    'FK': 'Falkland Islands (Malvinas)',
    'FO': 'Faroe Islands',
    'GF': 'French Guiana',
    'PF': 'French Polynesia',
    'TF': 'French Southern Territories',
    'GW': 'Guinea-Bissau',
    'HM': 'Heard Island and McDonald Islands',
    'VA': 'Holy See (Vatican City State)',
    'IR': 'Iran, Islamic Republic of',
    'IM': 'Isle of Man',
    'KP': 'Korea, Democratic People\'s Republic of',
    'KR': 'Korea, Republic of',
    'LA': 'Lao People\'s Democratic Republic',
    'LI': 'Liechtenstein',
    'MK': 'Macedonia, the Former Yugoslav Republic of',
    'MH': 'Marshall Islands',
    'FM': 'Micronesia, Federated States of',
    'MD': 'Moldova, Republic of',
    'NL': 'Netherlands',
    'NC': 'New Caledonia',
    'NZ': 'New Zealand',
    'NF': 'Norfolk Island',
    'MP': 'Northern Mariana Islands',
    'PS': 'Palestine, State of',
    'PG': 'Papua New Guinea',
    'PH': 'Philippines',
    'PR': 'Puerto Rico',
    'RU': 'Russian Federation',
    'BL': 'Saint Barthélemy',
    'SH': 'Saint Helena, Ascension and Tristan da Cunha',
    'KN': 'Saint Kitts and Nevis',
    'LC': 'Saint Lucia',
    'MF': 'Saint Martin (French part)',
    'PM': 'Saint Pierre and Miquelon',
    'VC': 'Saint Vincent and the Grenadines',
    'ST': 'Sao Tome and Principe',
    'SA': 'Saudi Arabia',
    'SL': 'Sierra Leone',
    'SX': 'Sint Maarten (Dutch part)',
    'SB': 'Solomon Islands',
    'ZA': 'South Africa',
    'GS': 'South Georgia and the South Sandwich Islands',
    'SS': 'South Sudan',
    'SJ': 'Svalbard and Jan Mayen',
    'CH': 'Switzerland',
    'SY': 'Syrian Arab Republic',
    'TW': 'Taiwan, Province of China',
    'TZ': 'Tanzania, United Republic of',
    'TL': 'Timor-Leste',
    'TT': 'Trinidad and Tobago',
    'TM': 'Turkmenistan',
    'TC': 'Turks and Caicos Islands',
    'AE': 'United Arab Emirates',
    'GB': 'United Kingdom',
    'US': 'United States',
    'UM': 'United States Minor Outlying Islands',
    'VE': 'Venezuela, Bolivarian Republic of',
    'VG': 'Virgin Islands, British',
    'VI': 'Virgin Islands, U.S.',
    'WF': 'Wallis and Futuna',
    'EH': 'Western Sahara',

    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        # codes are stored upper-case; normalise the input
        return cls._country_map.get(code.upper())
class GeoUtils(object):
    # Major IPv4 address blocks per country
    # Used to fabricate a plausible source address for geo-bypass.
    # NOTE(review): the '_country_ip_map = {' opener, a few entries, the
    # @classmethod decorator and some guard lines of random_ipv4 are
    # elided in this chunk of the file.
    'AD': '46.172.224.0/19',
    'AE': '94.200.0.0/13',
    'AF': '149.54.0.0/17',
    'AG': '209.59.64.0/18',
    'AI': '204.14.248.0/21',
    'AL': '46.99.0.0/16',
    'AM': '46.70.0.0/15',
    'AO': '105.168.0.0/13',
    'AP': '182.50.184.0/21',
    'AQ': '23.154.160.0/24',
    'AR': '181.0.0.0/12',
    'AS': '202.70.112.0/20',
    'AT': '77.116.0.0/14',
    'AU': '1.128.0.0/11',
    'AW': '181.41.0.0/18',
    'AX': '185.217.4.0/22',
    'AZ': '5.197.0.0/16',
    'BA': '31.176.128.0/17',
    'BB': '65.48.128.0/17',
    'BD': '114.130.0.0/16',
    'BF': '102.178.0.0/15',
    'BG': '95.42.0.0/15',
    'BH': '37.131.0.0/17',
    'BI': '154.117.192.0/18',
    'BJ': '137.255.0.0/16',
    'BL': '185.212.72.0/23',
    'BM': '196.12.64.0/18',
    'BN': '156.31.0.0/16',
    'BO': '161.56.0.0/16',
    'BQ': '161.0.80.0/20',
    'BR': '191.128.0.0/12',
    'BS': '24.51.64.0/18',
    'BT': '119.2.96.0/19',
    'BW': '168.167.0.0/16',
    'BY': '178.120.0.0/13',
    'BZ': '179.42.192.0/18',
    'CA': '99.224.0.0/11',
    'CD': '41.243.0.0/16',
    'CF': '197.242.176.0/21',
    'CG': '160.113.0.0/16',
    'CH': '85.0.0.0/13',
    'CI': '102.136.0.0/14',
    'CK': '202.65.32.0/19',
    'CL': '152.172.0.0/14',
    'CM': '102.244.0.0/14',
    'CN': '36.128.0.0/10',
    'CO': '181.240.0.0/12',
    'CR': '201.192.0.0/12',
    'CU': '152.206.0.0/15',
    'CV': '165.90.96.0/19',
    'CW': '190.88.128.0/17',
    'CY': '31.153.0.0/16',
    'CZ': '88.100.0.0/14',
    'DJ': '197.241.0.0/17',
    'DK': '87.48.0.0/12',
    'DM': '192.243.48.0/20',
    'DO': '152.166.0.0/15',
    'DZ': '41.96.0.0/12',
    'EC': '186.68.0.0/15',
    'EE': '90.190.0.0/15',
    'EG': '156.160.0.0/11',
    'ER': '196.200.96.0/20',
    'ES': '88.0.0.0/11',
    'ET': '196.188.0.0/14',
    'EU': '2.16.0.0/13',
    'FI': '91.152.0.0/13',
    'FJ': '144.120.0.0/16',
    'FK': '80.73.208.0/21',
    'FM': '119.252.112.0/20',
    'FO': '88.85.32.0/19',
    'GA': '41.158.0.0/15',
    'GD': '74.122.88.0/21',
    'GE': '31.146.0.0/16',
    'GF': '161.22.64.0/18',
    'GG': '62.68.160.0/19',
    'GH': '154.160.0.0/12',
    'GI': '95.164.0.0/16',
    'GL': '88.83.0.0/19',
    'GM': '160.182.0.0/15',
    'GN': '197.149.192.0/18',
    'GP': '104.250.0.0/19',
    'GQ': '105.235.224.0/20',
    'GR': '94.64.0.0/13',
    'GT': '168.234.0.0/16',
    'GU': '168.123.0.0/16',
    'GW': '197.214.80.0/20',
    'GY': '181.41.64.0/18',
    'HK': '113.252.0.0/14',
    'HN': '181.210.0.0/16',
    'HR': '93.136.0.0/13',
    'HT': '148.102.128.0/17',
    'HU': '84.0.0.0/14',
    'ID': '39.192.0.0/10',
    'IE': '87.32.0.0/12',
    'IL': '79.176.0.0/13',
    'IM': '5.62.80.0/20',
    'IN': '117.192.0.0/10',
    'IO': '203.83.48.0/21',
    'IQ': '37.236.0.0/14',
    'IR': '2.176.0.0/12',
    'IS': '82.221.0.0/16',
    'IT': '79.0.0.0/10',
    'JE': '87.244.64.0/18',
    'JM': '72.27.0.0/17',
    'JO': '176.29.0.0/16',
    'JP': '133.0.0.0/8',
    'KE': '105.48.0.0/12',
    'KG': '158.181.128.0/17',
    'KH': '36.37.128.0/17',
    'KI': '103.25.140.0/22',
    'KM': '197.255.224.0/20',
    'KN': '198.167.192.0/19',
    'KP': '175.45.176.0/22',
    'KR': '175.192.0.0/10',
    'KW': '37.36.0.0/14',
    'KY': '64.96.0.0/15',
    'KZ': '2.72.0.0/13',
    'LA': '115.84.64.0/18',
    'LB': '178.135.0.0/16',
    'LC': '24.92.144.0/20',
    'LI': '82.117.0.0/19',
    'LK': '112.134.0.0/15',
    'LR': '102.183.0.0/16',
    'LS': '129.232.0.0/17',
    'LT': '78.56.0.0/13',
    'LU': '188.42.0.0/16',
    'LV': '46.109.0.0/16',
    'LY': '41.252.0.0/14',
    'MA': '105.128.0.0/11',
    'MC': '88.209.64.0/18',
    'MD': '37.246.0.0/16',
    'ME': '178.175.0.0/17',
    'MF': '74.112.232.0/21',
    'MG': '154.126.0.0/17',
    'MH': '117.103.88.0/21',
    'MK': '77.28.0.0/15',
    'ML': '154.118.128.0/18',
    'MM': '37.111.0.0/17',
    'MN': '49.0.128.0/17',
    'MO': '60.246.0.0/16',
    'MP': '202.88.64.0/20',
    'MQ': '109.203.224.0/19',
    'MR': '41.188.64.0/18',
    'MS': '208.90.112.0/22',
    'MT': '46.11.0.0/16',
    'MU': '105.16.0.0/12',
    'MV': '27.114.128.0/18',
    'MW': '102.70.0.0/15',
    'MX': '187.192.0.0/11',
    'MY': '175.136.0.0/13',
    'MZ': '197.218.0.0/15',
    'NA': '41.182.0.0/16',
    'NC': '101.101.0.0/18',
    'NE': '197.214.0.0/18',
    'NF': '203.17.240.0/22',
    'NG': '105.112.0.0/12',
    'NI': '186.76.0.0/15',
    'NL': '145.96.0.0/11',
    'NO': '84.208.0.0/13',
    'NP': '36.252.0.0/15',
    'NR': '203.98.224.0/19',
    'NU': '49.156.48.0/22',
    'NZ': '49.224.0.0/14',
    'OM': '5.36.0.0/15',
    'PA': '186.72.0.0/15',
    'PE': '186.160.0.0/14',
    'PF': '123.50.64.0/18',
    'PG': '124.240.192.0/19',
    'PH': '49.144.0.0/13',
    'PK': '39.32.0.0/11',
    'PL': '83.0.0.0/11',
    'PM': '70.36.0.0/20',
    'PR': '66.50.0.0/16',
    'PS': '188.161.0.0/16',
    'PT': '85.240.0.0/13',
    'PW': '202.124.224.0/20',
    'PY': '181.120.0.0/14',
    'QA': '37.210.0.0/15',
    'RE': '102.35.0.0/16',
    'RO': '79.112.0.0/13',
    'RS': '93.86.0.0/15',
    'RU': '5.136.0.0/13',
    'RW': '41.186.0.0/16',
    'SA': '188.48.0.0/13',
    'SB': '202.1.160.0/19',
    'SC': '154.192.0.0/11',
    'SD': '102.120.0.0/13',
    'SE': '78.64.0.0/12',
    'SG': '8.128.0.0/10',
    'SI': '188.196.0.0/14',
    'SK': '78.98.0.0/15',
    'SL': '102.143.0.0/17',
    'SM': '89.186.32.0/19',
    'SN': '41.82.0.0/15',
    'SO': '154.115.192.0/18',
    'SR': '186.179.128.0/17',
    'SS': '105.235.208.0/21',
    'ST': '197.159.160.0/19',
    'SV': '168.243.0.0/16',
    'SX': '190.102.0.0/20',
    'SZ': '41.84.224.0/19',
    'TC': '65.255.48.0/20',
    'TD': '154.68.128.0/19',
    'TG': '196.168.0.0/14',
    'TH': '171.96.0.0/13',
    'TJ': '85.9.128.0/18',
    'TK': '27.96.24.0/21',
    'TL': '180.189.160.0/20',
    'TM': '95.85.96.0/19',
    'TN': '197.0.0.0/11',
    'TO': '175.176.144.0/21',
    'TR': '78.160.0.0/11',
    'TT': '186.44.0.0/15',
    'TV': '202.2.96.0/19',
    'TW': '120.96.0.0/11',
    'TZ': '156.156.0.0/14',
    'UA': '37.52.0.0/14',
    'UG': '102.80.0.0/13',
    'UY': '167.56.0.0/13',
    'UZ': '84.54.64.0/18',
    'VA': '212.77.0.0/19',
    'VC': '207.191.240.0/21',
    'VE': '186.88.0.0/13',
    'VG': '66.81.192.0/20',
    'VI': '146.226.0.0/16',
    'VN': '14.160.0.0/11',
    'VU': '202.80.32.0/20',
    'WF': '117.20.32.0/21',
    'WS': '202.4.32.0/19',
    'YE': '134.35.0.0/16',
    'YT': '41.242.116.0/22',
    'ZA': '41.0.0.0/11',
    'ZM': '102.144.0.0/13',
    'ZW': '102.177.192.0/18',

    def random_ipv4(cls, code_or_block):
        # Accept either a 2-letter country code (looked up in the map)
        # or an explicit CIDR block string.
        if len(code_or_block) == 2:
            block = cls._country_ip_map.get(code_or_block.upper())
        block = code_or_block
        # pick a uniformly random address inside the CIDR block
        addr, preflen = block.split('/')
        addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
        addr_max = addr_min | (0xffffffff >> int(preflen))
        return compat_str(socket.inet_ntoa(
            compat_struct_pack('!L', random.randint(addr_min, addr_max))))
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    # urllib ProxyHandler that honours a per-request 'Ytdl-request-proxy'
    # header and treats '__noproxy__' as "use no proxy at all".
    # NOTE(review): a couple of lines of proxy_open are elided in this
    # chunk of the file.
    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # per-request override wins over the handler-level proxy
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            del req.headers['Ytdl-request-proxy']
        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers do the wrapping of the socket with socks
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
5412 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
5413 # released into Public Domain
5414 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    # after much testing, this algorithm was deemed to be the fastest
    s = b''
    n = int(n)
    while n > 0:
        s = compat_struct_pack('>I', n & 0xffffffff) + s
        n = n >> 32
    # strip off leading zeros
    for i in range(len(s)):
        if s[i] != b'\000'[0]:
            break
    else:
        # only happens when n == 0
        s = b'\000'
        i = 0
    s = s[i:]
    # add back some pad bytes. this could be done more efficiently w.r.t. the
    # de-padding being done above, but sigh...
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    acc = 0
    length = len(s)
    if length % 4:
        # left-pad with zero bytes to a multiple of 4 for 32-bit unpacking
        extra = (4 - length % 4)
        s = b'\000' * extra + s
        length = length + extra
    for i in range(0, length, 4):
        acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
    return acc
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # the payload is interpreted little-endian, hence the reversal
    payload = int(binascii.hexlify(data[::-1]), 16)
    encrypted = pow(payload, exponent, modulus)
    return '%x' % encrypted
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data     input data
    @param {int} length     target length
    @returns {int[]}        padded data
    """
    # PKCS#1 requires at least 8 random pad bytes plus 3 structure bytes
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # pad bytes must be non-zero, hence randint(0, 254) shifted below 255
    pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
def encode_base_n(num, n, table=None):
    """Encode non-negative integer *num* in base *n* using *table* as the
    digit alphabet (default: 0-9a-zA-Z truncated to *n* symbols)."""
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    ret = ''
    while num:
        # prepend the least-significant digit
        ret = table[num % n] + ret
        num = num // n
    return ret
def decode_packed_codes(code):
    """Unpack JavaScript obfuscated with Dean Edwards' p.a.c.k.e.r by
    rebuilding its symbol table and substituting every token."""
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')
    symbol_table = {}

    while count:
        count -= 1
        # tokens in the obfuscated code are the base-N encodings of their
        # index; empty symbol slots map back to the token itself
        base_n_count = encode_base_n(count, base)
        symbol_table[base_n_count] = symbols[count] or base_n_count

    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        obfuscated_code)
def caesar(s, alphabet, shift):
    """Shift every character of *s* found in *alphabet* by *shift*
    positions (wrapping); characters outside the alphabet pass through."""
    if shift == 0:
        return s
    l = len(alphabet)
    return ''.join(
        alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
        for c in s)
def rot47(s):
    """Apply the ROT47 substitution cipher over the printable ASCII range."""
    return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list into a dict, stripping surrounding
    double quotes from quoted values."""
    info = {}
    for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        if val.startswith('"'):
            val = val[1:-1]
        info[key] = val
    return info
def urshift(val, n):
    """Unsigned (32-bit) right shift of *val* by *n* bits."""
    if val >= 0:
        return val >> n
    # map negative ints onto their 32-bit two's-complement value first
    return (val + 0x100000000) >> n
5557 # Based on png2str() written by @gdkchan and improved by @yokrysty
5558 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
5559 def decode_png(png_data):
5560 # Reference: https://www.w3.org/TR/PNG/
5561 header = png_data[8:]
5563 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
5564 raise IOError('Not a valid PNG file.')
5566 int_map = {1: '>B', 2: '>H', 4: '>I'}
5567 unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
5572 length = unpack_integer(header[:4])
5575 chunk_type = header[:4]
5578 chunk_data = header[:length]
5579 header = header[length:]
5581 header = header[4:] # Skip CRC
5589 ihdr = chunks[0]['data']
5591 width = unpack_integer(ihdr[:4])
5592 height = unpack_integer(ihdr[4:8])
5596 for chunk in chunks:
5597 if chunk['type'] == b'IDAT':
5598 idat += chunk['data']
5601 raise IOError('Unable to read PNG data.')
5603 decompressed_data = bytearray(zlib.decompress(idat))
5608 def _get_pixel(idx):
5613 for y in range(height):
5614 basePos = y * (1 + stride)
5615 filter_type = decompressed_data[basePos]
5619 pixels.append(current_row)
5621 for x in range(stride):
5622 color = decompressed_data[1 + basePos + x]
5623 basex = y * stride + x
5628 left = _get_pixel(basex - 3)
5630 up = _get_pixel(basex - stride)
5632 if filter_type == 1: # Sub
5633 color = (color + left) & 0xff
5634 elif filter_type == 2: # Up
5635 color = (color + up) & 0xff
5636 elif filter_type == 3: # Average
5637 color = (color + ((left + up) >> 1)) & 0xff
5638 elif filter_type == 4: # Paeth
5644 c = _get_pixel(basex - stride - 3)
5652 if pa <= pb and pa <= pc:
5653 color = (color + a) & 0xff
5655 color = (color + b) & 0xff
5657 color = (color + c) & 0xff
5659 current_row.append(color)
5661 return width, height, pixels
def write_xattr(path, key, value):
    # Write extended attribute *key*=*value* on *path*, trying in order:
    # the pyxattr/xattr Python modules, NTFS Alternate Data Streams on
    # Windows, then the setfattr/xattr command line tools.
    # NOTE(review): the try/except scaffolding around these branches is
    # partly elided in this chunk of the file.
    # This mess below finds the best xattr tool for the job
    # try the pyxattr module...
    if hasattr(xattr, 'set'):  # pyxattr
        # Unicode arguments are not supported in python-pyxattr until
        # version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        pyxattr_required_version = '0.5.0'
        if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
            # TODO: fallback to CLI tools
            raise XAttrUnavailableError(
                'python-pyxattr is detected but is too old. '
                'youtube-dl requires %s or above while your version is %s. '
                'Falling back to other xattr implementations' % (
                    pyxattr_required_version, xattr.__version__))
        setxattr = xattr.set
    setxattr = xattr.setxattr
    try:
        setxattr(path, key, value)
    except EnvironmentError as e:
        raise XAttrMetadataError(e.errno, e.strerror)

    if compat_os_name == 'nt':
        # Write xattrs to NTFS Alternate Data Streams:
        # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
        assert ':' not in key
        assert os.path.exists(path)
        ads_fn = path + ':' + key
        with open(ads_fn, 'wb') as f:
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    user_has_setfattr = check_executable('setfattr', ['--version'])
    user_has_xattr = check_executable('xattr', ['-h'])

    if user_has_setfattr or user_has_xattr:
        # CLI tools take the value as a text argument
        value = value.decode('utf-8')
        if user_has_setfattr:
            executable = 'setfattr'
            opts = ['-n', key, '-v', value]
        elif user_has_xattr:
            executable = 'xattr'
            opts = ['-w', key, value]
        cmd = ([encodeFilename(executable, True)]
               + [encodeArgument(o) for o in opts]
               + [encodeFilename(path, True)])

        try:
            p = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        stdout, stderr = p.communicate()
        stderr = stderr.decode('utf-8', 'replace')
        if p.returncode != 0:
            raise XAttrMetadataError(p.returncode, stderr)

    # On Unix, and can't find pyxattr, setfattr, or xattr.
    if sys.platform.startswith('linux'):
        raise XAttrUnavailableError(
            "Couldn't find a tool to set the xattrs. "
            "Install either the python 'pyxattr' or 'xattr' "
            "modules, or the GNU 'attr' package "
            "(which contains the 'setfattr' tool).")
    raise XAttrUnavailableError(
        "Couldn't find a tool to set the xattrs. "
        "Install either the python 'xattr' module, "
        "or the 'xattr' binary.")
def random_birthday(year_field, month_field, day_field):
    """Generate a random date between 1950-01-01 and 1995-12-31 and return
    it as a dict mapping the given field names to string components."""
    start_date = datetime.date(1950, 1, 1)
    end_date = datetime.date(1995, 12, 31)
    offset = random.randint(0, (end_date - start_date).days)
    random_date = start_date + datetime.timedelta(offset)
    return {
        year_field: str(random_date.year),
        month_field: str(random_date.month),
        day_field: str(random_date.day),
    }
5760 def clean_podcast_url(url):
5761 return re.sub(r'''(?x)
5765 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5768 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5771 cn\.co| # https://podcorn.com/analytics-prefix/
5772 st\.fm # https://podsights.com/docs/