youtube_dl/YoutubeDL.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import absolute_import, unicode_literals
   5
   6 import collections
   7 import errno
   8 import io
   9 import json
  10 import os
  11 import platform
  12 import re
  13 import shutil
  14 import subprocess
  15 import socket
  16 import sys
  17 import time
  18 import traceback
  19
  20 if os.name == 'nt':
  21     import ctypes
  22
  23 from .utils import (
  24     compat_cookiejar,
  25     compat_http_client,
  26     compat_str,
  27     compat_urllib_error,
  28     compat_urllib_request,
  29     ContentTooShortError,
  30     date_from_str,
  31     DateRange,
  32     determine_ext,
  33     DownloadError,
  34     encodeFilename,
  35     ExtractorError,
  36     format_bytes,
  37     formatSeconds,
  38     get_term_width,
  39     locked_file,
  40     make_HTTPS_handler,
  41     MaxDownloadsReached,
  42     PostProcessingError,
  43     platform_name,
  44     preferredencoding,
  45     SameFileError,
  46     sanitize_filename,
  47     subtitles_filename,
  48     takewhile_inclusive,
  49     UnavailableVideoError,
  50     url_basename,
  51     write_json_file,
  52     write_string,
  53     YoutubeDLHandler,
  54     prepend_extension,
  55 )
  56 from .extractor import get_info_extractor, gen_extractors
  57 from .downloader import get_suitable_downloader
  58 from .postprocessor import FFmpegMergerPP
  59 from .version import __version__
  60
  61
  62 class YoutubeDL(object):
  63     """YoutubeDL class.
  64
  65     YoutubeDL objects are the ones responsible of downloading the
  66     actual video file and writing it to disk if the user has requested
  67     it, among some other tasks. In most cases there should be one per
  68     program. As, given a video URL, the downloader doesn't know how to
  69     extract all the needed information, task that InfoExtractors do, it
  70     has to pass the URL to one of them.
  71
  72     For this, YoutubeDL objects have a method that allows
  73     InfoExtractors to be registered in a given order. When it is passed
  74     a URL, the YoutubeDL object handles it to the first InfoExtractor it
  75     finds that reports being able to handle it. The InfoExtractor extracts
  76     all the information about the video or videos the URL refers to, and
  77     YoutubeDL process the extracted information, possibly using a File
  78     Downloader to download the video.
  79
  80     YoutubeDL objects accept a lot of parameters. In order not to saturate
  81     the object constructor with arguments, it receives a dictionary of
  82     options instead. These options are available through the params
  83     attribute for the InfoExtractors to use. The YoutubeDL also
  84     registers itself as the downloader in charge for the InfoExtractors
  85     that are added to it, so this is a "mutual registration".
  86
  87     Available options:
  88
  89     username:          Username for authentication purposes.
  90     password:          Password for authentication purposes.
  91     videopassword:     Password for acces a video.
  92     usenetrc:          Use netrc for authentication instead.
  93     verbose:           Print additional info to stdout.
  94     quiet:             Do not print messages to stdout.
  95     forceurl:          Force printing final URL.
  96     forcetitle:        Force printing title.
  97     forceid:           Force printing ID.
  98     forcethumbnail:    Force printing thumbnail URL.
  99     forcedescription:  Force printing description.
 100     forcefilename:     Force printing final filename.
 101     forceduration:     Force printing duration.
 102     forcejson:         Force printing info_dict as JSON.
 103     simulate:          Do not download the video files.
 104     format:            Video format code.
 105     format_limit:      Highest quality format to try.
 106     outtmpl:           Template for output names.
 107     restrictfilenames: Do not allow "&" and spaces in file names
 108     ignoreerrors:      Do not stop on download errors.
 109     nooverwrites:      Prevent overwriting files.
 110     playliststart:     Playlist item to start at.
 111     playlistend:       Playlist item to end at.
 112     matchtitle:        Download only matching titles.
 113     rejecttitle:       Reject downloads for matching titles.
 114     logger:            Log messages to a logging.Logger instance.
 115     logtostderr:       Log messages to stderr instead of stdout.
 116     writedescription:  Write the video description to a .description file
 117     writeinfojson:     Write the video description to a .info.json file
 118     writeannotations:  Write the video annotations to a .annotations.xml file
 119     writethumbnail:    Write the thumbnail image to a file
 120     writesubtitles:    Write the video subtitles to a file
 121     writeautomaticsub: Write the automatic subtitles to a file
 122     allsubtitles:      Downloads all the subtitles of the video
 123                        (requires writesubtitles or writeautomaticsub)
 124     listsubtitles:     Lists all available subtitles for the video
 125     subtitlesformat:   Subtitle format [srt/sbv/vtt] (default=srt)
 126     subtitleslangs:    List of languages of the subtitles to download
 127     keepvideo:         Keep the video file after post-processing
 128     daterange:         A DateRange object, download only if the upload_date is in the range.
 129     skip_download:     Skip the actual download of the video file
 130     cachedir:          Location of the cache files in the filesystem.
 131                        None to disable filesystem cache.
 132     noplaylist:        Download single video instead of a playlist if in doubt.
 133     age_limit:         An integer representing the user's age in years.
 134                        Unsuitable videos for the given age are skipped.
 135     min_views:         An integer representing the minimum view count the video
 136                        must have in order to not be skipped.
 137                        Videos without view count information are always
 138                        downloaded. None for no limit.
 139     max_views:         An integer representing the maximum view count.
 140                        Videos that are more popular than that are not
 141                        downloaded.
 142                        Videos without view count information are always
 143                        downloaded. None for no limit.
 144     download_archive:  File name of a file where all downloads are recorded.
 145                        Videos already present in the file are not downloaded
 146                        again.
 147     cookiefile:        File name where cookies should be read from and dumped to.
 148     nocheckcertificate:Do not verify SSL certificates
 149     proxy:             URL of the proxy server to use
 150     socket_timeout:    Time to wait for unresponsive hosts, in seconds
 151     bidi_workaround:   Work around buggy terminals without bidirectional text
 152                        support, using fridibi
 153     debug_printtraffic:Print out sent and received HTTP traffic
 154     include_ads:       Download ads as well
 155     default_search:    Prepend this string if an input url is not valid.
 156                        'auto' for elaborate guessing
 157
 158     The following parameters are not used by YoutubeDL itself, they are used by
 159     the FileDownloader:
 160     nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
 161     noresizebuffer, retries, continuedl, noprogress, consoletitle
 162
 163     The following options are used by the post processors:
 164     prefer_ffmpeg:     If True, use ffmpeg instead of avconv if both are available,
 165                        otherwise prefer avconv.
 166     """
 167
 168     params = None
 169     _ies = []
 170     _pps = []
 171     _download_retcode = None
 172     _num_downloads = None
 173     _screen_file = None
 174
 175     def __init__(self, params=None):
 176         """Create a FileDownloader object with the given options."""
 177         if params is None:
 178             params = {}
 179         self._ies = []
 180         self._ies_instances = {}
 181         self._pps = []
 182         self._progress_hooks = []
 183         self._download_retcode = 0
 184         self._num_downloads = 0
 185         self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
 186         self._err_file = sys.stderr
 187         self.params = params
 188
 189         if params.get('bidi_workaround', False):
 190             try:
 191                 import pty
 192                 master, slave = pty.openpty()
 193                 width = get_term_width()
 194                 if width is None:
 195                     width_args = []
 196                 else:
 197                     width_args = ['-w', str(width)]
 198                 sp_kwargs = dict(
 199                     stdin=subprocess.PIPE,
 200                     stdout=slave,
 201                     stderr=self._err_file)
 202                 try:
 203                     self._output_process = subprocess.Popen(
 204                         ['bidiv'] + width_args, **sp_kwargs
 205                     )
 206                 except OSError:
 207                     self._output_process = subprocess.Popen(
 208                         ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
 209                 self._output_channel = os.fdopen(master, 'rb')
 210             except OSError as ose:
 211                 if ose.errno == 2:
 212                     self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that  fribidi  is an executable file in one of the directories in your $PATH.')
 213                 else:
 214                     raise
 215
 216         if (sys.version_info >= (3,) and sys.platform != 'win32' and
 217                 sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
 218                 and not params['restrictfilenames']):
 219             # On Python 3, the Unicode filesystem API will throw errors (#1474)
 220             self.report_warning(
 221                 'Assuming --restrict-filenames since file system encoding '
 222                 'cannot encode all charactes. '
 223                 'Set the LC_ALL environment variable to fix this.')
 224             self.params['restrictfilenames'] = True
 225
 226         if '%(stitle)s' in self.params.get('outtmpl', ''):
 227             self.report_warning('%(stitle)s is deprecated. Use the %(title)s and the --restrict-filenames flag(which also secures %(uploader)s et al) instead.')
 228
 229         self._setup_opener()
 230
 231     def add_info_extractor(self, ie):
 232         """Add an InfoExtractor object to the end of the list."""
 233         self._ies.append(ie)
 234         self._ies_instances[ie.ie_key()] = ie
 235         ie.set_downloader(self)
 236
 237     def get_info_extractor(self, ie_key):
 238         """
 239         Get an instance of an IE with name ie_key, it will try to get one from
 240         the _ies list, if there's no instance it will create a new one and add
 241         it to the extractor list.
 242         """
 243         ie = self._ies_instances.get(ie_key)
 244         if ie is None:
 245             ie = get_info_extractor(ie_key)()
 246             self.add_info_extractor(ie)
 247         return ie
 248
 249     def add_default_info_extractors(self):
 250         """
 251         Add the InfoExtractors returned by gen_extractors to the end of the list
 252         """
 253         for ie in gen_extractors():
 254             self.add_info_extractor(ie)
 255
 256     def add_post_processor(self, pp):
 257         """Add a PostProcessor object to the end of the chain."""
 258         self._pps.append(pp)
 259         pp.set_downloader(self)
 260
 261     def add_progress_hook(self, ph):
 262         """Add the progress hook (currently only for the file downloader)"""
 263         self._progress_hooks.append(ph)
 264
 265     def _bidi_workaround(self, message):
 266         if not hasattr(self, '_output_channel'):
 267             return message
 268
 269         assert hasattr(self, '_output_process')
 270         assert type(message) == type('')
 271         line_count = message.count('\n') + 1
 272         self._output_process.stdin.write((message + '\n').encode('utf-8'))
 273         self._output_process.stdin.flush()
 274         res = ''.join(self._output_channel.readline().decode('utf-8')
 275                        for _ in range(line_count))
 276         return res[:-len('\n')]
 277
 278     def to_screen(self, message, skip_eol=False):
 279         """Print message to stdout if not in quiet mode."""
 280         return self.to_stdout(message, skip_eol, check_quiet=True)
 281
 282     def to_stdout(self, message, skip_eol=False, check_quiet=False):
 283         """Print message to stdout if not in quiet mode."""
 284         if self.params.get('logger'):
 285             self.params['logger'].debug(message)
 286         elif not check_quiet or not self.params.get('quiet', False):
 287             message = self._bidi_workaround(message)
 288             terminator = ['\n', ''][skip_eol]
 289             output = message + terminator
 290
 291             write_string(output, self._screen_file)
 292
 293     def to_stderr(self, message):
 294         """Print message to stderr."""
 295         assert type(message) == type('')
 296         if self.params.get('logger'):
 297             self.params['logger'].error(message)
 298         else:
 299             message = self._bidi_workaround(message)
 300             output = message + '\n'
 301             write_string(output, self._err_file)
 302
 303     def to_console_title(self, message):
 304         if not self.params.get('consoletitle', False):
 305             return
 306         if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
 307             # c_wchar_p() might not be necessary if `message` is
 308             # already of type unicode()
 309             ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
 310         elif 'TERM' in os.environ:
 311             write_string('\033]0;%s\007' % message, self._screen_file)
 312
 313     def save_console_title(self):
 314         if not self.params.get('consoletitle', False):
 315             return
 316         if 'TERM' in os.environ:
 317             # Save the title on stack
 318             write_string('\033[22;0t', self._screen_file)
 319
 320     def restore_console_title(self):
 321         if not self.params.get('consoletitle', False):
 322             return
 323         if 'TERM' in os.environ:
 324             # Restore the title from stack
 325             write_string('\033[23;0t', self._screen_file)
 326
 327     def __enter__(self):
 328         self.save_console_title()
 329         return self
 330
 331     def __exit__(self, *args):
 332         self.restore_console_title()
 333
 334         if self.params.get('cookiefile') is not None:
 335             self.cookiejar.save()
 336
 337     def trouble(self, message=None, tb=None):
 338         """Determine action to take when a download problem appears.
 339
 340         Depending on if the downloader has been configured to ignore
 341         download errors or not, this method may throw an exception or
 342         not when errors are found, after printing the message.
 343
 344         tb, if given, is additional traceback information.
 345         """
 346         if message is not None:
 347             self.to_stderr(message)
 348         if self.params.get('verbose'):
 349             if tb is None:
 350                 if sys.exc_info()[0]:  # if .trouble has been called from an except block
 351                     tb = ''
 352                     if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
 353                         tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
 354                     tb += compat_str(traceback.format_exc())
 355                 else:
 356                     tb_data = traceback.format_list(traceback.extract_stack())
 357                     tb = ''.join(tb_data)
 358             self.to_stderr(tb)
 359         if not self.params.get('ignoreerrors', False):
 360             if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
 361                 exc_info = sys.exc_info()[1].exc_info
 362             else:
 363                 exc_info = sys.exc_info()
 364             raise DownloadError(message, exc_info)
 365         self._download_retcode = 1
 366
 367     def report_warning(self, message):
 368         '''
 369         Print the message to stderr, it will be prefixed with 'WARNING:'
 370         If stderr is a tty file the 'WARNING:' will be colored
 371         '''
 372         if self._err_file.isatty() and os.name != 'nt':
 373             _msg_header = '\033[0;33mWARNING:\033[0m'
 374         else:
 375             _msg_header = 'WARNING:'
 376         warning_message = '%s %s' % (_msg_header, message)
 377         self.to_stderr(warning_message)
 378
 379     def report_error(self, message, tb=None):
 380         '''
 381         Do the same as trouble, but prefixes the message with 'ERROR:', colored
 382         in red if stderr is a tty file.
 383         '''
 384         if self._err_file.isatty() and os.name != 'nt':
 385             _msg_header = '\033[0;31mERROR:\033[0m'
 386         else:
 387             _msg_header = 'ERROR:'
 388         error_message = '%s %s' % (_msg_header, message)
 389         self.trouble(error_message, tb)
 390
 391     def report_file_already_downloaded(self, file_name):
 392         """Report file has already been fully downloaded."""
 393         try:
 394             self.to_screen('[download] %s has already been downloaded' % file_name)
 395         except UnicodeEncodeError:
 396             self.to_screen('[download] The file has already been downloaded')
 397
 398     def increment_downloads(self):
 399         """Increment the ordinal that assigns a number to each file."""
 400         self._num_downloads += 1
 401
 402     def prepare_filename(self, info_dict):
 403         """Generate the output filename."""
 404         try:
 405             template_dict = dict(info_dict)
 406
 407             template_dict['epoch'] = int(time.time())
 408             autonumber_size = self.params.get('autonumber_size')
 409             if autonumber_size is None:
 410                 autonumber_size = 5
 411             autonumber_templ = '%0' + str(autonumber_size) + 'd'
 412             template_dict['autonumber'] = autonumber_templ % self._num_downloads
 413             if template_dict.get('playlist_index') is not None:
 414                 template_dict['playlist_index'] = '%05d' % template_dict['playlist_index']
 415
 416             sanitize = lambda k, v: sanitize_filename(
 417                 compat_str(v),
 418                 restricted=self.params.get('restrictfilenames'),
 419                 is_id=(k == 'id'))
 420             template_dict = dict((k, sanitize(k, v))
 421                                  for k, v in template_dict.items()
 422                                  if v is not None)
 423             template_dict = collections.defaultdict(lambda: 'NA', template_dict)
 424
 425             tmpl = os.path.expanduser(self.params['outtmpl'])
 426             filename = tmpl % template_dict
 427             return filename
 428         except ValueError as err:
 429             self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
 430             return None
 431
 432     def _match_entry(self, info_dict):
 433         """ Returns None iff the file should be downloaded """
 434
 435         video_title = info_dict.get('title', info_dict.get('id', 'video'))
 436         if 'title' in info_dict:
 437             # This can happen when we're just evaluating the playlist
 438             title = info_dict['title']
 439             matchtitle = self.params.get('matchtitle', False)
 440             if matchtitle:
 441                 if not re.search(matchtitle, title, re.IGNORECASE):
 442                     return '"' + title + '" title did not match pattern "' + matchtitle + '"'
 443             rejecttitle = self.params.get('rejecttitle', False)
 444             if rejecttitle:
 445                 if re.search(rejecttitle, title, re.IGNORECASE):
 446                     return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
 447         date = info_dict.get('upload_date', None)
 448         if date is not None:
 449             dateRange = self.params.get('daterange', DateRange())
 450             if date not in dateRange:
 451                 return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
 452         view_count = info_dict.get('view_count', None)
 453         if view_count is not None:
 454             min_views = self.params.get('min_views')
 455             if min_views is not None and view_count < min_views:
 456                 return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
 457             max_views = self.params.get('max_views')
 458             if max_views is not None and view_count > max_views:
 459                 return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
 460         age_limit = self.params.get('age_limit')
 461         if age_limit is not None:
 462             if age_limit < info_dict.get('age_limit', 0):
 463                 return 'Skipping "' + title + '" because it is age restricted'
 464         if self.in_download_archive(info_dict):
 465             return '%s has already been recorded in archive' % video_title
 466         return None
 467
 468     @staticmethod
 469     def add_extra_info(info_dict, extra_info):
 470         '''Set the keys from extra_info in info dict if they are missing'''
 471         for key, value in extra_info.items():
 472             info_dict.setdefault(key, value)
 473
 474     def extract_info(self, url, download=True, ie_key=None, extra_info={},
 475                      process=True):
 476         '''
 477         Returns a list with a dictionary for each video we find.
 478         If 'download', also downloads the videos.
 479         extra_info is a dict containing the extra values to add to each result
 480          '''
 481
 482         if ie_key:
 483             ies = [self.get_info_extractor(ie_key)]
 484         else:
 485             ies = self._ies
 486
 487         for ie in ies:
 488             if not ie.suitable(url):
 489                 continue
 490
 491             if not ie.working():
 492                 self.report_warning('The program functionality for this site has been marked as broken, '
 493                                     'and will probably not work.')
 494
 495             try:
 496                 ie_result = ie.extract(url)
 497                 if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here)
 498                     break
 499                 if isinstance(ie_result, list):
 500                     # Backwards compatibility: old IE result format
 501                     ie_result = {
 502                         '_type': 'compat_list',
 503                         'entries': ie_result,
 504                     }
 505                 self.add_extra_info(ie_result,
 506                     {
 507                         'extractor': ie.IE_NAME,
 508                         'webpage_url': url,
 509                         'webpage_url_basename': url_basename(url),
 510                         'extractor_key': ie.ie_key(),
 511                     })
 512                 if process:
 513                     return self.process_ie_result(ie_result, download, extra_info)
 514                 else:
 515                     return ie_result
 516             except ExtractorError as de: # An error we somewhat expected
 517                 self.report_error(compat_str(de), de.format_traceback())
 518                 break
 519             except Exception as e:
 520                 if self.params.get('ignoreerrors', False):
 521                     self.report_error(compat_str(e), tb=compat_str(traceback.format_exc()))
 522                     break
 523                 else:
 524                     raise
 525         else:
 526             self.report_error('no suitable InfoExtractor: %s' % url)
 527
 528     def process_ie_result(self, ie_result, download=True, extra_info={}):
 529         """
 530         Take the result of the ie(may be modified) and resolve all unresolved
 531         references (URLs, playlist items).
 532
 533         It will also download the videos if 'download'.
 534         Returns the resolved ie_result.
 535         """
 536
 537         result_type = ie_result.get('_type', 'video') # If not given we suppose it's a video, support the default old system
 538         if result_type == 'video':
 539             self.add_extra_info(ie_result, extra_info)
 540             return self.process_video_result(ie_result, download=download)
 541         elif result_type == 'url':
 542             # We have to add extra_info to the results because it may be
 543             # contained in a playlist
 544             return self.extract_info(ie_result['url'],
 545                                      download,
 546                                      ie_key=ie_result.get('ie_key'),
 547                                      extra_info=extra_info)
 548         elif result_type == 'url_transparent':
 549             # Use the information from the embedding page
 550             info = self.extract_info(
 551                 ie_result['url'], ie_key=ie_result.get('ie_key'),
 552                 extra_info=extra_info, download=False, process=False)
 553
 554             def make_result(embedded_info):
 555                 new_result = ie_result.copy()
 556                 for f in ('_type', 'url', 'ext', 'player_url', 'formats',
 557                           'entries', 'ie_key', 'duration',
 558                           'subtitles', 'annotations', 'format',
 559                           'thumbnail', 'thumbnails'):
 560                     if f in new_result:
 561                         del new_result[f]
 562                     if f in embedded_info:
 563                         new_result[f] = embedded_info[f]
 564                 return new_result
 565             new_result = make_result(info)
 566
 567             assert new_result.get('_type') != 'url_transparent'
 568             if new_result.get('_type') == 'compat_list':
 569                 new_result['entries'] = [
 570                     make_result(e) for e in new_result['entries']]
 571
 572             return self.process_ie_result(
 573                 new_result, download=download, extra_info=extra_info)
 574         elif result_type == 'playlist':
 575             # We process each entry in the playlist
 576             playlist = ie_result.get('title', None) or ie_result.get('id', None)
 577             self.to_screen('[download] Downloading playlist: %s' % playlist)
 578
 579             playlist_results = []
 580
 581             n_all_entries = len(ie_result['entries'])
 582             playliststart = self.params.get('playliststart', 1) - 1
 583             playlistend = self.params.get('playlistend', None)
 584             # For backwards compatibility, interpret -1 as whole list
 585             if playlistend == -1:
 586                 playlistend = None
 587
 588             entries = ie_result['entries'][playliststart:playlistend]
 589             n_entries = len(entries)
 590
 591             self.to_screen(
 592                 "[%s] playlist '%s': Collected %d video ids (downloading %d of them)" %
 593                 (ie_result['extractor'], playlist, n_all_entries, n_entries))
 594
 595             for i, entry in enumerate(entries, 1):
 596                 self.to_screen('[download] Downloading video #%s of %s' % (i, n_entries))
 597                 extra = {
 598                     'playlist': playlist,
 599                     'playlist_index': i + playliststart,
 600                     'extractor': ie_result['extractor'],
 601                     'webpage_url': ie_result['webpage_url'],
 602                     'webpage_url_basename': url_basename(ie_result['webpage_url']),
 603                     'extractor_key': ie_result['extractor_key'],
 604                 }
 605
 606                 reason = self._match_entry(entry)
 607                 if reason is not None:
 608                     self.to_screen('[download] ' + reason)
 609                     continue
 610
 611                 entry_result = self.process_ie_result(entry,
 612                                                       download=download,
 613                                                       extra_info=extra)
 614                 playlist_results.append(entry_result)
 615             ie_result['entries'] = playlist_results
 616             return ie_result
 617         elif result_type == 'compat_list':
 618             def _fixup(r):
 619                 self.add_extra_info(r,
 620                     {
 621                         'extractor': ie_result['extractor'],
 622                         'webpage_url': ie_result['webpage_url'],
 623                         'webpage_url_basename': url_basename(ie_result['webpage_url']),
 624                         'extractor_key': ie_result['extractor_key'],
 625                     })
 626                 return r
 627             ie_result['entries'] = [
 628                 self.process_ie_result(_fixup(r), download, extra_info)
 629                 for r in ie_result['entries']
 630             ]
 631             return ie_result
 632         else:
 633             raise Exception('Invalid result type: %s' % result_type)
 634
 635     def select_format(self, format_spec, available_formats):
 636         if format_spec == 'best' or format_spec is None:
 637             return available_formats[-1]
 638         elif format_spec == 'worst':
 639             return available_formats[0]
 640         elif format_spec == 'bestaudio':
 641             audio_formats = [
 642                 f for f in available_formats
 643                 if f.get('vcodec') == 'none']
 644             if audio_formats:
 645                 return audio_formats[-1]
 646         elif format_spec == 'worstaudio':
 647             audio_formats = [
 648                 f for f in available_formats
 649                 if f.get('vcodec') == 'none']
 650             if audio_formats:
 651                 return audio_formats[0]
 652         else:
 653             extensions = ['mp4', 'flv', 'webm', '3gp']
 654             if format_spec in extensions:
 655                 filter_f = lambda f: f['ext'] == format_spec
 656             else:
 657                 filter_f = lambda f: f['format_id'] == format_spec
 658             matches = list(filter(filter_f, available_formats))
 659             if matches:
 660                 return matches[-1]
 661         return None
 662
 663     def process_video_result(self, info_dict, download=True):
 664         assert info_dict.get('_type', 'video') == 'video'
 665
 666         if 'playlist' not in info_dict:
 667             # It isn't part of a playlist
 668             info_dict['playlist'] = None
 669             info_dict['playlist_index'] = None
 670
 671         # This extractors handle format selection themselves
 672         if info_dict['extractor'] in ['Youku']:
 673             if download:
 674                 self.process_info(info_dict)
 675             return info_dict
 676
 677         # We now pick which formats have to be downloaded
 678         if info_dict.get('formats') is None:
 679             # There's only one format available
 680             formats = [info_dict]
 681         else:
 682             formats = info_dict['formats']
 683
 684         # We check that all the formats have the format and format_id fields
 685         for (i, format) in enumerate(formats):
 686             if format.get('format_id') is None:
 687                 format['format_id'] = compat_str(i)
 688             if format.get('format') is None:
 689                 format['format'] = '{id} - {res}{note}'.format(
 690                     id=format['format_id'],
 691                     res=self.format_resolution(format),
 692                     note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
 693                 )
 694             # Automatically determine file extension if missing
 695             if 'ext' not in format:
 696                 format['ext'] = determine_ext(format['url'])
 697
 698         format_limit = self.params.get('format_limit', None)
 699         if format_limit:
 700             formats = list(takewhile_inclusive(
 701                 lambda f: f['format_id'] != format_limit, formats
 702             ))
 703
 704         # TODO Central sorting goes here
 705
 706         if formats[0] is not info_dict:
 707             # only set the 'formats' fields if the original info_dict list them
 708             # otherwise we end up with a circular reference, the first (and unique)
 709             # element in the 'formats' field in info_dict is info_dict itself,
 710             # wich can't be exported to json
 711             info_dict['formats'] = formats
 712         if self.params.get('listformats', None):
 713             self.list_formats(info_dict)
 714             return
 715
 716         req_format = self.params.get('format')
 717         if req_format is None:
 718             req_format = 'best'
 719         formats_to_download = []
 720         # The -1 is for supporting YoutubeIE
 721         if req_format in ('-1', 'all'):
 722             formats_to_download = formats
 723         else:
 724             # We can accept formats requested in the format: 34/5/best, we pick
 725             # the first that is available, starting from left
 726             req_formats = req_format.split('/')
 727             for rf in req_formats:
 728                 if re.match(r'.+?\+.+?', rf) is not None:
 729                     # Two formats have been requested like '137+139'
 730                     format_1, format_2 = rf.split('+')
 731                     formats_info = (self.select_format(format_1, formats),
 732                         self.select_format(format_2, formats))
 733                     if all(formats_info):
 734                         selected_format = {
 735                             'requested_formats': formats_info,
 736                             'format': rf,
 737                             'ext': formats_info[0]['ext'],
 738                         }
 739                     else:
 740                         selected_format = None
 741                 else:
 742                     selected_format = self.select_format(rf, formats)
 743                 if selected_format is not None:
 744                     formats_to_download = [selected_format]
 745                     break
 746         if not formats_to_download:
 747             raise ExtractorError('requested format not available',
 748                                  expected=True)
 749
 750         if download:
 751             if len(formats_to_download) > 1:
 752                 self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
 753             for format in formats_to_download:
 754                 new_info = dict(info_dict)
 755                 new_info.update(format)
 756                 self.process_info(new_info)
 757         # We update the info dict with the best quality format (backwards compatibility)
 758         info_dict.update(formats_to_download[-1])
 759         return info_dict
 760
 761     def process_info(self, info_dict):
 762         """Process a single resolved IE result."""
 763
 764         assert info_dict.get('_type', 'video') == 'video'
 765         #We increment the download the download count here to match the previous behaviour.
 766         self.increment_downloads()
 767
 768         info_dict['fulltitle'] = info_dict['title']
 769         if len(info_dict['title']) > 200:
 770             info_dict['title'] = info_dict['title'][:197] + '...'
 771
 772         # Keep for backwards compatibility
 773         info_dict['stitle'] = info_dict['title']
 774
 775         if not 'format' in info_dict:
 776             info_dict['format'] = info_dict['ext']
 777
 778         reason = self._match_entry(info_dict)
 779         if reason is not None:
 780             self.to_screen('[download] ' + reason)
 781             return
 782
 783         max_downloads = self.params.get('max_downloads')
 784         if max_downloads is not None:
 785             if self._num_downloads > int(max_downloads):
 786                 raise MaxDownloadsReached()
 787
 788         filename = self.prepare_filename(info_dict)
 789
 790         # Forced printings
 791         if self.params.get('forcetitle', False):
 792             self.to_stdout(info_dict['fulltitle'])
 793         if self.params.get('forceid', False):
 794             self.to_stdout(info_dict['id'])
 795         if self.params.get('forceurl', False):
 796             # For RTMP URLs, also include the playpath
 797             self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
 798         if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
 799             self.to_stdout(info_dict['thumbnail'])
 800         if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
 801             self.to_stdout(info_dict['description'])
 802         if self.params.get('forcefilename', False) and filename is not None:
 803             self.to_stdout(filename)
 804         if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
 805             self.to_stdout(formatSeconds(info_dict['duration']))
 806         if self.params.get('forceformat', False):
 807             self.to_stdout(info_dict['format'])
 808         if self.params.get('forcejson', False):
 809             info_dict['_filename'] = filename
 810             self.to_stdout(json.dumps(info_dict))
 811
 812         # Do nothing else if in simulate mode
 813         if self.params.get('simulate', False):
 814             return
 815
 816         if filename is None:
 817             return
 818
 819         try:
 820             dn = os.path.dirname(encodeFilename(filename))
 821             if dn != '' and not os.path.exists(dn):
 822                 os.makedirs(dn)
 823         except (OSError, IOError) as err:
 824             self.report_error('unable to create directory ' + compat_str(err))
 825             return
 826
 827         if self.params.get('writedescription', False):
 828             descfn = filename + '.description'
 829             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
 830                 self.to_screen('[info] Video description is already present')
 831             else:
 832                 try:
 833                     self.to_screen('[info] Writing video description to: ' + descfn)
 834                     with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
 835                         descfile.write(info_dict['description'])
 836                 except (KeyError, TypeError):
 837                     self.report_warning('There\'s no description to write.')
 838                 except (OSError, IOError):
 839                     self.report_error('Cannot write description file ' + descfn)
 840                     return
 841
 842         if self.params.get('writeannotations', False):
 843             annofn = filename + '.annotations.xml'
 844             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
 845                 self.to_screen('[info] Video annotations are already present')
 846             else:
 847                 try:
 848                     self.to_screen('[info] Writing video annotations to: ' + annofn)
 849                     with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
 850                         annofile.write(info_dict['annotations'])
 851                 except (KeyError, TypeError):
 852                     self.report_warning('There are no annotations to write.')
 853                 except (OSError, IOError):
 854                     self.report_error('Cannot write annotations file: ' + annofn)
 855                     return
 856
 857         subtitles_are_requested = any([self.params.get('writesubtitles', False),
 858                                        self.params.get('writeautomaticsub')])
 859
 860         if subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']:
 861             # subtitles download errors are already managed as troubles in relevant IE
 862             # that way it will silently go on when used with unsupporting IE
 863             subtitles = info_dict['subtitles']
 864             sub_format = self.params.get('subtitlesformat', 'srt')
 865             for sub_lang in subtitles.keys():
 866                 sub = subtitles[sub_lang]
 867                 if sub is None:
 868                     continue
 869                 try:
 870                     sub_filename = subtitles_filename(filename, sub_lang, sub_format)
 871                     if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
 872                         self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format))
 873                     else:
 874                         self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
 875                         with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
 876                                 subfile.write(sub)
 877                 except (OSError, IOError):
 878                     self.report_error('Cannot write subtitles file ' + descfn)
 879                     return
 880
 881         if self.params.get('writeinfojson', False):
 882             infofn = os.path.splitext(filename)[0] + '.info.json'
 883             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
 884                 self.to_screen('[info] Video description metadata is already present')
 885             else:
 886                 self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)
 887                 try:
 888                     write_json_file(info_dict, encodeFilename(infofn))
 889                 except (OSError, IOError):
 890                     self.report_error('Cannot write metadata to JSON file ' + infofn)
 891                     return
 892
 893         if self.params.get('writethumbnail', False):
 894             if info_dict.get('thumbnail') is not None:
 895                 thumb_format = determine_ext(info_dict['thumbnail'], 'jpg')
 896                 thumb_filename = os.path.splitext(filename)[0] + '.' + thumb_format
 897                 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
 898                     self.to_screen('[%s] %s: Thumbnail is already present' %
 899                                    (info_dict['extractor'], info_dict['id']))
 900                 else:
 901                     self.to_screen('[%s] %s: Downloading thumbnail ...' %
 902                                    (info_dict['extractor'], info_dict['id']))
 903                     try:
 904                         uf = compat_urllib_request.urlopen(info_dict['thumbnail'])
 905                         with open(thumb_filename, 'wb') as thumbf:
 906                             shutil.copyfileobj(uf, thumbf)
 907                         self.to_screen('[%s] %s: Writing thumbnail to: %s' %
 908                             (info_dict['extractor'], info_dict['id'], thumb_filename))
 909                     except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 910                         self.report_warning('Unable to download thumbnail "%s": %s' %
 911                             (info_dict['thumbnail'], compat_str(err)))
 912
 913         if not self.params.get('skip_download', False):
 914             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(filename)):
 915                 success = True
 916             else:
 917                 try:
 918                     def dl(name, info):
 919                         fd = get_suitable_downloader(info)(self, self.params)
 920                         for ph in self._progress_hooks:
 921                             fd.add_progress_hook(ph)
 922                         return fd.download(name, info)
 923                     if info_dict.get('requested_formats') is not None:
 924                         downloaded = []
 925                         success = True
 926                         merger = FFmpegMergerPP(self)
 927                         if not merger._get_executable():
 928                             postprocessors = []
 929                             self.report_warning('You have requested multiple '
 930                                 'formats but ffmpeg or avconv are not installed.'
 931                                 ' The formats won\'t be merged')
 932                         else:
 933                             postprocessors = [merger]
 934                         for f in info_dict['requested_formats']:
 935                             new_info = dict(info_dict)
 936                             new_info.update(f)
 937                             fname = self.prepare_filename(new_info)
 938                             fname = prepend_extension(fname, 'f%s' % f['format_id'])
 939                             downloaded.append(fname)
 940                             partial_success = dl(fname, new_info)
 941                             success = success and partial_success
 942                         info_dict['__postprocessors'] = postprocessors
 943                         info_dict['__files_to_merge'] = downloaded
 944                     else:
 945                         # Just a single file
 946                         success = dl(filename, info_dict)
 947                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 948                     self.report_error('unable to download video data: %s' % str(err))
 949                     return
 950                 except (OSError, IOError) as err:
 951                     raise UnavailableVideoError(err)
 952                 except (ContentTooShortError, ) as err:
 953                     self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
 954                     return
 955
 956             if success:
 957                 try:
 958                     self.post_process(filename, info_dict)
 959                 except (PostProcessingError) as err:
 960                     self.report_error('postprocessing: %s' % str(err))
 961                     return
 962
 963         self.record_download_archive(info_dict)
 964
 965     def download(self, url_list):
 966         """Download a given list of URLs."""
 967         if (len(url_list) > 1 and
 968                 '%' not in self.params['outtmpl']
 969                 and self.params.get('max_downloads') != 1):
 970             raise SameFileError(self.params['outtmpl'])
 971
 972         for url in url_list:
 973             try:
 974                 #It also downloads the videos
 975                 self.extract_info(url)
 976             except UnavailableVideoError:
 977                 self.report_error('unable to download video')
 978             except MaxDownloadsReached:
 979                 self.to_screen('[info] Maximum number of downloaded files reached.')
 980                 raise
 981
 982         return self._download_retcode
 983
 984     def download_with_info_file(self, info_filename):
 985         with io.open(info_filename, 'r', encoding='utf-8') as f:
 986             info = json.load(f)
 987         try:
 988             self.process_ie_result(info, download=True)
 989         except DownloadError:
 990             webpage_url = info.get('webpage_url')
 991             if webpage_url is not None:
 992                 self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
 993                 return self.download([webpage_url])
 994             else:
 995                 raise
 996         return self._download_retcode
 997
 998     def post_process(self, filename, ie_info):
 999         """Run all the postprocessors on the given file."""
1000         info = dict(ie_info)
1001         info['filepath'] = filename
1002         keep_video = None
1003         pps_chain = []
1004         if ie_info.get('__postprocessors') is not None:
1005             pps_chain.extend(ie_info['__postprocessors'])
1006         pps_chain.extend(self._pps)
1007         for pp in pps_chain:
1008             try:
1009                 keep_video_wish, new_info = pp.run(info)
1010                 if keep_video_wish is not None:
1011                     if keep_video_wish:
1012                         keep_video = keep_video_wish
1013                     elif keep_video is None:
1014                         # No clear decision yet, let IE decide
1015                         keep_video = keep_video_wish
1016             except PostProcessingError as e:
1017                 self.report_error(e.msg)
1018         if keep_video is False and not self.params.get('keepvideo', False):
1019             try:
1020                 self.to_screen('Deleting original file %s (pass -k to keep)' % filename)
1021                 os.remove(encodeFilename(filename))
1022             except (IOError, OSError):
1023                 self.report_warning('Unable to remove downloaded video file')
1024
1025     def _make_archive_id(self, info_dict):
1026         # Future-proof against any change in case
1027         # and backwards compatibility with prior versions
1028         extractor = info_dict.get('extractor_key')
1029         if extractor is None:
1030             if 'id' in info_dict:
1031                 extractor = info_dict.get('ie_key')  # key in a playlist
1032         if extractor is None:
1033             return None  # Incomplete video information
1034         return extractor.lower() + ' ' + info_dict['id']
1035
1036     def in_download_archive(self, info_dict):
1037         fn = self.params.get('download_archive')
1038         if fn is None:
1039             return False
1040
1041         vid_id = self._make_archive_id(info_dict)
1042         if vid_id is None:
1043             return False  # Incomplete video information
1044
1045         try:
1046             with locked_file(fn, 'r', encoding='utf-8') as archive_file:
1047                 for line in archive_file:
1048                     if line.strip() == vid_id:
1049                         return True
1050         except IOError as ioe:
1051             if ioe.errno != errno.ENOENT:
1052                 raise
1053         return False
1054
1055     def record_download_archive(self, info_dict):
1056         fn = self.params.get('download_archive')
1057         if fn is None:
1058             return
1059         vid_id = self._make_archive_id(info_dict)
1060         assert vid_id
1061         with locked_file(fn, 'a', encoding='utf-8') as archive_file:
1062             archive_file.write(vid_id + '\n')
1063
1064     @staticmethod
1065     def format_resolution(format, default='unknown'):
1066         if format.get('vcodec') == 'none':
1067             return 'audio only'
1068         if format.get('resolution') is not None:
1069             return format['resolution']
1070         if format.get('height') is not None:
1071             if format.get('width') is not None:
1072                 res = '%sx%s' % (format['width'], format['height'])
1073             else:
1074                 res = '%sp' % format['height']
1075         elif format.get('width') is not None:
1076             res = '?x%d' % format['width']
1077         else:
1078             res = default
1079         return res
1080
1081     def list_formats(self, info_dict):
1082         def format_note(fdict):
1083             res = ''
1084             if fdict.get('ext') in ['f4f', 'f4m']:
1085                 res += '(unsupported) '
1086             if fdict.get('format_note') is not None:
1087                 res += fdict['format_note'] + ' '
1088             if fdict.get('tbr') is not None:
1089                 res += '%4dk ' % fdict['tbr']
1090             if (fdict.get('vcodec') is not None and
1091                     fdict.get('vcodec') != 'none'):
1092                 res += '%-5s' % fdict['vcodec']
1093                 if fdict.get('vbr') is not None:
1094                     res += '@'
1095             elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
1096                 res += 'video@'
1097             if fdict.get('vbr') is not None:
1098                 res += '%4dk' % fdict['vbr']
1099             if fdict.get('acodec') is not None:
1100                 if res:
1101                     res += ', '
1102                 res += '%-5s' % fdict['acodec']
1103             elif fdict.get('abr') is not None:
1104                 if res:
1105                     res += ', '
1106                 res += 'audio'
1107             if fdict.get('abr') is not None:
1108                 res += '@%3dk' % fdict['abr']
1109             if fdict.get('asr') is not None:
1110                 res += ' (%5dHz)' % fdict['asr']
1111             if fdict.get('filesize') is not None:
1112                 if res:
1113                     res += ', '
1114                 res += format_bytes(fdict['filesize'])
1115             return res
1116
1117         def line(format, idlen=20):
1118             return (('%-' + compat_str(idlen + 1) + 's%-10s%-12s%s') % (
1119                 format['format_id'],
1120                 format['ext'],
1121                 self.format_resolution(format),
1122                 format_note(format),
1123             ))
1124
1125         formats = info_dict.get('formats', [info_dict])
1126         idlen = max(len('format code'),
1127                     max(len(f['format_id']) for f in formats))
1128         formats_s = [line(f, idlen) for f in formats]
1129         if len(formats) > 1:
1130             formats_s[0] += (' ' if format_note(formats[0]) else '') + '(worst)'
1131             formats_s[-1] += (' ' if format_note(formats[-1]) else '') + '(best)'
1132
1133         header_line = line({
1134             'format_id': 'format code', 'ext': 'extension',
1135             'resolution': 'resolution', 'format_note': 'note'}, idlen=idlen)
1136         self.to_screen('[info] Available formats for %s:\n%s\n%s' %
1137                        (info_dict['id'], header_line, '\n'.join(formats_s)))
1138
1139     def urlopen(self, req):
1140         """ Start an HTTP download """
1141         return self._opener.open(req)
1142
1143     def print_debug_header(self):
1144         if not self.params.get('verbose'):
1145             return
1146         write_string('[debug] youtube-dl version ' + __version__ + '\n')
1147         try:
1148             sp = subprocess.Popen(
1149                 ['git', 'rev-parse', '--short', 'HEAD'],
1150                 stdout=subprocess.PIPE, stderr=subprocess.PIPE,
1151                 cwd=os.path.dirname(os.path.abspath(__file__)))
1152             out, err = sp.communicate()
1153             out = out.decode().strip()
1154             if re.match('[0-9a-f]+', out):
1155                 write_string('[debug] Git HEAD: ' + out + '\n')
1156         except:
1157             try:
1158                 sys.exc_clear()
1159             except:
1160                 pass
1161         write_string('[debug] Python version %s - %s' %
1162                      (platform.python_version(), platform_name()) + '\n')
1163
1164         proxy_map = {}
1165         for handler in self._opener.handlers:
1166             if hasattr(handler, 'proxies'):
1167                 proxy_map.update(handler.proxies)
1168         write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')
1169
1170     def _setup_opener(self):
1171         timeout_val = self.params.get('socket_timeout')
1172         timeout = 600 if timeout_val is None else float(timeout_val)
1173
1174         opts_cookiefile = self.params.get('cookiefile')
1175         opts_proxy = self.params.get('proxy')
1176
1177         if opts_cookiefile is None:
1178             self.cookiejar = compat_cookiejar.CookieJar()
1179         else:
1180             self.cookiejar = compat_cookiejar.MozillaCookieJar(
1181                 opts_cookiefile)
1182             if os.access(opts_cookiefile, os.R_OK):
1183                 self.cookiejar.load()
1184
1185         cookie_processor = compat_urllib_request.HTTPCookieProcessor(
1186             self.cookiejar)
1187         if opts_proxy is not None:
1188             if opts_proxy == '':
1189                 proxies = {}
1190             else:
1191                 proxies = {'http': opts_proxy, 'https': opts_proxy}
1192         else:
1193             proxies = compat_urllib_request.getproxies()
1194             # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
1195             if 'http' in proxies and 'https' not in proxies:
1196                 proxies['https'] = proxies['http']
1197         proxy_handler = compat_urllib_request.ProxyHandler(proxies)
1198
1199         debuglevel = 1 if self.params.get('debug_printtraffic') else 0
1200         https_handler = make_HTTPS_handler(
1201             self.params.get('nocheckcertificate', False), debuglevel=debuglevel)
1202         ydlh = YoutubeDLHandler(debuglevel=debuglevel)
1203         opener = compat_urllib_request.build_opener(
1204             https_handler, proxy_handler, cookie_processor, ydlh)
1205         # Delete the default user-agent header, which would otherwise apply in
1206         # cases where our custom HTTP handler doesn't come into play
1207         # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
1208         opener.addheaders = []
1209         self._opener = opener
1210
1211         # TODO remove this global modification
1212         compat_urllib_request.install_opener(opener)
1213         socket.setdefaulttimeout(timeout)