2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
21 __license__ = 'Public Domain'
22 __version__ = '2012.02.27'
24 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
57 except ImportError: # Python 2.4
60 import cStringIO as StringIO
64 # parse_qs was moved from the cgi module to the urlparse module recently.
66 from urlparse import parse_qs
68 from cgi import parse_qs
76 import xml.etree.ElementTree
77 except ImportError: # Python<2.5: Not officially supported, but let it slip
78 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
81 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
82 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
83 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
84 'Accept-Encoding': 'gzip, deflate',
85 'Accept-Language': 'en-us,en;q=0.5',
90 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
96 def raiseError(msg, i):
97 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
98 def skipSpace(i, expectMore=True):
99 while i < len(s) and s[i] in ' \t\r\n':
103 raiseError('Premature end', i)
105 def decodeEscape(match):
121 return unichr(int(esc[1:5], 16))
122 if len(esc) == 5+6 and esc[5:7] == '\\u':
123 hi = int(esc[1:5], 16)
124 low = int(esc[7:11], 16)
125 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
126 raise ValueError('Unknown escape ' + str(esc))
133 while s[e-bslashes-1] == '\\':
135 if bslashes % 2 == 1:
139 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
140 stri = rexp.sub(decodeEscape, s[i:e])
146 if s[i] == '}': # Empty dictionary
150 raiseError('Expected a string object key', i)
151 i,key = parseString(i)
153 if i >= len(s) or s[i] != ':':
154 raiseError('Expected a colon', i)
161 raiseError('Expected comma or closing curly brace', i)
166 if s[i] == ']': # Empty array
171 i = skipSpace(i) # Raise exception if premature end
175 raiseError('Expected a comma or closing bracket', i)
177 def parseDiscrete(i):
178 for k,v in {'true': True, 'false': False, 'null': None}.items():
179 if s.startswith(k, i):
181 raiseError('Not a boolean (or null)', i)
183 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
185 raiseError('Not a number', i)
187 if '.' in nums or 'e' in nums or 'E' in nums:
188 return (i+len(nums), float(nums))
189 return (i+len(nums), int(nums))
190 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
193 i,res = CHARMAP.get(s[i], parseNumber)(i)
194 i = skipSpace(i, False)
198 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    # NOTE(review): this chunk is elided — the inner generator's
    # try/except guard and fallback yields are missing from this view.
    def yield_preferredencoding():
        pref = locale.getpreferredencoding()
    # Python 2 iterator protocol: take the first value the generator yields.
    return yield_preferredencoding().next()
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a Unicode character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[entity])

    # Numeric character reference ('#160' decimal, '#x2026' hex).
    mobj = re.match(ur'(?u)#(x?\d+)', entity)
    # NOTE(review): elided — the original presumably guards on mobj being
    # non-None and assigns `base` (10 or 16) before the conversion below.
    numstr = mobj.group(1)
    if numstr.startswith(u'x'):
        # Prefix with '0' so long('0x2026', 16) parses the hex form.
        numstr = u'0%s' % numstr
    return unichr(long(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
245 def sanitize_title(utitle):
246 """Sanitizes a video title so it could be used as part of a filename."""
247 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
248 return utitle.replace(unicode(os.sep), u'%')
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    # NOTE(review): elided chunk — the original wraps the open attempt in a
    # try matched by the except clause below; a stdout ('-') special-case
    # may also be missing here. Confirm against the full file.
    if sys.platform == 'win32':
        # Put stdout into binary mode so video bytes are not mangled.
        msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout, filename)
    stream = open(_encodeFilename(filename), open_mode)
    return (stream, filename)
    except (IOError, OSError), err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(_encodeFilename(filename), open_mode)
        return (stream, filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    # NOTE(review): elided — the default result (presumably None) and the
    # final return statement are missing from this view.
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
286 def _simplify_title(title):
287 expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
288 return expr.sub(u'_', title).strip(u'_')
def _orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    # NOTE(review): body elided in this chunk — presumably collects first
    # occurrences while preserving input order; confirm against full file.
def _unescapeHTML(s):
    """Unescape HTML entities in a string.

    @param s a string (of type unicode)
    """
    assert type(s) == type(u'')

    htmlParser = HTMLParser.HTMLParser()
    return htmlParser.unescape(s)
def _encodeFilename(s):
    """Encode a unicode filename for use with the local filesystem.

    @param s The name of the file (of type unicode)
    """
    assert type(s) == type(u'')

    if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        # NOTE(review): elided — this branch's own body (presumably
        # `return s` unchanged) is missing from this view.
    return s.encode(sys.getfilesystemencoding(), 'ignore')
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    # Both arguments are byte counts.
    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected
class YoutubeDLHandler(urllib2.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    # NOTE(review): elided — the `deflate` helper's own `def` line and its
    # try/except are missing; a raw (headerless) inflate appears to be
    # tried first, then the plain zlib form as fallback.
    return zlib.decompress(data, -zlib.MAX_WBITS)
    return zlib.decompress(data)

    # (elided: @staticmethod decorator)
    def addinfourl_wrapper(stream, headers, url, code):
        # Older urllib2.addinfourl had no `code` constructor argument.
        if hasattr(urllib2.addinfourl, 'getcode'):
            return urllib2.addinfourl(stream, headers, url, code)
        ret = urllib2.addinfourl(stream, headers, url)
        # (elided — presumably sets ret.code and returns ret)

    def http_request(self, req):
        # Add the standard headers to every outgoing request.
        for h in std_headers:
            # (elided line — handling when header h is already present)
            req.add_header(h, std_headers[h])
        # Honor the internal opt-out header, then strip it from the
        # request so it never reaches the wire.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        # (elided: return req)

    def http_response(self, req, resp):
        # (elided — presumably old_resp = resp)
        # gzip-compressed body: re-wrap the decompressed stream so callers
        # see a normal response object.
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate-compressed body.
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = StringIO.StringIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # (elided: return resp)
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible of downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader handles it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username:          Username for authentication purposes.
    password:          Password for authentication purposes.
    usenetrc:          Use netrc for authentication instead.
    quiet:             Do not print messages to stdout.
    forceurl:          Force printing final URL.
    forcetitle:        Force printing title.
    forcethumbnail:    Force printing thumbnail URL.
    forcedescription:  Force printing description.
    forcefilename:     Force printing final filename.
    simulate:          Do not download the video files.
    format:            Video format code.
    format_limit:      Highest quality format to try.
    outtmpl:           Template for output names.
    ignoreerrors:      Do not stop on download errors.
    ratelimit:         Download speed limit, in bytes/sec.
    nooverwrites:      Prevent overwriting files.
    retries:           Number of times to retry for HTTP error 5xx
    continuedl:        Try to continue downloads if possible.
    noprogress:        Do not print the progress bar.
    playliststart:     Playlist item to start at.
    playlistend:       Playlist item to end at.
    matchtitle:        Download only matching titles.
    rejecttitle:       Reject downloads for matching titles.
    logtostderr:       Log messages to stderr instead of stdout.
    consoletitle:      Display progress in console window's titlebar.
    nopart:            Do not use temporary .part files.
    updatetime:        Use the Last-modified header to set output file timestamps.
    writedescription:  Write the video description to a .description file
    writeinfojson:     Write the video description to a .info.json file
    writesubtitles:    Write the video subtitles to a .srt file
    subtitleslang:     Language of the subtitles to download
    """

    # Class-level placeholders; the real per-instance values are assigned
    # in __init__. (Other attributes are elided from this chunk.)
    _download_retcode = None
    _num_downloads = None
def __init__(self, params):
    """Create a FileDownloader object with the given options."""
    # NOTE(review): elided — initialisation of the extractor and
    # post-processor lists and of self.params is missing from this view.
    self._download_retcode = 0
    self._num_downloads = 0
    # 'logtostderr' redirects all screen output to stderr.
    self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
def format_bytes(bytes):
    """Format a byte count as a human-readable string, e.g. '1.23M'."""
    # NOTE(review): elided chunk — the @staticmethod decorators on these
    # helpers, several guard branches, and some assignments (e.g. `dif`)
    # are missing from this view.
    if type(bytes) is str:
    exponent = long(math.log(bytes, 1024.0))
    suffix = 'bkMGTPEZY'[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)

def calc_percent(byte_counter, data_len):
    """Return byte_counter/data_len as a fixed-width percentage string."""
    # (elided line(s) — presumably a data_len None guard)
    return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

def calc_eta(start, now, total, current):
    """Estimate the remaining download time, formatted 'MM:SS'."""
    # (elided — `dif` presumably now - start, plus unknown-ETA guards)
    if current == 0 or dif < 0.001: # One millisecond
    rate = float(current) / dif
    eta = long((float(total) - float(current)) / rate)
    (eta_mins, eta_secs) = divmod(eta, 60)
    # (elided line(s) — presumably an overflow guard)
    return '%02d:%02d' % (eta_mins, eta_secs)

def calc_speed(start, now, bytes):
    """Return a fixed-width human-readable speed string."""
    # (elided — `dif` presumably now - start)
    if bytes == 0 or dif < 0.001: # One millisecond
        return '%10s' % '---b/s'
    return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

def best_block_size(elapsed_time, bytes):
    """Adapt the next read size to measured throughput (0.5x..2x, <=4MB)."""
    new_min = max(bytes / 2.0, 1.0)
    new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
    if elapsed_time < 0.001:
    rate = bytes / elapsed_time
    # (elided — clamping of rate between new_min and new_max)

def parse_bytes(bytestr):
    """Parse a string indicating a byte quantity into a long integer."""
    matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
    # (elided line(s) — presumably a no-match guard)
    number = float(matchobj.group(1))
    # Empty suffix maps to index 0 ('b'), i.e. multiplier 1.
    multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
    return long(round(number * multiplier))
def add_info_extractor(self, ie):
    """Add an InfoExtractor object to the end of the list."""
    # (elided line — presumably appends ie to the internal list)
    # Mutual registration: the IE gets a back-reference to this downloader.
    ie.set_downloader(self)

def add_post_processor(self, pp):
    """Add a PostProcessor object to the end of the chain."""
    # (elided line — presumably appends pp to the internal chain)
    pp.set_downloader(self)

def to_screen(self, message, skip_eol=False):
    """Print message to stdout if not in quiet mode."""
    assert type(message) == type(u'')
    if not self.params.get('quiet', False):
        terminator = [u'\n', u''][skip_eol]
        output = message + terminator
        # (elided line between here and the mode check below)
        if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
            output = output.encode(preferredencoding(), 'ignore')
        self._screen_file.write(output)
        self._screen_file.flush()
def to_stderr(self, message):
    """Write a message (plus newline) to stderr, console-encoded."""
    # Equivalent to `print >>sys.stderr, ...`: encode, then newline.
    sys.stderr.write(message.encode(preferredencoding()) + '\n')
def to_cons_title(self, message):
    """Set console/terminal window title to message."""
    if not self.params.get('consoletitle', False):
        # (elided line — presumably an early return)
    if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
        # c_wchar_p() might not be necessary if `message` is
        # already of type unicode()
        ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
    elif 'TERM' in os.environ:
        # xterm/OSC escape sequence for setting the window title.
        sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
615 def fixed_template(self):
616 """Checks if the output template is fixed."""
617 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
def trouble(self, message=None):
    """Determine action to take when a download problem appears.

    Depending on if the downloader has been configured to ignore
    download errors or not, this method may throw an exception or
    not when errors are found, after printing the message.
    """
    if message is not None:
        self.to_stderr(message)
    if not self.params.get('ignoreerrors', False):
        raise DownloadError(message)
    # Reached only with ignoreerrors: record failure in the exit code.
    self._download_retcode = 1

def slow_down(self, start_time, byte_counter):
    """Sleep if the download speed is over the rate limit."""
    rate_limit = self.params.get('ratelimit', None)
    if rate_limit is None or byte_counter == 0:
        # (elided line — presumably an early return)
    # (elided — `now` is assigned between here and its first use)
    elapsed = now - start_time
    # (elided line(s) — presumably an elapsed <= 0 guard)
    speed = float(byte_counter) / elapsed
    if speed > rate_limit:
        # Sleep long enough that the average speed drops to the limit.
        time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

def temp_name(self, filename):
    """Returns a temporary filename for the given filename."""
    # No .part file for stdout, explicit nopart, or non-regular targets.
    if self.params.get('nopart', False) or filename == u'-' or \
            (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
        # (elided line — presumably returns filename unchanged)
    return filename + u'.part'

def undo_temp_name(self, filename):
    # Strip the '.part' suffix added by temp_name(), if present.
    if filename.endswith(u'.part'):
        return filename[:-len(u'.part')]
    # (elided line — presumably returns filename unchanged)

def try_rename(self, old_filename, new_filename):
    # (elided lines — a same-name early return and a try opener)
    if old_filename == new_filename:
    os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
    except (IOError, OSError), err:
        self.trouble(u'ERROR: unable to rename file')

def try_utime(self, filename, last_modified_hdr):
    """Try to set the last-modified time of the given file."""
    if last_modified_hdr is None:
        # (elided line — presumably an early return)
    if not os.path.isfile(_encodeFilename(filename)):
        # (elided line — presumably an early return)
    timestr = last_modified_hdr
    # (elided line(s))
    filetime = timeconvert(timestr)
    # (elided line(s) — presumably a None check and a try opener)
    os.utime(filename, (time.time(), filetime))
    # (elided — return value and exception handling are missing here)
def report_writedescription(self, descfn):
    """Report that the description file is being written."""
    self.to_screen(u'[info] Writing video description to: ' + descfn)

def report_writesubtitles(self, srtfn):
    """Report that the subtitles file is being written."""
    self.to_screen(u'[info] Writing video subtitles to: ' + srtfn)

def report_writeinfojson(self, infofn):
    """Report that the metadata file has been written."""
    self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)
def report_destination(self, filename):
    """Announce the output filename for the current download."""
    message = u'[download] Destination: ' + filename
    self.to_screen(message)
def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
    """Report download progress."""
    if self.params.get('noprogress', False):
        # (elided line — presumably an early return)
    # '\r' + skip_eol keeps the progress line updating in place.
    self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
            (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
    self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
            (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))

def report_resuming_byte(self, resume_len):
    """Report attempt to resume at given byte."""
    self.to_screen(u'[download] Resuming download at byte %s' % resume_len)

def report_retry(self, count, retries):
    """Report retry in case of HTTP error 5xx"""
    self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))

def report_file_already_downloaded(self, file_name):
    """Report file has already been fully downloaded."""
    # (elided — a try opener; the filename may not be representable in
    # the console encoding, hence the UnicodeEncodeError fallback below)
    self.to_screen(u'[download] %s has already been downloaded' % file_name)
    except (UnicodeEncodeError), err:
        self.to_screen(u'[download] The file has already been downloaded')
def report_unable_to_resume(self):
    """Log that the partial download could not be resumed."""
    notice = u'[download] Unable to resume'
    self.to_screen(notice)
def report_finish(self):
    """Report download finished."""
    if self.params.get('noprogress', False):
        self.to_screen(u'[download] Download completed')
    # (elided — the else branch, which finishes the progress line)
def increment_downloads(self):
    """Advance the ordinal used to number downloaded files."""
    self._num_downloads = self._num_downloads + 1
def prepare_filename(self, info_dict):
    """Generate the output filename."""
    # (elided — a try opener for the template expansion below)
    template_dict = dict(info_dict)
    # Synthesized template fields available in outtmpl.
    template_dict['epoch'] = unicode(long(time.time()))
    template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
    filename = self.params['outtmpl'] % template_dict
    # (elided line — presumably `return filename`)
    except (ValueError, KeyError), err:
        self.trouble(u'ERROR: invalid system charset or erroneous output template')
        # (elided line — presumably `return None`)

def _match_entry(self, info_dict):
    """ Returns None iff the file should be downloaded """
    title = info_dict['title']
    matchtitle = self.params.get('matchtitle', False)
    if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
        return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
    rejecttitle = self.params.get('rejecttitle', False)
    if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
        return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
    # (elided line — presumably `return None`)
def process_info(self, info_dict):
    """Process a single dictionary returned by an InfoExtractor."""
    # NOTE(review): many original lines are elided in this chunk (try
    # openers, early returns, file .close() calls); structure is partial.
    reason = self._match_entry(info_dict)
    if reason is not None:
        self.to_screen(u'[download] ' + reason)
        # (elided line — presumably a return)

    max_downloads = self.params.get('max_downloads')
    if max_downloads is not None:
        if self._num_downloads > int(max_downloads):
            raise MaxDownloadsReached()

    filename = self.prepare_filename(info_dict)

    # Forced printing: emit the requested field(s) on stdout.
    if self.params.get('forcetitle', False):
        print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
    if self.params.get('forceurl', False):
        print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
    if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
        print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
    if self.params.get('forcedescription', False) and 'description' in info_dict:
        print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
    if self.params.get('forcefilename', False) and filename is not None:
        print filename.encode(preferredencoding(), 'xmlcharrefreplace')
    if self.params.get('forceformat', False):
        print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')

    # Do nothing else if in simulate mode
    if self.params.get('simulate', False):
        # (elided line — presumably a return)

    dn = os.path.dirname(_encodeFilename(filename))
    if dn != '' and not os.path.exists(dn): # dn is already encoded
        # (elided — presumably os.makedirs(dn) inside a try)
    except (OSError, IOError), err:
        self.trouble(u'ERROR: unable to create directory ' + unicode(err))
        # (elided line — presumably a return)

    if self.params.get('writedescription', False):
        # (elided — a try opener)
        descfn = filename + u'.description'
        self.report_writedescription(descfn)
        descfile = open(_encodeFilename(descfn), 'wb')
        descfile.write(info_dict['description'].encode('utf-8'))
        # (elided — presumably finally: descfile.close())
    except (OSError, IOError):
        self.trouble(u'ERROR: Cannot write description file ' + descfn)
        # (elided line — presumably a return)

    if self.params.get('writesubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']:
        # subtitles download errors are already managed as troubles in relevant IE
        # that way it will silently go on when used with unsupporting IE
        srtfn = filename.rsplit('.', 1)[0] + u'.srt'
        self.report_writesubtitles(srtfn)
        srtfile = open(_encodeFilename(srtfn), 'wb')
        srtfile.write(info_dict['subtitles'].encode('utf-8'))
        # (elided — presumably finally: srtfile.close())
    except (OSError, IOError):
        # BUG(review): the message references `descfn`, not `srtfn` —
        # looks like a copy/paste slip; worth fixing in the full file.
        self.trouble(u'ERROR: Cannot write subtitles file ' + descfn)
        # (elided line — presumably a return)

    if self.params.get('writeinfojson', False):
        infofn = filename + u'.info.json'
        self.report_writeinfojson(infofn)
        # (elided — a try that probes for a usable `json` module)
        except (NameError,AttributeError):
            self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
            # (elided line — presumably a return)
        infof = open(_encodeFilename(infofn), 'wb')
        # 'urlhandle' holds a live response object and is not serializable.
        json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
        json.dump(json_info_dict, infof)
        # (elided — presumably finally: infof.close())
    except (OSError, IOError):
        self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
        # (elided line — presumably a return)

    if not self.params.get('skip_download', False):
        if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
            # (elided — the skip branch, and a try opener)
            success = self._do_download(filename, info_dict)
        except (OSError, IOError), err:
            raise UnavailableVideoError
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self.trouble(u'ERROR: unable to download video data: %s' % str(err))
            # (elided line — presumably a return)
        except (ContentTooShortError, ), err:
            self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
            # (elided line — presumably a return)

        # (elided — presumably `if success:` guarding a try opener)
        self.post_process(filename, info_dict)
        except (PostProcessingError), err:
            self.trouble(u'ERROR: postprocessing: %s' % str(err))
            # (elided line — presumably a return)
def download(self, url_list):
    """Download a given list of URLs."""
    # A fixed (placeholder-free) template can only name a single file.
    if len(url_list) > 1 and self.fixed_template():
        raise SameFileError(self.params['outtmpl'])

    # (elided — the `for url in url_list:` / `for ie in ...:` loop headers)
    suitable_found = False
    # Go to next InfoExtractor if not suitable
    if not ie.suitable(url):
        # (elided line — presumably a continue)

    # Suitable InfoExtractor found
    suitable_found = True

    # Extract information from URL and process it
    # (elided — the ie.extract(url) call and loop break)

    # Suitable InfoExtractor had been found; go to next URL
    if not suitable_found:
        self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

    return self._download_retcode

def post_process(self, filename, ie_info):
    """Run the postprocessing chain on the given file."""
    # (elided — `info` is built from ie_info, then each PostProcessor in
    # the chain is run over it until one returns None)
    info['filepath'] = filename
def _download_with_rtmpdump(self, filename, url, player_url):
    """Download an rtmp:// URL by shelling out to the rtmpdump tool."""
    self.report_destination(filename)
    tmpfilename = self.temp_name(filename)

    # Check for rtmpdump first
    # (elided — a try opener around the probe call below)
    subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
    except (OSError, IOError):
        self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
        # (elided line — presumably returns False)

    # Download using rtmpdump. rtmpdump returns exit code 2 when
    # the connection was interrumpted and resuming appears to be
    # possible. This is part of rtmpdump's normal usage, AFAIK.
    basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
    args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
    if self.params.get('verbose', False):
        # (elided — a try/except around importing `pipes`)
        shell_quote = lambda args: ' '.join(map(pipes.quote, args))
        self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
    retval = subprocess.call(args)
    # Keep re-invoking rtmpdump in resume mode while it reports a
    # resumable interruption (2) or incomplete transfer (1).
    while retval == 2 or retval == 1:
        prevsize = os.path.getsize(_encodeFilename(tmpfilename))
        self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
        time.sleep(5.0) # This seems to be needed
        retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
        cursize = os.path.getsize(_encodeFilename(tmpfilename))
        if prevsize == cursize and retval == 1:
            # (elided line — presumably a break: no progress was made)
        # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
        if prevsize == cursize and retval == 2 and cursize > 1024:
            self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
            # (elided — presumably treats this as success and breaks)
    # (elided — a success check on retval before the two lines below)
    self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
    self.try_rename(tmpfilename, filename)
    # (elided — presumably return True, with the failure path below)
    self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
    # (elided line — presumably returns False)
def _do_download(self, filename, info_dict):
    """Download info_dict['url'] to filename (HTTP, or rtmpdump for rtmp).

    NOTE(review): many original lines are elided in this chunk (loop
    headers, try openers, returns, counters); control flow is partial.
    """
    url = info_dict['url']
    player_url = info_dict.get('player_url', None)

    # Check file already present
    if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
        self.report_file_already_downloaded(filename)
        # (elided line — presumably returns True)

    # Attempt to download using rtmpdump
    if url.startswith('rtmp'):
        return self._download_with_rtmpdump(filename, url, player_url)

    tmpfilename = self.temp_name(filename)

    # Do not include the Accept-Encoding header
    headers = {'Youtubedl-no-compression': 'True'}
    # basic_request is kept range-free for the 416 fallback below.
    basic_request = urllib2.Request(url, None, headers)
    request = urllib2.Request(url, None, headers)

    # Establish possible resume length
    if os.path.isfile(_encodeFilename(tmpfilename)):
        resume_len = os.path.getsize(_encodeFilename(tmpfilename))
    # (elided — presumably the else branch zeroing resume_len, and the
    # open_mode selection)

    if self.params.get('continuedl', False):
        self.report_resuming_byte(resume_len)
        request.add_header('Range','bytes=%d-' % resume_len)
    # (elided — presumably the non-resume else branch)

    # (elided — presumably count = 0)
    retries = self.params.get('retries', 0)
    while count <= retries:
        # Establish connection
        # (elided — a try opener)
        if count == 0 and 'urlhandle' in info_dict:
            # Reuse a connection the InfoExtractor already opened.
            data = info_dict['urlhandle']
        data = urllib2.urlopen(request)
        # (elided line — presumably a break on success)
        except (urllib2.HTTPError, ), err:
            if (err.code < 500 or err.code >= 600) and err.code != 416:
                # Unexpected HTTP error
                # (elided line — presumably re-raises)
            elif err.code == 416:
                # Unable to resume (requested range not satisfiable)
                # Open the connection again without the range header
                data = urllib2.urlopen(basic_request)
                content_length = data.info()['Content-Length']
                except (urllib2.HTTPError, ), err:
                    if err.code < 500 or err.code >= 600:
                        # (elided line — presumably re-raises)
                # Examine the reported length
                if (content_length is not None and
                        (resume_len - 100 < long(content_length) < resume_len + 100)):
                    # The file had already been fully downloaded.
                    # Explanation to the above condition: in issue #175 it was revealed that
                    # YouTube sometimes adds or removes a few bytes from the end of the file,
                    # changing the file size slightly and causing problems for some users. So
                    # I decided to implement a suggested change and consider the file
                    # completely downloaded if the file size differs less than 100 bytes from
                    # the one in the hard drive.
                    self.report_file_already_downloaded(filename)
                    self.try_rename(tmpfilename, filename)
                    # (elided line — presumably returns True)
                # The length does not match, we start the download over
                self.report_unable_to_resume()
                # (elided — presumably resets open_mode / resume_len)
        # (elided — presumably count += 1)
        if count <= retries:
            self.report_retry(count, retries)
    # (elided — presumably a `count > retries` guard before giving up)
    self.trouble(u'ERROR: giving up after %s retries' % retries)
    # (elided line — presumably returns False)

    data_len = data.info().get('Content-length', None)
    if data_len is not None:
        # Account for the already-downloaded prefix when resuming.
        data_len = long(data_len) + resume_len
    data_len_str = self.format_bytes(data_len)
    byte_counter = 0 + resume_len
    # (elided — block_size / start / stream initialisation and the
    # `while True:` header of the read loop below)

    # Download and write
    before = time.time()
    data_block = data.read(block_size)
    # (elided — presumably after = time.time())
    if len(data_block) == 0:
        # (elided line — presumably breaks: EOF reached)
    byte_counter += len(data_block)

    # Open file just in time
    # (elided — presumably `if stream is None:` plus a try opener)
    (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
    assert stream is not None
    # sanitize_open may have tweaked the temp name; track the final name.
    filename = self.undo_temp_name(tmpfilename)
    self.report_destination(filename)
    except (OSError, IOError), err:
        self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
        # (elided line — presumably returns False)
    # (elided — a try opener)
    stream.write(data_block)
    except (IOError, OSError), err:
        self.trouble(u'\nERROR: unable to write data: %s' % str(err))
        # (elided line — presumably returns False)
    # Adapt the next read size to the measured throughput.
    block_size = self.best_block_size(after - before, len(data_block))

    speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
    if data_len is None:
        self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
    # (elided — presumably the else branch for the three lines below)
    percent_str = self.calc_percent(byte_counter, data_len)
    eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
    self.report_progress(percent_str, data_len_str, speed_str, eta_str)

    # Apply rate limit
    self.slow_down(start, byte_counter - resume_len)

    # (elided — presumably a `stream is None` guard and stream.close())
    self.trouble(u'\nERROR: Did not get any data blocks')
    self.report_finish()
    if data_len is not None and byte_counter != data_len:
        raise ContentTooShortError(byte_counter, long(data_len))
    self.try_rename(tmpfilename, filename)

    # Update file modification time
    if self.params.get('updatetime', True):
        info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))

    # (elided line — presumably returns True)
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    the following fields:

    id:         Video identifier.
    url:        Final video URL.
    uploader:   Nickname of the video uploader.
    title:      Literal title.
    stitle:     Simplified title.
    ext:        Video filename extension.
    format:     Video format.
    player_url: SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail:   Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.
    """

    # (elided — class attributes, e.g. a readiness flag, are missing here)

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # (elided line between the docstring and the call below)
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # (elided — presumably guarded so it runs only once per instance)
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # (elided line — presumably self.initialize())
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        # (elided — presumably pass)

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        # (elided — presumably pass)
1172 class YoutubeIE(InfoExtractor):
1173 """Information extractor for youtube.com."""
# URL recognizer: accepts youtu.be, youtube.com and youtube-nocookie.com
# watch/embed/v forms; group 2 of a match is the video id (see _real_extract).
1175 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
# Endpoints used by _real_initialize: force-English page, login form, age gate.
1176 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1177 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1178 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
# Machine name looked up in the user's ~/.netrc for credentials.
1179 _NETRC_MACHINE = 'youtube'
1180 # Listed in order of quality
1181 _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1182 _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
# itag -> container extension map (most entries elided in this view).
1183 _video_extensions = {
1189 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
# itag -> "WxH" display string map used by _print_formats (entries elided in this view).
1194 _video_dimensions = {
1209 IE_NAME = u'youtube'
def report_lang(self):
    """Report attempt to set language."""
    self._downloader.to_screen(u'[youtube] Setting language')

def report_login(self):
    """Report attempt to log in."""
    self._downloader.to_screen(u'[youtube] Logging in')

def report_age_confirmation(self):
    """Report attempt to confirm age."""
    self._downloader.to_screen(u'[youtube] Confirming age')

def report_video_webpage_download(self, video_id):
    """Report attempt to download video webpage."""
    self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

def report_video_info_webpage_download(self, video_id):
    """Report attempt to download video info webpage."""
    self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

def report_video_subtitles_download(self, video_id):
    # FIX: docstring was copy-pasted from the info-webpage reporter.
    """Report attempt to download video subtitles."""
    self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

def report_information_extraction(self, video_id):
    """Report attempt to extract video information."""
    self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

def report_unavailable_format(self, video_id, format):
    # FIX: docstring said "Report extracted video URL." (copy-paste error).
    """Report that the requested format is not available."""
    self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

def report_rtmp_download(self):
    """Indicate the download will use the RTMP protocol."""
    self._downloader.to_screen(u'[youtube] RTMP download detected')
1247 def _closed_captions_xml_to_srt(self, xml_string):
1249 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
1250 # TODO parse xml instead of regex
1251 for n, (start, dur_tag, dur, caption) in enumerate(texts):
1252 if not dur: dur = '4'
1253 start = float(start)
1254 end = start + float(dur)
1255 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
1256 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
1257 caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
1258 caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, inentional
1259 srt += str(n) + '\n'
1260 srt += start + ' --> ' + end + '\n'
1261 srt += caption + '\n\n'
1264 def _print_formats(self, formats):
1265 print 'Available formats:'
1267 print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1269 def _real_initialize(self):
# Forces the YouTube UI language to English (so later regexes match) and,
# when credentials are available (options or ~/.netrc), logs in and
# confirms age.  NOTE(review): several structural lines (try:/return/
# defaults) are elided in this view -- verify against the full file.
1270 if self._downloader is None:
1275 downloader_params = self._downloader.params
1277 # Attempt to use provided username and password or .netrc data
1278 if downloader_params.get('username', None) is not None:
1279 username = downloader_params['username']
1280 password = downloader_params['password']
1281 elif downloader_params.get('usenetrc', False):
1283 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1284 if info is not None:
1288 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1289 except (IOError, netrc.NetrcParseError), err:
# .netrc problems are non-fatal: warn and continue without credentials.
1290 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# Set language cookie-side via the hl=en URL.
1294 request = urllib2.Request(self._LANG_URL)
1297 urllib2.urlopen(request).read()
1298 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1299 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1302 # No authentication to be performed
1303 if username is None:
# Log in: POST the signup form fields to _LOGIN_URL.
1308 'current_form': 'loginForm',
1310 'action_login': 'Log In',
1311 'username': username,
1312 'password': password,
1314 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1317 login_results = urllib2.urlopen(request).read()
# If the login form is still present in the response, login failed.
1318 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1319 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1321 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1322 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
# Confirm age via the verify_age form.
1328 'action_confirm': 'Confirm',
1330 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1332 self.report_age_confirmation()
1333 age_results = urllib2.urlopen(request).read()
1334 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1335 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1338 def _real_extract(self, url):
# Full YouTube extraction: download the watch page and get_video_info,
# parse metadata (uploader, title, thumbnail, date, description,
# subtitles), choose the requested format(s), and hand each selection to
# self._downloader.process_info().  NOTE(review): many structural lines
# (try:/else:/return/break) are elided in this view -- verify against the
# full file before editing.
1339 # Extract video id from URL
1340 mobj = re.match(self._VALID_URL, url)
1342 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1344 video_id = mobj.group(2)
# Download the watch page (has_verified=1 skips some interstitials).
1347 self.report_video_webpage_download(video_id)
1348 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1350 video_webpage = urllib2.urlopen(request).read()
1351 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1352 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1355 # Attempt to extract SWF player URL
1356 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1357 if mobj is not None:
# Unescape the JSON-style backslash escapes in the SWF URL.
1358 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Try several el= variants of get_video_info until one returns a token.
1363 self.report_video_info_webpage_download(video_id)
1364 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1365 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1366 % (video_id, el_type))
1367 request = urllib2.Request(video_info_url)
1369 video_info_webpage = urllib2.urlopen(request).read()
1370 video_info = parse_qs(video_info_webpage)
1371 if 'token' in video_info:
1373 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1374 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1376 if 'token' not in video_info:
1377 if 'reason' in video_info:
1378 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1380 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1383 # Start extracting information
1384 self.report_information_extraction(video_id)
# Uploader nickname.
1387 if 'author' not in video_info:
1388 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1390 video_uploader = urllib.unquote_plus(video_info['author'][0])
# Title (literal + simplified).
1393 if 'title' not in video_info:
1394 self._downloader.trouble(u'ERROR: unable to extract video title')
1396 video_title = urllib.unquote_plus(video_info['title'][0])
1397 video_title = video_title.decode('utf-8')
1398 video_title = sanitize_title(video_title)
1401 simple_title = _simplify_title(video_title)
# Thumbnail is optional: fall back to empty string rather than aborting.
1404 if 'thumbnail_url' not in video_info:
1405 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1406 video_thumbnail = ''
1407 else: # don't panic if we can't find it
1408 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
# Upload date: scrape the eow-date span and normalize to YYYYMMDD.
1412 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1413 if mobj is not None:
1414 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1415 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1416 for expression in format_expressions:
1418 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
# Description: meta tag first, then (elided branch) an lxml xpath lookup.
1426 video_description = u'No description available.'
1427 mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
1428 if mobj is not None:
1429 video_description = mobj.group(1).decode('utf-8')
1431 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1432 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1433 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1434 # TODO use another parser
# Closed captions: list available languages, pick requested/en/first, fetch
# the timedtext XML and convert to SRT.
1437 video_subtitles = None
1438 if self._downloader.params.get('writesubtitles', False):
1439 self.report_video_subtitles_download(video_id)
1440 request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
1442 srt_list = urllib2.urlopen(request).read()
1443 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1444 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1446 srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
1448 if self._downloader.params.get('subtitleslang', False):
1449 srt_lang = self._downloader.params.get('subtitleslang')
1450 elif 'en' in srt_lang_list:
1453 srt_lang = srt_lang_list[0]
1454 if not srt_lang in srt_lang_list:
1455 self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
1457 request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
1459 srt_xml = urllib2.urlopen(request).read()
1460 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1461 self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
1463 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
1465 self._downloader.trouble(u'WARNING: video has no closed captions')
# NOTE(review): video_token is not used again in this view -- confirm
# against the full file before removing.
1468 video_token = urllib.unquote_plus(video_info['token'][0])
1470 # Decide which formats to download
1471 req_format = self._downloader.params.get('format', None)
# RTMP streams carry their URL in 'conn'; otherwise parse the
# url_encoded_fmt_stream_map into an itag -> URL map.
1473 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1474 self.report_rtmp_download()
1475 video_url_list = [(None, video_info['conn'][0])]
1476 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1477 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1478 url_data = [parse_qs(uds) for uds in url_data_strs]
1479 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1480 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1482 format_limit = self._downloader.params.get('format_limit', None)
1483 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1484 if format_limit is not None and format_limit in available_formats:
1485 format_list = available_formats[available_formats.index(format_limit):]
1487 format_list = available_formats
1488 existing_formats = [x for x in format_list if x in url_map]
1489 if len(existing_formats) == 0:
1490 self._downloader.trouble(u'ERROR: no known formats available for video')
1492 if self._downloader.params.get('listformats', None):
1493 self._print_formats(existing_formats)
1495 if req_format is None or req_format == 'best':
1496 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1497 elif req_format == 'worst':
1498 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1499 elif req_format in ('-1', 'all'):
1500 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1502 # Specific formats. We pick the first in a slash-delimited sequence.
1503 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1504 req_formats = req_format.split('/')
1505 video_url_list = None
1506 for rf in req_formats:
1508 video_url_list = [(rf, url_map[rf])]
1510 if video_url_list is None:
1511 self._downloader.trouble(u'ERROR: requested format not available')
1514 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
# Hand every selected (format, URL) pair to the downloader.
1517 for format_param, video_real_url in video_url_list:
1518 # At this point we have a new video
1519 self._downloader.increment_downloads()
1522 video_extension = self._video_extensions.get(format_param, 'flv')
1525 # Process video information
1526 self._downloader.process_info({
1527 'id': video_id.decode('utf-8'),
1528 'url': video_real_url.decode('utf-8'),
1529 'uploader': video_uploader.decode('utf-8'),
1530 'upload_date': upload_date,
1531 'title': video_title,
1532 'stitle': simple_title,
1533 'ext': video_extension.decode('utf-8'),
1534 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1535 'thumbnail': video_thumbnail.decode('utf-8'),
1536 'description': video_description,
1537 'player_url': player_url,
1538 'subtitles': video_subtitles
1540 except UnavailableVideoError, err:
1541 self._downloader.trouble(u'\nERROR: unable to download video')
1544 class MetacafeIE(InfoExtractor):
1545 """Information Extractor for metacafe.com."""
# group 1: video id (may be 'yt-<id>' for delegated YouTube videos),
# group 2: simplified title slug (see _real_extract).
1547 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
# Endpoints used by _real_initialize to disable the family filter.
1548 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1549 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1551 IE_NAME = u'metacafe'
def __init__(self, youtube_ie, downloader=None):
    """Constructor; keeps the YouTube IE used for delegated 'yt-' videos."""
    InfoExtractor.__init__(self, downloader)
    self._youtube_ie = youtube_ie

def report_disclaimer(self):
    """Report disclaimer retrieval."""
    message = u'[metacafe] Retrieving disclaimer'
    self._downloader.to_screen(message)

def report_age_confirmation(self):
    """Report attempt to confirm age."""
    message = u'[metacafe] Confirming age'
    self._downloader.to_screen(message)

def report_download_webpage(self, video_id):
    """Report webpage download."""
    message = u'[metacafe] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)

def report_extraction(self, video_id):
    """Report information extraction."""
    message = u'[metacafe] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
1573 def _real_initialize(self):
# Fetches the family-filter disclaimer page, then POSTs the "over 18"
# form so that filtered videos become accessible.  NOTE(review): the
# try:/return lines and the rest of disclaimer_form are elided in this view.
1574 # Retrieve disclaimer
1575 request = urllib2.Request(self._DISCLAIMER)
1577 self.report_disclaimer()
1578 disclaimer = urllib2.urlopen(request).read()
1579 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1580 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
# Confirm age by submitting the filter form.
1586 'submit': "Continue - I'm over 18",
1588 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1590 self.report_age_confirmation()
1591 disclaimer = urllib2.urlopen(request).read()
1592 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1593 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1596 def _real_extract(self, url):
# Extracts the media URL and metadata from a metacafe watch page, or
# delegates 'yt-' ids to the YouTube IE.  NOTE(review): structural lines
# (if mobj is None:/try:/return/else:) are elided in this view.
1597 # Extract id and simplified title from URL
1598 mobj = re.match(self._VALID_URL, url)
1600 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1603 video_id = mobj.group(1)
1605 # Check if video comes from YouTube
1606 mobj2 = re.match(r'^yt-(.*)$', video_id)
1607 if mobj2 is not None:
# Delegate to the YouTube IE; the early return after this call is elided
# in this view -- confirm against the full file.
1608 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1611 # At this point we have a new video
1612 self._downloader.increment_downloads()
1614 simple_title = mobj.group(2).decode('utf-8')
1616 # Retrieve video webpage to extract further information
1617 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1619 self.report_download_webpage(video_id)
1620 webpage = urllib2.urlopen(request).read()
1621 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1622 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1625 # Extract URL, uploader and title from webpage
1626 self.report_extraction(video_id)
# Old-style pages expose &mediaURL= directly; newer ones embed it in the
# flashvars mediaData JSON (second branch below).
1627 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1628 if mobj is not None:
1629 mediaURL = urllib.unquote(mobj.group(1))
1630 video_extension = mediaURL[-3:]
1632 # Extract gdaKey if available
1633 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1635 video_url = mediaURL
1637 gdaKey = mobj.group(1)
1638 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1640 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1642 self._downloader.trouble(u'ERROR: unable to extract media URL')
1644 vardict = parse_qs(mobj.group(1))
1645 if 'mediaData' not in vardict:
1646 self._downloader.trouble(u'ERROR: unable to extract media URL')
1648 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1650 self._downloader.trouble(u'ERROR: unable to extract media URL')
# Unescape the JSON '\/' sequences and append the access key.
1652 mediaURL = mobj.group(1).replace('\\/', '/')
1653 video_extension = mediaURL[-3:]
1654 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1656 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1658 self._downloader.trouble(u'ERROR: unable to extract title')
1660 video_title = mobj.group(1).decode('utf-8')
1661 video_title = sanitize_title(video_title)
1663 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1665 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1667 video_uploader = mobj.group(1)
1670 # Process video information
1671 self._downloader.process_info({
1672 'id': video_id.decode('utf-8'),
1673 'url': video_url.decode('utf-8'),
1674 'uploader': video_uploader.decode('utf-8'),
1675 'upload_date': u'NA',
1676 'title': video_title,
1677 'stitle': simple_title,
1678 'ext': video_extension.decode('utf-8'),
1682 except UnavailableVideoError:
1683 self._downloader.trouble(u'\nERROR: unable to download video')
1686 class DailymotionIE(InfoExtractor):
1687 """Information Extractor for Dailymotion"""
# group 1: video id (part before the '_'), group 2: title slug; matches
# any dailymotion national TLD (2-3 letters).
1689 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1690 IE_NAME = u'dailymotion'
def __init__(self, downloader=None):
    """Constructor; delegates to the InfoExtractor base class."""
    InfoExtractor.__init__(self, downloader)

def report_download_webpage(self, video_id):
    """Report webpage download."""
    message = u'[dailymotion] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)

def report_extraction(self, video_id):
    """Report information extraction."""
    message = u'[dailymotion] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
1703 def _real_extract(self, url):
# Extracts the SD media URL and metadata from a Dailymotion video page.
# NOTE(review): structural lines (if mobj is None:/try:/return) are
# elided in this view -- verify against the full file.
1704 # Extract id and simplified title from URL
1705 mobj = re.match(self._VALID_URL, url)
1707 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1710 # At this point we have a new video
1711 self._downloader.increment_downloads()
1712 video_id = mobj.group(1)
1714 video_extension = 'flv'
1716 # Retrieve video webpage to extract further information
1717 request = urllib2.Request(url)
# Disable the family filter so age-gated videos are served.
1718 request.add_header('Cookie', 'family_filter=off')
1720 self.report_download_webpage(video_id)
1721 webpage = urllib2.urlopen(request).read()
1722 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1723 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1726 # Extract URL, uploader and title from webpage
1727 self.report_extraction(video_id)
# The player's "sequence" flashvar contains the URL-encoded stream JSON.
1728 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1730 self._downloader.trouble(u'ERROR: unable to extract media URL')
1732 sequence = urllib.unquote(mobj.group(1))
1733 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1735 self._downloader.trouble(u'ERROR: unable to extract media URL')
1737 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1739 # if needed add http://www.dailymotion.com/ if relative URL
1741 video_url = mediaURL
1743 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
1745 self._downloader.trouble(u'ERROR: unable to extract title')
1747 video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
1748 video_title = sanitize_title(video_title)
1749 simple_title = _simplify_title(video_title)
1751 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1753 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1755 video_uploader = mobj.group(1)
1758 # Process video information
1759 self._downloader.process_info({
1760 'id': video_id.decode('utf-8'),
1761 'url': video_url.decode('utf-8'),
1762 'uploader': video_uploader.decode('utf-8'),
1763 'upload_date': u'NA',
1764 'title': video_title,
1765 'stitle': simple_title,
1766 'ext': video_extension.decode('utf-8'),
1770 except UnavailableVideoError:
1771 self._downloader.trouble(u'\nERROR: unable to download video')
1774 class GoogleIE(InfoExtractor):
1775 """Information extractor for video.google.com."""
# group 1: docid query parameter; matches the national Google Video domains.
1777 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1778 IE_NAME = u'video.google'
def __init__(self, downloader=None):
    """Constructor; delegates to the InfoExtractor base class."""
    InfoExtractor.__init__(self, downloader)

def report_download_webpage(self, video_id):
    """Report webpage download."""
    message = u'[video.google] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)

def report_extraction(self, video_id):
    """Report information extraction."""
    message = u'[video.google] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
1791 def _real_extract(self, url):
# Extracts the media URL, title and description from a Google Video page.
# NOTE(review): structural lines (if mobj is None:/try:/return/else:) are
# elided in this view -- verify against the full file.
1792 # Extract id from URL
1793 mobj = re.match(self._VALID_URL, url)
1795 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1798 # At this point we have a new video
1799 self._downloader.increment_downloads()
1800 video_id = mobj.group(1)
1802 video_extension = 'mp4'
1804 # Retrieve video webpage to extract further information
1805 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1807 self.report_download_webpage(video_id)
1808 webpage = urllib2.urlopen(request).read()
1809 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1810 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1813 # Extract URL, uploader, and title from webpage
1814 self.report_extraction(video_id)
# Prefer the mp4 download_url; fall back to the escaped flv videoUrl.
1815 mobj = re.search(r"download_url:'([^']+)'", webpage)
1817 video_extension = 'flv'
1818 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1820 self._downloader.trouble(u'ERROR: unable to extract media URL')
1822 mediaURL = urllib.unquote(mobj.group(1))
# Undo the JavaScript \x3d / \x26 escaping ('=' and '&').
1823 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1824 mediaURL = mediaURL.replace('\\x26', '\x26')
1826 video_url = mediaURL
1828 mobj = re.search(r'<title>(.*)</title>', webpage)
1830 self._downloader.trouble(u'ERROR: unable to extract title')
1832 video_title = mobj.group(1).decode('utf-8')
1833 video_title = sanitize_title(video_title)
1834 simple_title = _simplify_title(video_title)
1836 # Extract video description
1837 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1839 self._downloader.trouble(u'ERROR: unable to extract video description')
1841 video_description = mobj.group(1).decode('utf-8')
1842 if not video_description:
1843 video_description = 'No description available.'
1845 # Extract video thumbnail
# Only fetched when --get-thumbnail forces it; requires a second request.
1846 if self._downloader.params.get('forcethumbnail', False):
1847 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1849 webpage = urllib2.urlopen(request).read()
1850 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1851 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1853 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1855 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1857 video_thumbnail = mobj.group(1)
1858 else: # we need something to pass to process_info
1859 video_thumbnail = ''
1862 # Process video information
# NOTE(review): the 'uploader' entry is elided in this view -- confirm
# against the full file.
1863 self._downloader.process_info({
1864 'id': video_id.decode('utf-8'),
1865 'url': video_url.decode('utf-8'),
1867 'upload_date': u'NA',
1868 'title': video_title,
1869 'stitle': simple_title,
1870 'ext': video_extension.decode('utf-8'),
1874 except UnavailableVideoError:
1875 self._downloader.trouble(u'\nERROR: unable to download video')
1878 class PhotobucketIE(InfoExtractor):
1879 """Information extractor for photobucket.com."""
# group 1: the .flv filename from the 'current' query parameter.
1881 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1882 IE_NAME = u'photobucket'
def __init__(self, downloader=None):
    """Constructor; delegates to the InfoExtractor base class."""
    InfoExtractor.__init__(self, downloader)

def report_download_webpage(self, video_id):
    """Report webpage download."""
    message = u'[photobucket] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)

def report_extraction(self, video_id):
    """Report information extraction."""
    message = u'[photobucket] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
1895 def _real_extract(self, url):
1896 # Extract id from URL
1897 mobj = re.match(self._VALID_URL, url)
1899 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1902 # At this point we have a new video
1903 self._downloader.increment_downloads()
1904 video_id = mobj.group(1)
1906 video_extension = 'flv'
1908 # Retrieve video webpage to extract further information
1909 request = urllib2.Request(url)
1911 self.report_download_webpage(video_id)
1912 webpage = urllib2.urlopen(request).read()
1913 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1914 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1917 # Extract URL, uploader, and title from webpage
1918 self.report_extraction(video_id)
1919 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1921 self._downloader.trouble(u'ERROR: unable to extract media URL')
1923 mediaURL = urllib.unquote(mobj.group(1))
1925 video_url = mediaURL
1927 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1929 self._downloader.trouble(u'ERROR: unable to extract title')
1931 video_title = mobj.group(1).decode('utf-8')
1932 video_title = sanitize_title(video_title)
1933 simple_title = _simplify_title(vide_title)
1935 video_uploader = mobj.group(2).decode('utf-8')
1938 # Process video information
1939 self._downloader.process_info({
1940 'id': video_id.decode('utf-8'),
1941 'url': video_url.decode('utf-8'),
1942 'uploader': video_uploader,
1943 'upload_date': u'NA',
1944 'title': video_title,
1945 'stitle': simple_title,
1946 'ext': video_extension.decode('utf-8'),
1950 except UnavailableVideoError:
1951 self._downloader.trouble(u'\nERROR: unable to download video')
1954 class YahooIE(InfoExtractor):
1955 """Information extractor for video.yahoo.com."""
1957 # _VALID_URL matches all Yahoo! Video URLs
1958 # _VPAGE_URL matches only the extractable '/watch/' URLs
# group 2 of a _VALID_URL match is the video id (see _real_extract).
1959 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1960 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1961 IE_NAME = u'video.yahoo'
def __init__(self, downloader=None):
    """Constructor; delegates to the InfoExtractor base class."""
    InfoExtractor.__init__(self, downloader)

def report_download_webpage(self, video_id):
    """Report webpage download."""
    message = u'[video.yahoo] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)

def report_extraction(self, video_id):
    """Report information extraction."""
    message = u'[video.yahoo] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
1974 def _real_extract(self, url, new_video=True):
1975 # Extract ID from URL
1976 mobj = re.match(self._VALID_URL, url)
1978 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1981 # At this point we have a new video
1982 self._downloader.increment_downloads()
1983 video_id = mobj.group(2)
1984 video_extension = 'flv'
1986 # Rewrite valid but non-extractable URLs as
1987 # extractable English language /watch/ URLs
1988 if re.match(self._VPAGE_URL, url) is None:
1989 request = urllib2.Request(url)
1991 webpage = urllib2.urlopen(request).read()
1992 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1993 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1996 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1998 self._downloader.trouble(u'ERROR: Unable to extract id field')
2000 yahoo_id = mobj.group(1)
2002 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
# NOTE(review): tail of YahooIE._real_extract.  This dump is partial: the
# method's opening lines are above this view and several original lines
# (try:/if mobj is None:/return guards) are elided — only visible lines are
# annotated.  Embedded numbers are the original file's line numbers.
2004 self._downloader.trouble(u'ERROR: Unable to extract vid field')
2006 yahoo_vid = mobj.group(1)
# Rebuild the canonical watch URL and re-enter extraction exactly once
# (new_video=False prevents an infinite redirect loop).
2008 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
2009 return self._real_extract(url, new_video=False)
2011 # Retrieve video webpage to extract further information
2012 request = urllib2.Request(url)
2014 self.report_download_webpage(video_id)
2015 webpage = urllib2.urlopen(request).read()
2016 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2017 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2020 # Extract uploader and title from webpage
2021 self.report_extraction(video_id)
2022 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
2024 self._downloader.trouble(u'ERROR: unable to extract video title')
2026 video_title = mobj.group(1).decode('utf-8')
2027 simple_title = _simplify_title(video_title)
2029 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
2031 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# NOTE(review): group(1) of the regex above captures 'people'/'profile', not
# the display name — the uploader name is group(2); verify and fix upstream.
2033 video_uploader = mobj.group(1).decode('utf-8')
2035 # Extract video thumbnail
2036 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
2038 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2040 video_thumbnail = mobj.group(1).decode('utf-8')
2042 # Extract video description
2043 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
2045 self._downloader.trouble(u'ERROR: unable to extract video description')
2047 video_description = mobj.group(1).decode('utf-8')
2048 if not video_description:
2049 video_description = 'No description available.'
2051 # Extract video height and width
2052 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
2054 self._downloader.trouble(u'ERROR: unable to extract video height')
2056 yv_video_height = mobj.group(1)
2058 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
2060 self._downloader.trouble(u'ERROR: unable to extract video width')
2062 yv_video_width = mobj.group(1)
2064 # Retrieve video playlist to extract media URL
2065 # I'm not completely sure what all these options are, but we
2066 # seem to need most of them, otherwise the server sends a 401.
2067 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
2068 yv_bitrate = '700' # according to Wikipedia this is hard-coded
2069 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
2070 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
2071 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
2073 self.report_download_webpage(video_id)
2074 webpage = urllib2.urlopen(request).read()
2075 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2076 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2079 # Extract media URL from playlist XML
2080 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
2082 self._downloader.trouble(u'ERROR: Unable to extract media URL')
2084 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
# Decode HTML entities (&amp; etc.) embedded in the playlist URL.
2085 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
2088 # Process video information
2089 self._downloader.process_info({
2090 'id': video_id.decode('utf-8'),
2092 'uploader': video_uploader,
2093 'upload_date': u'NA',
2094 'title': video_title,
2095 'stitle': simple_title,
2096 'ext': video_extension.decode('utf-8'),
# NOTE(review): 'thumbnail' appears twice in this dict literal; the later
# entry (undedecoded video_thumbnail) silently wins and this one is discarded.
2097 'thumbnail': video_thumbnail.decode('utf-8'),
2098 'description': video_description,
2099 'thumbnail': video_thumbnail,
2102 except UnavailableVideoError:
2103 self._downloader.trouble(u'\nERROR: unable to download video')
# Extractor for vimeo.com video pages.  Original lines between the embedded
# numbers are elided in this dump.
2106 class VimeoIE(InfoExtractor):
2107 """Information extractor for vimeo.com."""
2109 # _VALID_URL matches Vimeo URLs
# Group 1 captures the numeric clip id.  NOTE(review): the dot in
# '(?:www|player).' is unescaped, so it matches any character — loose but
# harmless; confirm before tightening.
2110 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2113 def __init__(self, downloader=None):
# Delegate to the base class, which presumably records the downloader —
# confirm in InfoExtractor (definition not in view).
2114 InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
    """Announce on the downloader's screen that the Vimeo page for *video_id* is being fetched."""
    message = u'[vimeo] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)
def report_extraction(self, video_id):
    """Announce that metadata extraction for *video_id* has started."""
    message = u'[vimeo] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
# Fetch a Vimeo page, pull the embedded player-config JSON, pick a codec and
# build the play_redirect URL.  NOTE(review): this dump is partial — guard
# lines (try:/if mobj is None:/return) are elided between the numbered lines.
2124 def _real_extract(self, url, new_video=True):
2125 # Extract ID from URL
2126 mobj = re.match(self._VALID_URL, url)
2128 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2131 # At this point we have a new video
2132 self._downloader.increment_downloads()
2133 video_id = mobj.group(1)
2135 # Retrieve video webpage to extract further information
2136 request = urllib2.Request(url, None, std_headers)
2138 self.report_download_webpage(video_id)
2139 webpage = urllib2.urlopen(request).read()
2140 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2141 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2144 # Now we begin extracting as much information as we can from what we
2145 # retrieved. First we extract the information common to all extractors,
2146 # and latter we extract those that are Vimeo specific.
2147 self.report_extraction(video_id)
2149 # Extract the config JSON
# Brittle: slices the raw HTML between ' = {config:' and ',assets:' instead
# of parsing it; any markup change on vimeo.com breaks this.
2150 config = webpage.split(' = {config:')[1].split(',assets:')[0]
2152 config = json.loads(config)
2154 self._downloader.trouble(u'ERROR: unable to extract info section')
2158 video_title = config["video"]["title"]
2159 simple_title = _simplify_title(video_title)
2162 video_uploader = config["video"]["owner"]["name"]
2164 # Extract video thumbnail
2165 video_thumbnail = config["video"]["thumbnail"]
2167 # Extract video description
2171 video_description = u'No description available.'
2172 mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
2173 if mobj is not None:
2174 video_description = mobj.group(1)
# Fallback path via lxml (third-party); only reached on some branch elided
# from this dump.
2176 html_parser = lxml.etree.HTMLParser()
2177 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
2178 video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
2179 # TODO use another parser
2181 # Extract upload date
2182 video_upload_date = u'NA'
2183 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
2184 if mobj is not None:
2185 video_upload_date = mobj.group(1)
2187 # Vimeo specific: extract request signature and timestamp
2188 sig = config['request']['signature']
2189 timestamp = config['request']['timestamp']
2191 # Vimeo specific: extract video codec and quality information
2192 # TODO bind to format param
# Preference order: h264/mp4, then vp8/flv, then vp6/flv.
2193 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
2194 for codec in codecs:
2195 if codec[0] in config["video"]["files"]:
2196 video_codec = codec[0]
2197 video_extension = codec[1]
2198 if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd'
2199 else: quality = 'sd'
2202 self._downloader.trouble(u'ERROR: no known codec found')
2205 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
2206 %(video_id, sig, timestamp, quality, video_codec.upper())
2209 # Process video information
2210 self._downloader.process_info({
2213 'uploader': video_uploader,
2214 'upload_date': video_upload_date,
2215 'title': video_title,
2216 'stitle': simple_title,
2217 'ext': video_extension,
2218 'thumbnail': video_thumbnail,
2219 'description': video_description,
2222 except UnavailableVideoError:
2223 self._downloader.trouble(u'ERROR: unable to download video')
# Last-resort extractor: scrapes any page for a direct media URL.
2226 class GenericIE(InfoExtractor):
2227 """Generic last-resort information extractor."""
2230 IE_NAME = u'generic'
2232 def __init__(self, downloader=None):
# Delegate to the base class (definition not in view).
2233 InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
    """Warn that the generic fallback is in use, then announce the page download."""
    for message in (u'WARNING: Falling back on generic information extractor.',
                    u'[generic] %s: Downloading webpage' % video_id):
        self._downloader.to_screen(message)
def report_extraction(self, video_id):
    """Announce that metadata extraction for *video_id* has started."""
    message = u'[generic] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
# Scrape an arbitrary page for a 'file='/'source=' media URL (JW Player
# style).  NOTE(review): dump is partial — try:/if mobj is None:/return
# guards are elided between the numbered lines.
2244 def _real_extract(self, url):
2245 # At this point we have a new video
2246 self._downloader.increment_downloads()
2248 video_id = url.split('/')[-1]
2249 request = urllib2.Request(url)
2251 self.report_download_webpage(video_id)
2252 webpage = urllib2.urlopen(request).read()
2253 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2254 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2256 except ValueError, err:
2257 # since this is the last-resort InfoExtractor, if
2258 # this error is thrown, it'll be thrown here
2259 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2262 self.report_extraction(video_id)
2263 # Start with something easy: JW Player in SWFObject
2264 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2266 # Broaden the search a little bit
2267 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2269 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2272 # It's possible that one of the regexes
2273 # matched, but returned an empty group:
2274 if mobj.group(1) is None:
2275 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2278 video_url = urllib.unquote(mobj.group(1))
2279 video_id = os.path.basename(video_url)
2281 # here's a fun little line of code for you:
2282 video_extension = os.path.splitext(video_id)[1][1:]
2283 video_id = os.path.splitext(video_id)[0]
2285 # it's tempting to parse this further, but you would
2286 # have to take into account all the variations like
2287 # Video Title - Site Name
2288 # Site Name | Video Title
2289 # Video Title - Tagline | Site Name
2290 # and so on and so forth; it's just not practical
2291 mobj = re.search(r'<title>(.*)</title>', webpage)
2293 self._downloader.trouble(u'ERROR: unable to extract title')
2295 video_title = mobj.group(1).decode('utf-8')
2296 video_title = sanitize_title(video_title)
2297 simple_title = _simplify_title(video_title)
2299 # video uploader is domain name
2300 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): this failure path is about the uploader (domain), but the
# message says 'unable to extract title' — looks like a copy-paste slip.
2302 self._downloader.trouble(u'ERROR: unable to extract title')
2304 video_uploader = mobj.group(1).decode('utf-8')
2307 # Process video information
2308 self._downloader.process_info({
2309 'id': video_id.decode('utf-8'),
2310 'url': video_url.decode('utf-8'),
2311 'uploader': video_uploader,
2312 'upload_date': u'NA',
2313 'title': video_title,
2314 'stitle': simple_title,
2315 'ext': video_extension.decode('utf-8'),
2319 except UnavailableVideoError, err:
2320 self._downloader.trouble(u'\nERROR: unable to download video')
# Handles 'ytsearchN:<query>' / 'ytsearchall:<query>' pseudo-URLs by querying
# the GData API and delegating each hit to the wrapped YoutubeIE.
2323 class YoutubeSearchIE(InfoExtractor):
2324 """Information Extractor for YouTube search queries."""
2325 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
# %s = quoted query, %i = 1-based start index; pages of 50 results (jsonc).
2326 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
2328 _max_youtube_results = 1000
2329 IE_NAME = u'youtube:search'
2331 def __init__(self, youtube_ie, downloader=None):
2332 InfoExtractor.__init__(self, downloader)
# The per-video extractor that each search hit is handed to.
2333 self._youtube_ie = youtube_ie
def report_download_page(self, query, pagenum):
    """Log that result page *pagenum* for *query* is being fetched."""
    decoded_query = query.decode(preferredencoding())
    message = u'[youtube] query "%s": Downloading page %s' % (decoded_query, pagenum)
    self._downloader.to_screen(message)
2340 def _real_initialize(self):
# Initialization is delegated to the wrapped YoutubeIE (e.g. login).
2341 self._youtube_ie.initialize()
# Parse the 'ytsearch[N|all]:query' prefix and dispatch.  NOTE(review): dump
# is partial — guard/return lines are elided between the numbered lines.
2343 def _real_extract(self, query):
2344 mobj = re.match(self._VALID_URL, query)
2346 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# NOTE(review): split(':') with no maxsplit raises ValueError for queries
# that themselves contain ':' — split(':', 1) would be safer; verify which
# try/except (elided here) catches this.
2349 prefix, query = query.split(':')
2351 query = query.encode('utf-8')
2353 self._download_n_results(query, 1)
2355 elif prefix == 'all':
2356 self._download_n_results(query, self._max_youtube_results)
2362 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2364 elif n > self._max_youtube_results:
2365 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2366 n = self._max_youtube_results
2367 self._download_n_results(query, n)
2369 except ValueError: # parsing prefix as integer fails
2370 self._download_n_results(query, 1)
# Page through the GData API (50 ids per page) until n results or the
# server-reported total is reached, then hand each id to YoutubeIE.
# NOTE(review): dump is partial — pagenum/limit initialisation and the
# try:/loop-control lines are elided.
2373 def _download_n_results(self, query, n):
2374 """Downloads a specified number of results for a query"""
2380 while (50 * pagenum) < limit:
2381 self.report_download_page(query, pagenum+1)
2382 result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
2383 request = urllib2.Request(result_url)
2385 data = urllib2.urlopen(request).read()
2386 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2387 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
2389 api_response = json.loads(data)['data']
2391 new_ids = list(video['id'] for video in api_response['items'])
2392 video_ids += new_ids
# Cap the loop by whichever is smaller: what the caller asked for or what
# the API says exists.
2394 limit = min(n, api_response['totalItems'])
2397 if len(video_ids) > n:
2398 video_ids = video_ids[:n]
2399 for id in video_ids:
2400 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# Handles 'gvsearchN:<query>' pseudo-URLs by scraping Google Video search
# result pages and delegating each hit to the wrapped GoogleIE.
2404 class GoogleSearchIE(InfoExtractor):
2405 """Information Extractor for Google Video search queries."""
2406 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2407 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2408 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
# Presence of this marker in the HTML means another result page exists.
2409 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
2411 _max_google_results = 1000
2412 IE_NAME = u'video.google:search'
2414 def __init__(self, google_ie, downloader=None):
2415 InfoExtractor.__init__(self, downloader)
2416 self._google_ie = google_ie
def report_download_page(self, query, pagenum):
    """Log that result page *pagenum* for *query* is being fetched."""
    decoded_query = query.decode(preferredencoding())
    message = u'[video.google] query "%s": Downloading page %s' % (decoded_query, pagenum)
    self._downloader.to_screen(message)
2423 def _real_initialize(self):
# Initialization delegated to the wrapped GoogleIE.
2424 self._google_ie.initialize()
# Parse the 'gvsearch[N|all]:query' prefix and dispatch.  NOTE(review):
# dump is partial — guard/return lines are elided.
2426 def _real_extract(self, query):
2427 mobj = re.match(self._VALID_URL, query)
2429 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# NOTE(review): same split(':') pitfall as YoutubeSearchIE — queries that
# contain ':' raise ValueError on unpack; split(':', 1) would be safer.
2432 prefix, query = query.split(':')
2434 query = query.encode('utf-8')
2436 self._download_n_results(query, 1)
2438 elif prefix == 'all':
2439 self._download_n_results(query, self._max_google_results)
2445 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2447 elif n > self._max_google_results:
2448 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2449 n = self._max_google_results
2450 self._download_n_results(query, n)
2452 except ValueError: # parsing prefix as integer fails
2453 self._download_n_results(query, 1)
# Scrape result pages (10 per page via start=pagenum*10) until n unique ids
# are collected or no 'next' link remains, then extract each id.
# NOTE(review): dump is partial — pagenum/video_ids initialisation, try:,
# and return lines are elided.
2456 def _download_n_results(self, query, n):
2457 """Downloads a specified number of results for a query"""
2463 self.report_download_page(query, pagenum)
2464 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
2465 request = urllib2.Request(result_url)
2467 page = urllib2.urlopen(request).read()
2468 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2469 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2472 # Extract video identifiers
2473 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2474 video_id = mobj.group(1)
2475 if video_id not in video_ids:
2476 video_ids.append(video_id)
2477 if len(video_ids) == n:
2478 # Specified n videos reached
2479 for id in video_ids:
2480 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
# No 'next page' marker: flush whatever was collected and stop.
2483 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2484 for id in video_ids:
2485 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2488 pagenum = pagenum + 1
# Handles 'yvsearchN:<query>' pseudo-URLs by scraping Yahoo! Video search
# pages and delegating each hit to the wrapped YahooIE.
2491 class YahooSearchIE(InfoExtractor):
2492 """Information Extractor for Yahoo! Video search queries."""
2493 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
# %s = quoted query, second %s = page number (o= parameter).
2494 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2495 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2496 _MORE_PAGES_INDICATOR = r'\s*Next'
2498 _max_yahoo_results = 1000
2499 IE_NAME = u'video.yahoo:search'
2501 def __init__(self, yahoo_ie, downloader=None):
2502 InfoExtractor.__init__(self, downloader)
2503 self._yahoo_ie = yahoo_ie
def report_download_page(self, query, pagenum):
    """Log that result page *pagenum* for *query* is being fetched."""
    decoded_query = query.decode(preferredencoding())
    message = u'[video.yahoo] query "%s": Downloading page %s' % (decoded_query, pagenum)
    self._downloader.to_screen(message)
2510 def _real_initialize(self):
# Initialization delegated to the wrapped YahooIE.
2511 self._yahoo_ie.initialize()
# Parse the 'yvsearch[N|all]:query' prefix and dispatch.  NOTE(review):
# dump is partial — guard/return lines are elided.
2513 def _real_extract(self, query):
2514 mobj = re.match(self._VALID_URL, query)
2516 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# NOTE(review): same split(':') pitfall as the other search IEs.
2519 prefix, query = query.split(':')
2521 query = query.encode('utf-8')
2523 self._download_n_results(query, 1)
2525 elif prefix == 'all':
2526 self._download_n_results(query, self._max_yahoo_results)
2532 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2534 elif n > self._max_yahoo_results:
2535 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2536 n = self._max_yahoo_results
2537 self._download_n_results(query, n)
2539 except ValueError: # parsing prefix as integer fails
2540 self._download_n_results(query, 1)
# Scrape result pages, de-duplicating ids via already_seen, until n unique
# ids are collected or no 'Next' link remains.  NOTE(review): dump is
# partial — pagenum/video_ids initialisation, try:, and return lines are
# elided.
2543 def _download_n_results(self, query, n):
2544 """Downloads a specified number of results for a query"""
2547 already_seen = set()
2551 self.report_download_page(query, pagenum)
2552 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2553 request = urllib2.Request(result_url)
2555 page = urllib2.urlopen(request).read()
2556 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2557 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2560 # Extract video identifiers
2561 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2562 video_id = mobj.group(1)
2563 if video_id not in already_seen:
2564 video_ids.append(video_id)
2565 already_seen.add(video_id)
2566 if len(video_ids) == n:
2567 # Specified n videos reached
2568 for id in video_ids:
2569 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
# No 'Next' marker: flush whatever was collected and stop.
2572 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2573 for id in video_ids:
2574 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2577 pagenum = pagenum + 1
# Expands YouTube playlist/artist/course URLs into individual watch URLs
# and hands each one to the wrapped YoutubeIE.
2580 class YoutubePlaylistIE(InfoExtractor):
2581 """Information Extractor for YouTube playlists."""
# Group 1 = list-type query key (p/a/list), group 2 = playlist id,
# group 3 (optional) = a single video id within the playlist.
2583 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2584 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2585 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=PL%s&'
2586 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2588 IE_NAME = u'youtube:playlist'
2590 def __init__(self, youtube_ie, downloader=None):
2591 InfoExtractor.__init__(self, downloader)
2592 self._youtube_ie = youtube_ie
def report_download_page(self, playlist_id, pagenum):
    """Log that page *pagenum* of playlist *playlist_id* is being fetched."""
    message = u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum)
    self._downloader.to_screen(message)
2598 def _real_initialize(self):
# Initialization delegated to the wrapped YoutubeIE.
2599 self._youtube_ie.initialize()
# Collect all video ids of a playlist across pages, apply the user's
# playliststart/playlistend window, and extract each video.
# NOTE(review): dump is partial — guard/loop-init/try: lines are elided.
2601 def _real_extract(self, url):
2602 # Extract playlist id
2603 mobj = re.match(self._VALID_URL, url)
2605 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# A single-video form of the URL: delegate that one video directly.
2609 if mobj.group(3) is not None:
2610 self._youtube_ie.extract(mobj.group(3))
2613 # Download playlist pages
2614 # prefix is 'p' as default for playlists but there are other types that need extra care
2615 playlist_prefix = mobj.group(1)
2616 if playlist_prefix == 'a':
2617 playlist_access = 'artist'
2619 playlist_prefix = 'p'
2620 playlist_access = 'view_play_list'
2621 playlist_id = mobj.group(2)
2626 self.report_download_page(playlist_id, pagenum)
2627 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2628 request = urllib2.Request(url)
2630 page = urllib2.urlopen(request).read()
2631 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2632 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2635 # Extract video identifiers
2637 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
2638 if mobj.group(1) not in ids_in_page:
2639 ids_in_page.append(mobj.group(1))
2640 video_ids.extend(ids_in_page)
2642 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2644 pagenum = pagenum + 1
# playliststart is 1-based in params; convert to a 0-based slice index.
2646 playliststart = self._downloader.params.get('playliststart', 1) - 1
2647 playlistend = self._downloader.params.get('playlistend', -1)
2648 if playlistend == -1:
2649 video_ids = video_ids[playliststart:]
2651 video_ids = video_ids[playliststart:playlistend]
2653 for id in video_ids:
2654 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# Expands a YouTube user page (or 'ytuser:name') into all of the user's
# uploads via the GData feed, delegating each to the wrapped YoutubeIE.
2658 class YoutubeUserIE(InfoExtractor):
2659 """Information Extractor for YouTube users."""
2661 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2662 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# GData caps each uploads query at 50 results, hence the paging below.
2663 _GDATA_PAGE_SIZE = 50
2664 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2665 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2667 IE_NAME = u'youtube:user'
2669 def __init__(self, youtube_ie, downloader=None):
2670 InfoExtractor.__init__(self, downloader)
2671 self._youtube_ie = youtube_ie
def report_download_page(self, username, start_index):
    """Log which slice of *username*'s uploads feed is being fetched."""
    end_index = start_index + self._GDATA_PAGE_SIZE
    message = u'[youtube] user %s: Downloading video ids from %d to %d' % (
        username, start_index, end_index)
    self._downloader.to_screen(message)
2678 def _real_initialize(self):
# Initialization delegated to the wrapped YoutubeIE.
2679 self._youtube_ie.initialize()
# Page through the user's GData uploads feed, then apply the user's
# playliststart/playlistend window.  NOTE(review): dump is partial —
# guard/loop-init/try: lines are elided between the numbered lines.
2681 def _real_extract(self, url):
2683 mobj = re.match(self._VALID_URL, url)
2685 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2688 username = mobj.group(1)
2690 # Download video ids using YouTube Data API. Result size per
2691 # query is limited (currently to 50 videos) so we need to query
2692 # page by page until there are no video ids - it means we got
# GData start-index is 1-based.
2699 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2700 self.report_download_page(username, start_index)
2702 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2705 page = urllib2.urlopen(request).read()
2706 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2707 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2710 # Extract video identifiers
2713 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2714 if mobj.group(1) not in ids_in_page:
2715 ids_in_page.append(mobj.group(1))
2717 video_ids.extend(ids_in_page)
2719 # A little optimization - if current page is not
2720 # "full", ie. does not contain PAGE_SIZE video ids then
2721 # we can assume that this page is the last one - there
2722 # are no more ids on further pages - no need to query
2725 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2730 all_ids_count = len(video_ids)
2731 playliststart = self._downloader.params.get('playliststart', 1) - 1
2732 playlistend = self._downloader.params.get('playlistend', -1)
2734 if playlistend == -1:
2735 video_ids = video_ids[playliststart:]
2737 video_ids = video_ids[playliststart:playlistend]
2739 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2740 (username, all_ids_count, len(video_ids)))
2742 for video_id in video_ids:
2743 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
# Extractor for depositfiles.com file-hosting pages.
2746 class DepositFilesIE(InfoExtractor):
2747 """Information extractor for depositfiles.com"""
# '(?#locale)' is a regex comment; '../' matches a two-char locale segment.
2749 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2750 IE_NAME = u'DepositFiles'
2752 def __init__(self, downloader=None):
2753 InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, file_id):
    """Announce that the page for *file_id* is being fetched."""
    message = u'[DepositFiles] %s: Downloading webpage' % file_id
    self._downloader.to_screen(message)
def report_extraction(self, file_id):
    """Announce that metadata extraction for *file_id* has started."""
    message = u'[DepositFiles] %s: Extracting information' % file_id
    self._downloader.to_screen(message)
# POST the 'Free download' form, then scrape the real fileshare URL and
# title.  NOTE(review): dump is partial — try:/if mobj is None:/return
# lines are elided between the numbered lines.
2763 def _real_extract(self, url):
2764 # At this point we have a new file
2765 self._downloader.increment_downloads()
2767 file_id = url.split('/')[-1]
2768 # Rebuild url in english locale
2769 url = 'http://depositfiles.com/en/files/' + file_id
2771 # Retrieve file webpage with 'Free download' button pressed
2772 free_download_indication = { 'gateway_result' : '1' }
# Supplying POST data makes urllib2 issue a POST request.
2773 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2775 self.report_download_webpage(file_id)
2776 webpage = urllib2.urlopen(request).read()
2777 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2778 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2781 # Search for the real file URL
2782 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2783 if (mobj is None) or (mobj.group(1) is None):
2784 # Try to figure out reason of the error.
2785 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2786 if (mobj is not None) and (mobj.group(1) is not None):
# Collapse the multi-line restriction notice into one line for the error.
2787 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2788 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2790 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2793 file_url = mobj.group(1)
2794 file_extension = os.path.splitext(file_url)[1][1:]
2796 # Search for file title
2797 mobj = re.search(r'<b title="(.*?)">', webpage)
2799 self._downloader.trouble(u'ERROR: unable to extract title')
2801 file_title = mobj.group(1).decode('utf-8')
2804 # Process file information
2805 self._downloader.process_info({
2806 'id': file_id.decode('utf-8'),
2807 'url': file_url.decode('utf-8'),
2809 'upload_date': u'NA',
2810 'title': file_title,
2811 'stitle': file_title,
2812 'ext': file_extension.decode('utf-8'),
2816 except UnavailableVideoError, err:
2817 self._downloader.trouble(u'ERROR: unable to download file')
# Extractor for Facebook videos; supports optional login via CLI
# credentials or .netrc.
2820 class FacebookIE(InfoExtractor):
2821 """Information Extractor for Facebook"""
# Named group 'ID' captures the numeric video id from video.php/photo.php.
2823 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2824 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2825 _NETRC_MACHINE = 'facebook'
# Ordered best-first; used both for scraping URLs and for format selection.
2826 _available_formats = ['video', 'highqual', 'lowqual']
# NOTE(review): the dict body is elided in this dump (original lines
# 2828-2831 missing).
2827 _video_extensions = {
2832 IE_NAME = u'facebook'
2834 def __init__(self, downloader=None):
2835 InfoExtractor.__init__(self, downloader)
2837 def _reporter(self, message):
2838 """Add header and report message."""
2839 self._downloader.to_screen(u'[facebook] %s' % message)
def report_login(self):
    """Announce that a login attempt is starting."""
    message = u'Logging in'
    self._reporter(message)
def report_video_webpage_download(self, video_id):
    """Announce that the video page for *video_id* is being fetched."""
    message = u'%s: Downloading video webpage' % video_id
    self._reporter(message)
def report_information_extraction(self, video_id):
    """Announce that metadata extraction for *video_id* has started."""
    message = u'%s: Extracting video information' % video_id
    self._reporter(message)
# Scrape title/description/owner/thumbnail and per-format stream URLs out
# of the raw video page.  NOTE(review): dump is partial — the dict closer,
# video_info/video_urls initialisation and the return line are elided.
2853 def _parse_page(self, video_webpage):
2854 """Extract video information from page"""
# Map of metadata field -> regex that captures it from the page source.
2856 data = {'title': r'\("video_title", "(.*?)"\)',
2857 'description': r'<div class="datawrap">(.*?)</div>',
2858 'owner': r'\("video_owner_name", "(.*?)"\)',
2859 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2862 for piece in data.keys():
2863 mobj = re.search(data[piece], video_webpage)
2864 if mobj is not None:
# Values are JS-escaped unicode inside the page; unescape then URL-unquote.
2865 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2869 for fmt in self._available_formats:
2870 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2871 if mobj is not None:
2872 # URL is in a Javascript segment inside an escaped Unicode format within
2873 # the generally utf-8 page
2874 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2875 video_info['video_urls'] = video_urls
# Optionally log in with credentials from --username/--password or .netrc;
# failures are warnings, not fatal.  NOTE(review): dump is partial —
# returns, the login_form construction and try: lines are elided.
2879 def _real_initialize(self):
2880 if self._downloader is None:
2885 downloader_params = self._downloader.params
2887 # Attempt to use provided username and password or .netrc data
2888 if downloader_params.get('username', None) is not None:
2889 useremail = downloader_params['username']
2890 password = downloader_params['password']
2891 elif downloader_params.get('usenetrc', False):
2893 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2894 if info is not None:
2898 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2899 except (IOError, netrc.NetrcParseError), err:
2900 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# No credentials available: skip login entirely (elided return).
2903 if useremail is None:
2912 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2915 login_results = urllib2.urlopen(request).read()
# If the response still contains the login form, authentication failed.
2916 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2917 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2919 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2920 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
# Fetch the video page, parse it via _parse_page, select formats per the
# user's --format/--format-limit options, and hand each chosen URL to
# process_info.  NOTE(review): dump is partial — try:/return/else: lines
# are elided between the numbered lines.
2923 def _real_extract(self, url):
2924 mobj = re.match(self._VALID_URL, url)
2926 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2928 video_id = mobj.group('ID')
2931 self.report_video_webpage_download(video_id)
2932 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2934 page = urllib2.urlopen(request)
2935 video_webpage = page.read()
2936 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2937 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2940 # Start extracting information
2941 self.report_information_extraction(video_id)
2943 # Extract information
2944 video_info = self._parse_page(video_webpage)
2947 if 'owner' not in video_info:
2948 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2950 video_uploader = video_info['owner']
2953 if 'title' not in video_info:
2954 self._downloader.trouble(u'ERROR: unable to extract video title')
2956 video_title = video_info['title']
2957 video_title = video_title.decode('utf-8')
2958 video_title = sanitize_title(video_title)
2960 simple_title = _simplify_title(video_title)
# Missing thumbnail is only a warning; fall back to an empty string.
2963 if 'thumbnail' not in video_info:
2964 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2965 video_thumbnail = ''
2967 video_thumbnail = video_info['thumbnail']
2971 if 'upload_date' in video_info:
2972 upload_time = video_info['upload_date']
2973 timetuple = email.utils.parsedate_tz(upload_time)
2974 if timetuple is not None:
2976 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2981 video_description = video_info.get('description', 'No description available.')
2983 url_map = video_info['video_urls']
2984 if len(url_map.keys()) > 0:
2985 # Decide which formats to download
2986 req_format = self._downloader.params.get('format', None)
2987 format_limit = self._downloader.params.get('format_limit', None)
2989 if format_limit is not None and format_limit in self._available_formats:
2990 format_list = self._available_formats[self._available_formats.index(format_limit):]
2992 format_list = self._available_formats
2993 existing_formats = [x for x in format_list if x in url_map]
2994 if len(existing_formats) == 0:
2995 self._downloader.trouble(u'ERROR: no known formats available for video')
# Format selection: default = best, 'worst' = last, '-1' = all,
# anything else = that specific format.
2997 if req_format is None:
2998 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2999 elif req_format == 'worst':
3000 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
3001 elif req_format == '-1':
3002 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
3005 if req_format not in url_map:
3006 self._downloader.trouble(u'ERROR: requested format not available')
3008 video_url_list = [(req_format, url_map[req_format])] # Specific format
3010 for format_param, video_real_url in video_url_list:
3012 # At this point we have a new video
3013 self._downloader.increment_downloads()
3016 video_extension = self._video_extensions.get(format_param, 'mp4')
3019 # Process video information
3020 self._downloader.process_info({
3021 'id': video_id.decode('utf-8'),
3022 'url': video_real_url.decode('utf-8'),
3023 'uploader': video_uploader.decode('utf-8'),
3024 'upload_date': upload_date,
3025 'title': video_title,
3026 'stitle': simple_title,
3027 'ext': video_extension.decode('utf-8'),
3028 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3029 'thumbnail': video_thumbnail.decode('utf-8'),
3030 'description': video_description.decode('utf-8'),
3033 except UnavailableVideoError, err:
3034 self._downloader.trouble(u'\nERROR: unable to download video')
# Extractor for blip.tv; uses the site's JSON API unless the URL turns out
# to be a direct media download.
3036 class BlipTVIE(InfoExtractor):
3037 """Information extractor for blip.tv"""
3039 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Captures the trailing filename extension of a media URL.
3040 _URL_EXT = r'^.*\.([a-z0-9]+)$'
3041 IE_NAME = u'blip.tv'
3043 def report_extraction(self, file_id):
3044 """Report information extraction."""
3045 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3047 def report_direct_download(self, title):
3048 """Report information extraction."""
3049 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
3051 def _real_extract(self, url):
3052 mobj = re.match(self._VALID_URL, url)
3054 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3061 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
3062 request = urllib2.Request(json_url)
3063 self.report_extraction(mobj.group(1))
3066 urlh = urllib2.urlopen(request)
3067 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
3068 basename = url.split('/')[-1]
3069 title,ext = os.path.splitext(basename)
3070 title = title.decode('UTF-8')
3071 ext = ext.replace('.', '')
3072 self.report_direct_download(title)
3077 'stitle': _simplify_title(title),
3081 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3082 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
3084 if info is None: # Regular URL
3086 json_code = urlh.read()
3087 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3088 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
3092 json_data = json.loads(json_code)
3093 if 'Post' in json_data:
3094 data = json_data['Post']
3098 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
3099 video_url = data['media']['url']
3100 umobj = re.match(self._URL_EXT, video_url)
3102 raise ValueError('Can not determine filename extension')
3103 ext = umobj.group(1)
3106 'id': data['item_id'],
3108 'uploader': data['display_name'],
3109 'upload_date': upload_date,
3110 'title': data['title'],
3111 'stitle': _simplify_title(data['title']),
3113 'format': data['media']['mimeType'],
3114 'thumbnail': data['thumbnailUrl'],
3115 'description': data['description'],
3116 'player_url': data['embedUrl']
3118 except (ValueError,KeyError), err:
3119 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
3122 self._downloader.increment_downloads()
3125 self._downloader.process_info(info)
3126 except UnavailableVideoError, err:
3127 self._downloader.trouble(u'\nERROR: unable to download video')
3130 class MyVideoIE(InfoExtractor):
3131 """Information Extractor for myvideo.de."""
3133 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3134 IE_NAME = u'myvideo'
3136 def __init__(self, downloader=None):
3137 InfoExtractor.__init__(self, downloader)
3139 def report_download_webpage(self, video_id):
3140 """Report webpage download."""
3141 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3143 def report_extraction(self, video_id):
3144 """Report information extraction."""
3145 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3147 def _real_extract(self,url):
3148 mobj = re.match(self._VALID_URL, url)
3150 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3153 video_id = mobj.group(1)
3156 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3158 self.report_download_webpage(video_id)
3159 webpage = urllib2.urlopen(request).read()
3160 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3161 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3164 self.report_extraction(video_id)
3165 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3168 self._downloader.trouble(u'ERROR: unable to extract media URL')
3170 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3172 mobj = re.search('<title>([^<]+)</title>', webpage)
3174 self._downloader.trouble(u'ERROR: unable to extract title')
3177 video_title = mobj.group(1)
3178 video_title = sanitize_title(video_title)
3180 simple_title = _simplify_title(video_title)
3183 self._downloader.process_info({
3187 'upload_date': u'NA',
3188 'title': video_title,
3189 'stitle': simple_title,
3194 except UnavailableVideoError:
3195 self._downloader.trouble(u'\nERROR: Unable to download video')
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # Accepts both ":tds"/":colbert"-style shorthands and full episode URLs.
    _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
    IE_NAME = u'comedycentral'

    def report_extraction(self, episode_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        """Report configuration download."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        """Report show index download."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        """Report player URL determination."""
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _real_extract(self, url):
        # NOTE(review): this chunk appears elided (embedded line numbers jump);
        # several guards/returns/try lines are not visible in this view.
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Shorthand forms are rewritten to the show's full-episodes page.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url)
            assert mobj is not None

        dlNewest = not mobj.group('episode')
        epTitle = mobj.group('showname')
        epTitle = mobj.group('episode')

        req = urllib2.Request(url)
        self.report_extraction(epTitle)
        htmlHandle = urllib2.urlopen(req)
        html = htmlHandle.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
        # The full-episodes page redirects to the newest episode; re-match the
        # final URL to recover a concrete episode title.
        url = htmlHandle.geturl()
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
        if mobj.group('episode') == '':
            self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
        epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
        if len(mMovieParams) == 0:
            self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        urlHandle = urllib2.urlopen(playerUrl_raw)
        playerUrl = urlHandle.geturl()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        indexXml = urllib2.urlopen(indexUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))

        # One episode is split into several acts; each <item> is one act.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                    urllib.urlencode({'uri': mediaId}))
            configReq = urllib2.Request(configUrl)
            self.report_config_download(epTitle)
            configXml = urllib2.urlopen(configReq).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)

                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            self._downloader.increment_downloads()

            effTitle = showId + u'-' + epTitle
            'upload_date': officialDate,
            'stitle': _simplify_title(effTitle),
            'description': officialTitle,
            'player_url': playerUrl
            self._downloader.process_info(info)
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report configuration download."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        # NOTE(review): this chunk appears elided (embedded line numbers jump);
        # several guards/returns/try lines are not visible in this view.
        htmlParser = HTMLParser.HTMLParser()

        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = urllib2.urlopen(url).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))

        # Metadata is exposed through <meta> / OpenGraph tags on the page.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = htmlParser.unescape(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = htmlParser.unescape(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
        # The player URL carries the config-file location in its query string.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = urllib2.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        configJSON = urllib2.urlopen(configUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        config = json.loads(configJSON)
        except (ValueError,), err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        self._downloader.increment_downloads()
        'uploader': showName,
        'upload_date': None,
        'stitle': _simplify_title(showName),
        'thumbnail': imgUrl,
        'description': description,
        'player_url': playerUrl,
        self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        # NOTE(review): this chunk appears elided (embedded line numbers jump);
        # several guards/returns/try lines are not visible in this view.
        htmlParser = HTMLParser.HTMLParser()

        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        self.report_webpage(video_id)
        request = urllib2.Request(url)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # The public video id maps to an internal one via an id="video:N" node.
        m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
        self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
        internal_video_id = m.group('internalvideoid')
        'internal_id': internal_video_id,

        self.report_extraction(video_id)
        # The moogaloop endpoint serves per-video metadata as XML.
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
        metaXml = urllib2.urlopen(xmlUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        videoNode = mdoc.findall('./video')[0]
        info['description'] = videoNode.findall('./description')[0].text
        info['title'] = videoNode.findall('./caption')[0].text
        info['stitle'] = _simplify_title(info['title'])
        info['url'] = videoNode.findall('./file')[0].text
        info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
        info['ext'] = info['url'].rpartition('.')[2]
        info['format'] = info['ext']
        self._downloader.trouble(u'\nERROR: Invalid metadata XML file')

        self._downloader.increment_downloads()
        self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download video')
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        # NOTE(review): this chunk appears elided (embedded line numbers jump);
        # several guards/returns/try lines are not visible in this view.
        htmlParser = HTMLParser.HTMLParser()

        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1).decode('utf-8')

        self.report_webpage(video_id)

        request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        self.report_extraction(video_id)

        # The media URL is percent-encoded in a flv_url= query parameter.
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))

        # Title is the <title> text up to the " - XVID" suffix.
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        self._downloader.increment_downloads()
        'upload_date': None,
        'title': video_title,
        'stitle': _simplify_title(video_title),
        'thumbnail': video_thumbnail,
        'description': None,
        self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
    To access the media, the uid of the song and a stream token
    must be extracted from the page source and the script must make
    a request to media.soundcloud.com/crossdomain.xml. Then
    the media can be grabbed by requesting from an url composed
    of the stream token and uid
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        # NOTE(review): this chunk appears elided (embedded line numbers jump);
        # several guards/returns/try lines are not visible in this view.
        htmlParser = HTMLParser.HTMLParser()

        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1).decode('utf-8')
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2).decode('utf-8')
        simple_title = uploader + '-' + slug_title

        self.report_webpage('%s/%s' % (uploader, slug_title))

        request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        self.report_extraction('%s/%s' % (uploader, slug_title))

        # extract uid and stream token that soundcloud hands out for access
        mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
        video_id = mobj.group(1)
        stream_token = mobj.group(2)

        # extract unsimplified title
        mobj = re.search('"title":"(.*?)",', webpage)
        title = mobj.group(1)

        # construct media url (with uid/token)
        mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
        mediaURL = mediaURL % (video_id, stream_token)

        description = u'No description available'
        mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
        description = mobj.group(1)

        # Dates look like 'November 28, 2011 20:39'; normalised to YYYYMMDD.
        mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
        upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
        except Exception, e:

        # for soundcloud, a request to a cross domain is required for cookies
        request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)

        self._downloader.process_info({
            'id': video_id.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': upload_date,
            'title': simple_title.decode('utf-8'),  # NOTE(review): slug-based title is used here, not the page `title` parsed above — confirm intended
            'stitle': simple_title.decode('utf-8'),
            'description': description.decode('utf-8')
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        # NOTE(review): this chunk appears elided (embedded line numbers jump);
        # several guards/returns/try lines are not visible in this view.
        htmlParser = HTMLParser.HTMLParser()

        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        self.report_webpage(url)

        request = urllib2.Request(url)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        self.report_extraction(url)

        # The RTMP path is base64-encoded in the jsclassref attribute.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))

        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1).decode('utf-8')

        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        self._downloader.increment_downloads()
        'upload_date': None,
        'title': video_title,
        'stitle': _simplify_title(video_title),
        'format': extension, # Extension is always(?) mp4, but seems to be flv
        'description': video_description,
        self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        # NOTE(review): elided here — the try wrapping and return statement
        # are not visible in this view.
        bitrate_list = jsonData[fmt]
        if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
            bitrate = max(bitrate_list) # select highest
        url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        # Probes each candidate URL; elided lines (try/return) not visible here.
        for url in url_list:
            urllib2.urlopen(url)
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:

    def _print_formats(self, formats):
        # Human-readable listing of every available format/bitrate pair.
        print 'Available formats:'
        for fmt in formats.keys():
            for b in formats[fmt]:
                ext = formats[fmt][b][0]
                print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])

    def _real_extract(self, url):
        # NOTE(review): this chunk appears elided (embedded line numbers jump);
        # several guards/returns/try lines are not visible in this view.
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = urllib2.Request(file_url)
        self.report_download_json(file_url)
        jsonData = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))

        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        if req_format is None or req_format == 'best':
            # Probe every format until one with a live URL is found.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                file_url = self.check_urls(url_list)
                if file_url is not None:
            if req_format not in formats.keys():
                self._downloader.trouble(u'ERROR: format is not available')
            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        self._downloader.increment_downloads()

        # Process file information
        self._downloader.process_info({
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': u'NA',
            'title': json_data['name'],
            'stitle': _simplify_title(json_data['name']),
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        except UnavailableVideoError, err:
            self._downloader.trouble(u'ERROR: unable to download file')
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # Matches the site root, a CoursePage, or a single VideoPage URL.
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        # NOTE(review): this chunk appears elided (embedded line numbers jump);
        # several guards/returns/try lines are not visible in this view.
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            'id': _simplify_title(course + '_' + video),

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            metaXml = urllib2.urlopen(xmlUrl).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            info['title'] = mdoc.findall('./title')[0].text
            info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            info['stitle'] = _simplify_title(info['title'])
            info['ext'] = info['url'].rpartition('.')[2]
            info['format'] = info['ext']
            self._downloader.increment_downloads()
            self._downloader.process_info(info)
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
        elif mobj.group('course'): # A course page
            unescapeHTML = HTMLParser.HTMLParser().unescape

            course = mobj.group('course')
            'id': _simplify_title(course),

            self.report_download_webpage(info['id'])
            coursepage = urllib2.urlopen(url).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            info['title'] = unescapeHTML(m.group(1))
            info['title'] = info['id']
            info['stitle'] = _simplify_title(info['title'])

            m = re.search('<description>([^<]+)</description>', coursepage)
            info['description'] = unescapeHTML(m.group(1))

            # Each VideoPage link becomes a 'reference' entry, extracted below.
            links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            'type': 'reference',
            'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                self.extract(entry['url'])

            unescapeHTML = HTMLParser.HTMLParser().unescape
            'id': 'Stanford OpenClassroom',

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            rootpage = urllib2.urlopen(rootURL).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))

            info['title'] = info['id']
            info['stitle'] = _simplify_title(info['title'])

            # Every CoursePage link on the root page becomes a reference.
            links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            'type': 'reference',
            'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                self.extract(entry['url'])
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        # NOTE(review): this chunk appears elided (embedded line numbers jump);
        # several guards/returns/try lines are not visible in this view.
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')
        self.report_webpage(video_id)

        request = urllib2.Request(url)
        webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Song/performer come from mtv_* meta tags (page is latin-1 encoded).
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract song name')
        song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract performer')
        performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        # NOTE(review): message below appears to be missing the word "extract".
        self._downloader.trouble(u'ERROR: unable to mtvn_uri')
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        self._downloader.trouble(u'ERROR: unable to extract content id')
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = urllib2.Request(videogen_url)
        metadataXml = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        _,_,ext = rendition.attrib['type'].partition('/')
        format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
        video_url = rendition.find('./src').text
        self._downloader.trouble('Invalid rendition field.')

        self._downloader.increment_downloads()
        'uploader': performer,
        'title': video_title,
        'stitle': _simplify_title(video_title),
        self._downloader.process_info(info)
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
class PostProcessor(object):
	"""Base class for all post-processing steps.

	A PostProcessor is attached to a downloader via the downloader's
	add_post_processor() method.  After every successful download the
	downloader walks its registered chain of PostProcessors, feeding
	run() an initial argument and then passing each return value on to
	the next processor in the chain.

	Returning None from run() stops the chain; otherwise processing
	continues until the end of the chain is reached.

	Like InfoExtractor objects, PostProcessors follow a "mutual
	registration" scheme with their downloader.
	"""

	def __init__(self, downloader=None):
		# Delegate so construction and late attachment share one code path.
		self.set_downloader(downloader)

	def set_downloader(self, downloader):
		"""Attach (or replace) the downloader this PP reports to."""
		self._downloader = downloader

	def run(self, information):
		"""Process one finished download.

		`information` is a dictionary like the ones composed by
		InfoExtractors, extended with a "filepath" key that points to the
		downloaded file.  Return None to stop the post-processing chain,
		or an (optionally modified) information dictionary to forward to
		the next processor.  Implementations may raise
		PostProcessingError to signal failure to the downloader.
		"""
		# Identity transform: the base class performs no processing.
		return information
class AudioConversionError(Exception):
	"""Raised when the external ffmpeg/ffprobe audio conversion fails.

	Derives from Exception rather than BaseException so that generic
	`except Exception` handlers can catch it; BaseException is reserved
	for system-exiting exceptions such as KeyboardInterrupt and
	SystemExit.  The existing bare-except + isinstance handler in
	FFmpegExtractAudioPP.run() still catches it unchanged.
	"""
	def __init__(self, message):
		# Store the message on .args (standard exception protocol) and
		# keep the .message attribute that the ffmpeg error handler reads.
		Exception.__init__(self, message)
		self.message = message
class FFmpegExtractAudioPP(PostProcessor):
	# Post-processor that converts a downloaded media file into an
	# audio-only file by shelling out to the external ffprobe/ffmpeg tools.

	def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
		# preferredcodec: target audio codec name; None is treated as 'best'
		#   (keep the source codec where a lossless remux is possible).
		# preferredquality: bitrate string forwarded to ffmpeg's '-ab' option.
		# keepvideo: when False the source video file is deleted after a
		#   successful conversion.
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		self._preferredcodec = preferredcodec
		self._preferredquality = preferredquality
		self._keepvideo = keepvideo

	def get_audio_codec(path):
		# Probe `path` with ffprobe and return the codec name of its audio
		# stream; ffprobe's stderr is discarded into os.devnull.
		cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
		handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
		output = handle.communicate()[0]
		if handle.wait() != 0:
		except (IOError, OSError):
		# Scan '-show_streams' output: remember the most recent
		# 'codec_name=...' value and report it once the matching
		# 'codec_type=audio' line confirms it belongs to an audio stream.
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:

	def run_ffmpeg(path, out_path, codec, more_opts):
		# Run ffmpeg to extract/convert the audio track of `path` into
		# `out_path`; raises AudioConversionError on failure.
		acodec_opts = ['-acodec', codec]
		# '-vn' drops the video stream; '--' protects against output
		# filenames that look like options.
		cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
		p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
		stdout,stderr = p.communicate()
		except (IOError, OSError):
			e = sys.exc_info()[1]
			# errno 2 == ENOENT: the ffmpeg binary itself was not found.
			if isinstance(e, OSError) and e.errno == 2:
				raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
		if p.returncode != 0:
			# Surface the last line of ffmpeg's stderr as the error message.
			msg = stderr.strip().split('\n')[-1]
			raise AudioConversionError(msg)

	def run(self, information):
		# Entry point invoked by the downloader with the info dict of a
		# finished download; converts information['filepath'] to audio and
		# rewrites 'filepath' to the new audio file on success.
		path = information['filepath']
		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
		# Remux losslessly when the source codec already matches the request
		# (or aac->m4a, which is a container change only); otherwise re-encode.
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
			if self._preferredcodec == 'm4a' and filecodec == 'aac':
				# Lossless, but in another container
				extension = self._preferredcodec
				# Bitstream filter required when putting ADTS AAC into MP4/M4A.
				more_opts = ['-absf', 'aac_adtstoasc']
			elif filecodec in ['aac', 'mp3', 'vorbis']:
				# Lossless if possible
				extension = filecodec
				if filecodec == 'aac':
					# ADTS container so the raw AAC stream is playable standalone.
					more_opts = ['-f', 'adts']
				if filecodec == 'vorbis':
			# Fallback path: re-encode to MP3 with libmp3lame.
			acodec = 'libmp3lame'
			if self._preferredquality is not None:
				more_opts += ['-ab', self._preferredquality]
		# We convert the audio (lossy)
		acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
		extension = self._preferredcodec
		if self._preferredquality is not None:
			more_opts += ['-ab', self._preferredquality]
		if self._preferredcodec == 'aac':
			more_opts += ['-f', 'adts']
		if self._preferredcodec == 'm4a':
			more_opts += ['-absf', 'aac_adtstoasc']
		if self._preferredcodec == 'vorbis':
		if self._preferredcodec == 'wav':
			more_opts += ['-f', 'wav']
		prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
		new_path = prefix + sep + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
		self.run_ffmpeg(path, new_path, acodec, more_opts)
		# Distinguish a conversion error (has a message) from any other
		# failure while running ffmpeg.
		etype,e,tb = sys.exc_info()
		if isinstance(e, AudioConversionError):
			self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
		self._downloader.to_stderr(u'ERROR: error running ffmpeg')
		# Try to update the date time for extracted audio file.
		if information.get('filetime') is not None:
			os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
			self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')
		# Remove the source video unless the user asked to keep it.
		if not self._keepvideo:
			os.remove(_encodeFilename(path))
			except (IOError, OSError):
				self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
		# Point the info dict at the freshly produced audio file.
		information['filepath'] = new_path
def updateSelf(downloader, filename):
	''' Update the program file with the latest version from the repository '''
	# Note: downloader only used for options
	if not os.access(filename, os.W_OK):
		sys.exit('ERROR: no write permissions on %s' % filename)
	downloader.to_screen(u'Updating to latest version...')
	# Fetch the current master copy and compare its embedded __version__
	# string against ours before touching the local file.
	urlh = urllib.urlopen(UPDATE_URL)
	newcontent = urlh.read()
	vmatch = re.search("__version__ = '([^']+)'", newcontent)
	if vmatch is not None and vmatch.group(1) == __version__:
		downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
	except (IOError, OSError), err:
		sys.exit('ERROR: unable to download latest version')
	# Overwrite this very script in place with the downloaded content.
	outf = open(filename, 'wb')
	outf.write(newcontent)
	except (IOError, OSError), err:
		sys.exit('ERROR: unable to overwrite current version')
	downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
def _readOptions(filename_bytes):
	# Read extra command-line options from a configuration file and return
	# them as a flat list of argv-style tokens; each line is parsed with
	# shlex ('#' comments allowed).  A missing file yields no options.
	optionf = open(filename_bytes)
	return [] # silently skip if file is not present
	res += shlex.split(l, comments=True)
def _format_option_string(option):
	''' ('-o', '--option') -> -o, --format METAVAR'''
	# Build the help-screen rendering of an optparse option: short form,
	# ', ' separator, long form, then ' METAVAR' when the option takes a value.
	if option._short_opts: opts.append(option._short_opts[0])
	if option._long_opts: opts.append(option._long_opts[0])
	if len(opts) > 1: opts.insert(1, ', ')
	if option.takes_value(): opts.append(' %s' % option.metavar)
	return "".join(opts)
def _find_term_columns():
	# Determine the terminal width: honour $COLUMNS first, otherwise ask
	# `stty size` (stderr suppressed so non-tty runs stay quiet).
	columns = os.environ.get('COLUMNS', None)
	sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
	out,err = sp.communicate()
	# `stty size` prints "rows cols"; the second field is the width.
	return int(out.split()[1])
# NOTE(review): body of parseOpts() — builds the optparse parser, reads the
# system and per-user config files, and returns (parser, opts, args).
max_help_position = 80
# No need to wrap help messages if we're on a wide console
columns = _find_term_columns()
if columns: max_width = columns
fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
# Render option strings ("-o, --output TEMPLATE") with the local helper.
fmt.format_option_strings = _format_option_string
# Entries of the kw dict passed to the OptionParser constructor below.
'version' : __version__,
'usage' : '%prog [options] url [url...]',
'conflict_handler' : 'resolve',
parser = optparse.OptionParser(**kw)
# One optparse group per section of the help screen.
general = optparse.OptionGroup(parser, 'General Options')
selection = optparse.OptionGroup(parser, 'Video Selection')
authentication = optparse.OptionGroup(parser, 'Authentication Options')
video_format = optparse.OptionGroup(parser, 'Video Format Options')
postproc = optparse.OptionGroup(parser, 'Post-processing Options')
filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
# --- General Options ---
general.add_option('-h', '--help',
		action='help', help='print this help text and exit')
general.add_option('-v', '--version',
		action='version', help='print program version and exit')
general.add_option('-U', '--update',
		action='store_true', dest='update_self', help='update this program to latest version')
general.add_option('-i', '--ignore-errors',
		action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
general.add_option('-r', '--rate-limit',
		dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
general.add_option('-R', '--retries',
		dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
general.add_option('--dump-user-agent',
		action='store_true', dest='dump_user_agent',
		help='display the current browser identification', default=False)
general.add_option('--list-extractors',
		action='store_true', dest='list_extractors',
		help='List all supported extractors and the URLs they would handle', default=False)
# --- Video Selection ---
selection.add_option('--playlist-start',
		dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
selection.add_option('--playlist-end',
		dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)
# --- Authentication Options ---
authentication.add_option('-u', '--username',
		dest='username', metavar='USERNAME', help='account username')
authentication.add_option('-p', '--password',
		dest='password', metavar='PASSWORD', help='account password')
authentication.add_option('-n', '--netrc',
		action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
# --- Video Format Options ---
video_format.add_option('-f', '--format',
		action='store', dest='format', metavar='FORMAT', help='video format code')
video_format.add_option('--all-formats',
		action='store_const', dest='format', help='download all available video formats', const='all')
video_format.add_option('--prefer-free-formats',
		action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
video_format.add_option('--max-quality',
		action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
video_format.add_option('-F', '--list-formats',
		action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
video_format.add_option('--write-srt',
		action='store_true', dest='writesubtitles',
		help='write video closed captions to a .srt file (currently youtube only)', default=False)
video_format.add_option('--srt-lang',
		action='store', dest='subtitleslang', metavar='LANG',
		help='language of the closed captions to download (optional) use IETF language tags like \'en\'')
# --- Verbosity / Simulation Options ---
verbosity.add_option('-q', '--quiet',
		action='store_true', dest='quiet', help='activates quiet mode', default=False)
verbosity.add_option('-s', '--simulate',
		action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
verbosity.add_option('--skip-download',
		action='store_true', dest='skip_download', help='do not download the video', default=False)
verbosity.add_option('-g', '--get-url',
		action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
verbosity.add_option('-e', '--get-title',
		action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
verbosity.add_option('--get-thumbnail',
		action='store_true', dest='getthumbnail',
		help='simulate, quiet but print thumbnail URL', default=False)
verbosity.add_option('--get-description',
		action='store_true', dest='getdescription',
		help='simulate, quiet but print video description', default=False)
verbosity.add_option('--get-filename',
		action='store_true', dest='getfilename',
		help='simulate, quiet but print output filename', default=False)
verbosity.add_option('--get-format',
		action='store_true', dest='getformat',
		help='simulate, quiet but print output format', default=False)
verbosity.add_option('--no-progress',
		action='store_true', dest='noprogress', help='do not print progress bar', default=False)
verbosity.add_option('--console-title',
		action='store_true', dest='consoletitle',
		help='display progress in console titlebar', default=False)
# Note: '-v' is redeclared here; conflict_handler='resolve' (above) lets
# this later definition win over the general group's '--version' alias.
verbosity.add_option('-v', '--verbose',
		action='store_true', dest='verbose', help='print various debugging information', default=False)
# --- Filesystem Options ---
filesystem.add_option('-t', '--title',
		action='store_true', dest='usetitle', help='use title in file name', default=False)
filesystem.add_option('-l', '--literal',
		action='store_true', dest='useliteral', help='use literal title in file name', default=False)
filesystem.add_option('-A', '--auto-number',
		action='store_true', dest='autonumber',
		help='number downloaded files starting from 00000', default=False)
filesystem.add_option('-o', '--output',
		dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
filesystem.add_option('-a', '--batch-file',
		dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
filesystem.add_option('-w', '--no-overwrites',
		action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
filesystem.add_option('-c', '--continue',
		action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)
filesystem.add_option('--no-continue',
		action='store_false', dest='continue_dl',
		help='do not resume partially downloaded files (restart from beginning)')
filesystem.add_option('--cookies',
		dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
filesystem.add_option('--no-part',
		action='store_true', dest='nopart', help='do not use .part files', default=False)
filesystem.add_option('--no-mtime',
		action='store_false', dest='updatetime',
		help='do not use the Last-modified header to set the file modification time', default=True)
filesystem.add_option('--write-description',
		action='store_true', dest='writedescription',
		help='write video description to a .description file', default=False)
filesystem.add_option('--write-info-json',
		action='store_true', dest='writeinfojson',
		help='write video metadata to a .info.json file', default=False)
# --- Post-processing Options ---
postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
		help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
		help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
		help='ffmpeg audio bitrate specification, 128k by default')
postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
		help='keeps the video file on disk after the post-processing; the video is erased by default')
# Register all groups; the order here is the order on the help screen.
parser.add_option_group(general)
parser.add_option_group(selection)
parser.add_option_group(filesystem)
parser.add_option_group(verbosity)
parser.add_option_group(video_format)
parser.add_option_group(authentication)
parser.add_option_group(postproc)
# Options come from /etc, then the per-user config, then the real argv;
# later sources override earlier ones.  $XDG_CONFIG_HOME takes precedence
# over ~/.config for locating the per-user file.
xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
opts, args = parser.parse_args(argv)
return parser, opts, args
def gen_extractors():
	""" Return a list of an instance of every supported extractor.
	The order does matter; the first extractor matched is the one handling the URL.
	"""
	# These base extractors are shared with the playlist/user/search
	# wrappers constructed below.
	youtube_ie = YoutubeIE()
	google_ie = GoogleIE()
	yahoo_ie = YahooIE()
	# Entries of the returned list; wrappers that delegate to a base IE
	# receive it as a constructor argument.
	YoutubePlaylistIE(youtube_ie),
	YoutubeUserIE(youtube_ie),
	YoutubeSearchIE(youtube_ie),
	MetacafeIE(youtube_ie),
	GoogleSearchIE(google_ie),
	YahooSearchIE(yahoo_ie),
	StanfordOpenClassroomIE(),
# NOTE(review): body of the main program routine — parses options, wires up
# cookies/opener/extractors/post-processors, and runs the downloads.
parser, opts, args = parseOpts()

# Open appropriate CookieJar
if opts.cookiefile is None:
	jar = cookielib.CookieJar()
	# Mozilla-format jar so the file can be shared with browsers/tools.
	jar = cookielib.MozillaCookieJar(opts.cookiefile)
	if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
except (IOError, OSError), err:
	sys.exit(u'ERROR: unable to open cookie file')

# --dump-user-agent: print the UA string we send and do nothing else.
if opts.dump_user_agent:
	print std_headers['User-Agent']

# Batch file verification
if opts.batchfile is not None:
	if opts.batchfile == '-':
	batchfd = open(opts.batchfile, 'r')
	batchurls = batchfd.readlines()
	batchurls = [x.strip() for x in batchurls]
	# Drop blank lines and '#', '/', ';' comment lines.
	batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
	sys.exit(u'ERROR: batch file could not be read')
all_urls = batchurls + args

# General configuration
cookie_processor = urllib2.HTTPCookieProcessor(jar)
proxy_handler = urllib2.ProxyHandler()
opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
urllib2.install_opener(opener)
socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
print(u'[debug] Proxy map: ' + str(proxy_handler.proxies))

extractors = gen_extractors()

# --list-extractors: show each IE and which of the given URLs it claims.
if opts.list_extractors:
	for ie in extractors:
		matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
		all_urls = filter(lambda url: url not in matchedUrls, all_urls)
		for mu in matchedUrls:

# Conflicting, missing and erroneous options
if opts.usenetrc and (opts.username is not None or opts.password is not None):
	parser.error(u'using .netrc conflicts with giving username/password')
if opts.password is not None and opts.username is None:
	parser.error(u'account username missing')
if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
	parser.error(u'using output template conflicts with using title, literal title or auto number')
if opts.usetitle and opts.useliteral:
	parser.error(u'using title conflicts with using literal title')
# Prompt for the password only when a username was given without one.
if opts.username is not None and opts.password is None:
	opts.password = getpass.getpass(u'Type account password and press return:')
if opts.ratelimit is not None:
	numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
	if numeric_limit is None:
		parser.error(u'invalid rate limit specified')
	opts.ratelimit = numeric_limit
if opts.retries is not None:
	opts.retries = long(opts.retries)
	except (TypeError, ValueError), err:
		parser.error(u'invalid retry count specified')
	opts.playliststart = int(opts.playliststart)
	if opts.playliststart <= 0:
		raise ValueError(u'Playlist start must be positive')
	except (TypeError, ValueError), err:
		parser.error(u'invalid playlist start number specified')
	opts.playlistend = int(opts.playlistend)
	# -1 means "until the last entry"; otherwise the end must be a valid
	# position at or after the start.
	if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
		raise ValueError(u'Playlist end must be greater than playlist start')
	except (TypeError, ValueError), err:
		parser.error(u'invalid playlist end number specified')
if opts.extractaudio:
	if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
		parser.error(u'invalid audio format specified')

# Build the downloader; any of the --get-* flags implies quiet simulation.
fd = FileDownloader({
	'usenetrc': opts.usenetrc,
	'username': opts.username,
	'password': opts.password,
	'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
	'forceurl': opts.geturl,
	'forcetitle': opts.gettitle,
	'forcethumbnail': opts.getthumbnail,
	'forcedescription': opts.getdescription,
	'forcefilename': opts.getfilename,
	'forceformat': opts.getformat,
	'simulate': opts.simulate,
	'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
	'format': opts.format,
	'format_limit': opts.format_limit,
	'listformats': opts.listformats,
	# First matching template wins: explicit -o, then format/title/number
	# driven defaults, finally plain '%(id)s.%(ext)s'.
	'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
		or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
		or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
		or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
		or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
		or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
		or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
		or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
		or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
		or u'%(id)s.%(ext)s'),
	'ignoreerrors': opts.ignoreerrors,
	'ratelimit': opts.ratelimit,
	'nooverwrites': opts.nooverwrites,
	'retries': opts.retries,
	'continuedl': opts.continue_dl,
	'noprogress': opts.noprogress,
	'playliststart': opts.playliststart,
	'playlistend': opts.playlistend,
	# Writing the video to stdout ('-o -') forces logging to stderr.
	'logtostderr': opts.outtmpl == '-',
	'consoletitle': opts.consoletitle,
	'nopart': opts.nopart,
	'updatetime': opts.updatetime,
	'writedescription': opts.writedescription,
	'writeinfojson': opts.writeinfojson,
	'writesubtitles': opts.writesubtitles,
	'subtitleslang': opts.subtitleslang,
	'matchtitle': opts.matchtitle,
	'rejecttitle': opts.rejecttitle,
	'max_downloads': opts.max_downloads,
	'prefer_free_formats': opts.prefer_free_formats,
	'verbose': opts.verbose,
for extractor in extractors:
	fd.add_info_extractor(extractor)

# PostProcessors: audio extraction is the only one wired up here.
if opts.extractaudio:
	fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))

# --update: replace this script (sys.argv[0]) with the latest release.
if opts.update_self:
	updateSelf(fd, sys.argv[0])

if len(all_urls) < 1:
	if not opts.update_self:
		parser.error(u'you must provide at least one URL')

retcode = fd.download(all_urls)
except MaxDownloadsReached:
	fd.to_screen(u'--max-download limit reached, aborting.')

# Dump cookie jar if requested
if opts.cookiefile is not None:
	except (IOError, OSError), err:
		sys.exit(u'ERROR: unable to save cookie jar')

except DownloadError:
except SameFileError:
	sys.exit(u'ERROR: fixed output name but more than one file to download')
except KeyboardInterrupt:
	sys.exit(u'\nERROR: Interrupted by user')
# Script entry point: run the program only when executed directly, not
# when imported as a module.
if __name__ == '__main__':
# vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: