asedeno.scripts.mit.edu Git - youtube-dl.git/blob - youtube_dl/jsinterp.py
Try for timestamp, description from window.__INITIAL_DATA__ pages
[youtube-dl.git] / youtube_dl / jsinterp.py
1 from __future__ import unicode_literals
2
3 import json
4 import operator
5 import re
6
7 from .utils import (
8     ExtractorError,
9     remove_quotes,
10 )
11 from .compat import (
12     compat_collections_abc,
13     compat_str,
14 )
15 MutableMapping = compat_collections_abc.MutableMapping
16
17
class Nonlocal:
    """Plain attribute holder.

    Used to share a rebindable value with nested functions, since this
    code base cannot rely on the ``nonlocal`` statement.
    """
20
21
# Binary operators paired with the callables that evaluate them.
# The list is scanned front to back, so an operator listed earlier is the
# one an expression gets split on first.
_OPERATORS = [
    ('|', operator.or_),
    ('^', operator.xor),
    ('&', operator.and_),
    ('>>', operator.rshift),
    ('<<', operator.lshift),
    ('-', operator.sub),
    ('+', operator.add),
    ('%', operator.mod),
    ('/', operator.truediv),
    ('*', operator.mul),
]

# Compound assignments ("|=", "+=", ...) reuse the binary callables; plain
# "=" comes last and simply yields the right-hand value.
_ASSIGN_OPERATORS = [
    (op + '=', opfunc) for op, opfunc in _OPERATORS
] + [('=', (lambda cur, right: right))]

# Pattern for a JS identifier.
_NAME_RE = r'[a-zA-Z_$][a-zA-Z_$0-9]*'

# Opening bracket -> matching closing bracket.
_MATCHING_PARENS = {'(': ')', '{': '}', '[': ']'}
40
41
class JS_Break(ExtractorError):
    """Raised when a JS ``break`` is evaluated; loop/switch handlers catch it."""

    def __init__(self):
        super(JS_Break, self).__init__('Invalid break')
45
46
class JS_Continue(ExtractorError):
    """Raised when a JS ``continue`` is evaluated; loop handlers catch it."""

    def __init__(self):
        super(JS_Continue, self).__init__('Invalid continue')
50
51
class LocalNameSpace(MutableMapping):
    """Variable namespace layered over a stack of mappings.

    ``stack[0]`` is the innermost scope.  Reads return the value from the
    first scope that holds the key.  Writes rebind the key in the first
    scope that already holds it and otherwise create it in ``stack[0]``.
    """

    def __init__(self, *stack):
        """Store the given scopes, innermost first."""
        self.stack = tuple(stack)

    def __getitem__(self, key):
        """Return the value from the first scope containing *key*.

        Raises KeyError when no scope contains it.
        """
        for scope in self.stack:
            if key in scope:
                return scope[key]
        raise KeyError(key)

    def __setitem__(self, key, value):
        """Rebind *key* where it already lives, else define it innermost."""
        for scope in self.stack:
            if key in scope:
                scope[key] = value
                break
        else:
            # Not defined anywhere yet: create it in the innermost scope.
            self.stack[0][key] = value
        return value

    def __delitem__(self, key):
        """Deleting variables is not supported."""
        raise NotImplementedError('Deleting is not supported')

    def __iter__(self):
        """Yield the keys of every scope, innermost scope first.

        A key present in several scopes is yielded once per scope.
        """
        for scope in self.stack:
            for scope_item in iter(scope):
                yield scope_item

    def __len__(self):
        """Return the number of keys that iteration yields.

        The previous version required an extra ``key`` argument and called
        ``len()`` on a generator, so ``len(namespace)`` always raised
        TypeError.
        """
        return sum(1 for _ in self)

    def __repr__(self):
        return 'LocalNameSpace%s' % (self.stack, )
84
85
86 class JSInterpreter(object):
87     def __init__(self, code, objects=None):
88         if objects is None:
89             objects = {}
90         self.code = code
91         self._functions = {}
92         self._objects = objects
93         self.__named_object_counter = 0
94
95     def _named_object(self, namespace, obj):
96         self.__named_object_counter += 1
97         name = '__youtube_dl_jsinterp_obj%s' % (self.__named_object_counter, )
98         namespace[name] = obj
99         return name
100
101     @staticmethod
102     def _separate(expr, delim=',', max_split=None):
103         if not expr:
104             return
105         counters = {k: 0 for k in _MATCHING_PARENS.values()}
106         start, splits, pos, delim_len = 0, 0, 0, len(delim) - 1
107         for idx, char in enumerate(expr):
108             if char in _MATCHING_PARENS:
109                 counters[_MATCHING_PARENS[char]] += 1
110             elif char in counters:
111                 counters[char] -= 1
112             if char != delim[pos] or any(counters.values()):
113                 pos = 0
114                 continue
115             elif pos != delim_len:
116                 pos += 1
117                 continue
118             yield expr[start: idx - delim_len]
119             start, pos = idx + 1, 0
120             splits += 1
121             if max_split and splits >= max_split:
122                 break
123         yield expr[start:]
124
125     @staticmethod
126     def _separate_at_paren(expr, delim):
127         separated = list(JSInterpreter._separate(expr, delim, 1))
128         if len(separated) < 2:
129             raise ExtractorError('No terminating paren {0} in {1}'.format(delim, expr))
130         return separated[0][1:].strip(), separated[1].strip()
131
132     def interpret_statement(self, stmt, local_vars, allow_recursion=100):
133         if allow_recursion < 0:
134             raise ExtractorError('Recursion limit reached')
135
136         sub_statements = list(self._separate(stmt, ';'))
137         stmt = (sub_statements or ['']).pop()
138         for sub_stmt in sub_statements:
139             ret, should_abort = self.interpret_statement(sub_stmt, local_vars, allow_recursion - 1)
140             if should_abort:
141                 return ret
142
143         should_abort = False
144         stmt = stmt.lstrip()
145         stmt_m = re.match(r'var\s', stmt)
146         if stmt_m:
147             expr = stmt[len(stmt_m.group(0)):]
148         else:
149             return_m = re.match(r'return(?:\s+|$)', stmt)
150             if return_m:
151                 expr = stmt[len(return_m.group(0)):]
152                 should_abort = True
153             else:
154                 # Try interpreting it as an expression
155                 expr = stmt
156
157         v = self.interpret_expression(expr, local_vars, allow_recursion)
158         return v, should_abort
159
    def interpret_expression(self, expr, local_vars, allow_recursion):
        """Evaluate the JS expression *expr* against *local_vars*.

        The expression is tried against a fixed sequence of forms: bracketed
        groups, ``try``/``catch``/``for``/``switch``, comma lists, ``++``/``--``,
        assignments, literals, names, indexing, binary operators, member
        access / method calls and plain function calls.  The first form that
        applies decides the result.

        Returns the evaluated value, or None for an empty expression.
        Raises JS_Break / JS_Continue for ``break`` / ``continue`` and
        ExtractorError for unsupported or malformed input.
        """
        expr = expr.strip()
        if expr == '':  # Empty expression
            return None

        # Brace block: run the inside as a statement.  If text follows and
        # the block did not return, the result is spliced back in as JSON.
        if expr.startswith('{'):
            inner, outer = self._separate_at_paren(expr, '}')
            inner, should_abort = self.interpret_statement(inner, local_vars, allow_recursion - 1)
            if not outer or should_abort:
                return inner
            else:
                expr = json.dumps(inner) + outer

        # Parenthesised group: evaluate the inside, splice the result back
        # in as JSON when more text follows.
        if expr.startswith('('):
            inner, outer = self._separate_at_paren(expr, ')')
            inner = self.interpret_expression(inner, local_vars, allow_recursion)
            if not outer:
                return inner
            else:
                expr = json.dumps(inner) + outer

        # Array literal: evaluate the items into a list, store it under a
        # generated name and continue with that name in place of the literal.
        if expr.startswith('['):
            inner, outer = self._separate_at_paren(expr, ']')
            name = self._named_object(local_vars, [
                self.interpret_expression(item, local_vars, allow_recursion)
                for item in self._separate(inner)])
            expr = name + outer

        # try: the guarded code is simply executed; no error handling is
        # emulated here.
        # NOTE(review): the pattern has no word boundary, so it also matches
        # identifiers that merely start with "try", and expr[m.end()] raises
        # IndexError when expr is exactly "try" - confirm whether either can
        # occur in practice.
        m = re.match(r'try\s*', expr)
        if m:
            if expr[m.end()] == '{':
                try_expr, expr = self._separate_at_paren(expr[m.end():], '}')
            else:
                try_expr, expr = expr[m.end() - 1:], ''
            ret, should_abort = self.interpret_statement(try_expr, local_vars, allow_recursion - 1)
            if should_abort:
                return ret
            return self.interpret_statement(expr, local_vars, allow_recursion - 1)[0]

        m = re.match(r'(?:(?P<catch>catch)|(?P<for>for)|(?P<switch>switch))\s*\(', expr)
        md = m.groupdict() if m else {}
        if md.get('catch'):
            # We ignore the catch block
            _, expr = self._separate_at_paren(expr, '}')
            return self.interpret_statement(expr, local_vars, allow_recursion - 1)[0]

        elif md.get('for'):
            def raise_constructor_error(c):
                raise ExtractorError(
                    'Premature return in the initialization of a for loop in {0!r}'.format(c))

            # constructor is the text between the parentheses of "for (...)".
            constructor, remaining = self._separate_at_paren(expr[m.end() - 1:], ')')
            if remaining.startswith('{'):
                body, expr = self._separate_at_paren(remaining, '}')
            else:
                m = re.match(r'switch\s*\(', remaining)  # FIXME
                if m:
                    # Brace-less body that is a switch: rebuild it as one
                    # statement so the switch branch can handle it.
                    switch_val, remaining = self._separate_at_paren(remaining[m.end() - 1:], ')')
                    body, expr = self._separate_at_paren(remaining, '}')
                    body = 'switch(%s){%s}' % (switch_val, body)
                else:
                    body, expr = remaining, ''
            # Unpacking requires exactly three ';'-separated parts.
            start, cndn, increment = self._separate(constructor, ';')
            if self.interpret_statement(start, local_vars, allow_recursion - 1)[1]:
                raise_constructor_error(constructor)
            while True:
                if not self.interpret_expression(cndn, local_vars, allow_recursion):
                    break
                try:
                    ret, should_abort = self.interpret_statement(body, local_vars, allow_recursion - 1)
                    if should_abort:
                        return ret
                except JS_Break:
                    break
                except JS_Continue:
                    # Fall through to the increment step.
                    pass
                if self.interpret_statement(increment, local_vars, allow_recursion - 1)[1]:
                    raise_constructor_error(constructor)
            # Continue with whatever follows the loop.
            return self.interpret_statement(expr, local_vars, allow_recursion - 1)[0]

        elif md.get('switch'):
            switch_val, remaining = self._separate_at_paren(expr[m.end() - 1:], ')')
            switch_val = self.interpret_expression(switch_val, local_vars, allow_recursion)
            body, expr = self._separate_at_paren(remaining, '}')
            # Turn "default:" into a pseudo case so one split covers everything.
            items = body.replace('default:', 'case default:').split('case ')[1:]
            # Two passes: first look for a case equal to switch_val, then (only
            # if nothing matched) start from the default label.
            for default in (False, True):
                matched = False
                for item in items:
                    case, stmt = [i.strip() for i in self._separate(item, ':', 1)]
                    if default:
                        matched = matched or case == 'default'
                    elif not matched:
                        matched = (case != 'default'
                                   and switch_val == self.interpret_expression(case, local_vars, allow_recursion))
                    if not matched:
                        continue
                    # Once matched, later items also run (fall-through) until
                    # a break is raised.
                    try:
                        ret, should_abort = self.interpret_statement(stmt, local_vars, allow_recursion - 1)
                        if should_abort:
                            return ret
                    except JS_Break:
                        break
                if matched:
                    break
            return self.interpret_statement(expr, local_vars, allow_recursion - 1)[0]

        # Comma separated statements
        # All but the last are evaluated for their side effects only.
        sub_expressions = list(self._separate(expr))
        expr = sub_expressions.pop().strip() if sub_expressions else ''
        for sub_expr in sub_expressions:
            self.interpret_expression(sub_expr, local_vars, allow_recursion)

        # Pre/post increment and decrement: update the variable and replace
        # the operator expression with the resulting value as JSON.
        # NOTE(review): the spans come from the string finditer() was given,
        # while expr is rewritten inside the loop; with more than one match
        # and a replacement of different length the offsets look stale -
        # confirm.
        for m in re.finditer(r'''(?x)
                (?P<pre_sign>\+\+|--)(?P<var1>%(_NAME_RE)s)|
                (?P<var2>%(_NAME_RE)s)(?P<post_sign>\+\+|--)''' % globals(), expr):
            var = m.group('var1') or m.group('var2')
            start, end = m.span()
            sign = m.group('pre_sign') or m.group('post_sign')
            ret = local_vars[var]
            local_vars[var] += 1 if sign[0] == '+' else -1
            if m.group('pre_sign'):
                # Prefix form yields the updated value.
                ret = local_vars[var]
            expr = expr[:start] + json.dumps(ret) + expr[end:]

        # Assignment, optionally to an indexed element: name[index] op= expr.
        for op, opfunc in _ASSIGN_OPERATORS:
            m = re.match(r'''(?x)
                (?P<out>%s)(?:\[(?P<index>[^\]]+?)\])?
                \s*%s
                (?P<expr>.*)$''' % (_NAME_RE, re.escape(op)), expr)
            if not m:
                continue
            right_val = self.interpret_expression(m.group('expr'), local_vars, allow_recursion)

            if m.groupdict().get('index'):
                lvar = local_vars[m.group('out')]
                idx = self.interpret_expression(m.group('index'), local_vars, allow_recursion)
                if not isinstance(idx, int):
                    raise ExtractorError('List indices must be integers: %s' % (idx, ))
                cur = lvar[idx]
                val = opfunc(cur, right_val)
                lvar[idx] = val
                return val
            else:
                # .get(): the variable may not exist yet (cur is then None).
                cur = local_vars.get(m.group('out'))
                val = opfunc(cur, right_val)
                local_vars[m.group('out')] = val
                return val

        # Unsigned integer literal.
        if expr.isdigit():
            return int(expr)

        if expr == 'break':
            raise JS_Break()
        elif expr == 'continue':
            raise JS_Continue()

        # Bare variable name; the lookahead skips anything that begins with
        # one of the listed keywords/literals.
        var_m = re.match(
            r'(?!if|return|true|false|null)(?P<name>%s)$' % _NAME_RE,
            expr)
        if var_m:
            return local_vars[var_m.group('name')]

        # Anything that parses as JSON (strings, numbers, true/false/null, ...).
        try:
            return json.loads(expr)
        except ValueError:
            pass

        # Indexing: name[expression]
        m = re.match(
            r'(?P<in>%s)\[(?P<idx>.+)\]$' % _NAME_RE, expr)
        if m:
            val = local_vars[m.group('in')]
            idx = self.interpret_expression(m.group('idx'), local_vars, allow_recursion)
            return val[idx]

        def raise_expr_error(where, op, exp):
            raise ExtractorError('Premature {0} return of {1} in {2!r}'.format(where, op, exp))

        # Binary operators, tried in _OPERATORS order.  The expression is
        # split at the last top-level occurrence of the operator: the final
        # piece is the right operand, everything before it the left one.
        for op, opfunc in _OPERATORS:
            separated = list(self._separate(expr, op))
            if len(separated) < 2:
                continue
            right_val = separated.pop()
            left_val = op.join(separated)
            left_val, should_abort = self.interpret_statement(
                left_val, local_vars, allow_recursion - 1)
            if should_abort:
                raise_expr_error('left-side', op, expr)
            right_val, should_abort = self.interpret_statement(
                right_val, local_vars, allow_recursion - 1)
            if should_abort:
                raise_expr_error('right-side', op, expr)
            # A falsy left operand (e.g. the empty left side of a leading
            # "-", which evaluates to None) is treated as 0.
            return opfunc(left_val or 0, right_val)

        # Member access or method call: var.member / var[member], optionally
        # followed by an argument list.
        m = re.match(
            r'(?P<var>%s)(?:\.(?P<member>[^(]+)|\[(?P<member2>[^]]+)\])\s*' % _NAME_RE,
            expr)
        if m:
            variable = m.group('var')
            # Holder so the nested functions can rebind "member".
            nl = Nonlocal()

            nl.member = remove_quotes(m.group('member') or m.group('member2'))
            arg_str = expr[m.end():]
            if arg_str.startswith('('):
                arg_str, remaining = self._separate_at_paren(arg_str, ')')
            else:
                # arg_str None marks plain member access (no call).
                arg_str, remaining = None, arg_str

            def assertion(cndn, msg):
                """ assert, but without risk of getting optimized out """
                if not cndn:
                    raise ExtractorError('{0} {1}: {2}'.format(nl.member, msg, expr))

            def eval_method():
                # nonlocal member
                member = nl.member
                # Resolve the receiver: the String builtin, a local variable,
                # or an object extracted (and cached) from the JS source.
                if variable == 'String':
                    obj = compat_str
                elif variable in local_vars:
                    obj = local_vars[variable]
                else:
                    if variable not in self._objects:
                        self._objects[variable] = self.extract_object(variable)
                    obj = self._objects[variable]

                if arg_str is None:
                    # Member access
                    if member == 'length':
                        return len(obj)
                    return obj[member]

                # Function call
                argvals = [
                    self.interpret_expression(v, local_vars, allow_recursion)
                    for v in self._separate(arg_str)]

                if obj == compat_str:
                    if member == 'fromCharCode':
                        assertion(argvals, 'takes one or more arguments')
                        return ''.join(map(chr, argvals))
                    raise ExtractorError('Unsupported string method %s' % (member, ))

                # Emulated built-in methods.
                if member == 'split':
                    assertion(argvals, 'takes one or more arguments')
                    assertion(argvals == [''], 'with arguments is not implemented')
                    return list(obj)
                elif member == 'join':
                    assertion(isinstance(obj, list), 'must be applied on a list')
                    assertion(len(argvals) == 1, 'takes exactly one argument')
                    return argvals[0].join(obj)
                elif member == 'reverse':
                    assertion(not argvals, 'does not take any arguments')
                    obj.reverse()
                    return obj
                elif member == 'slice':
                    assertion(isinstance(obj, list), 'must be applied on a list')
                    assertion(len(argvals) == 1, 'takes exactly one argument')
                    return obj[argvals[0]:]
                elif member == 'splice':
                    assertion(isinstance(obj, list), 'must be applied on a list')
                    assertion(argvals, 'takes one or more arguments')
                    # howMany defaults to len(obj) when only an index is given.
                    index, howMany = map(int, (argvals + [len(obj)])[:2])
                    if index < 0:
                        index += len(obj)
                    add_items = argvals[2:]
                    res = []
                    # Remove in place; always pop at index since items shift down.
                    for i in range(index, min(index + howMany, len(obj))):
                        res.append(obj.pop(index))
                    for i, item in enumerate(add_items):
                        obj.insert(index + i, item)
                    return res
                elif member == 'unshift':
                    assertion(isinstance(obj, list), 'must be applied on a list')
                    assertion(argvals, 'takes one or more arguments')
                    # Insert in reverse so the arguments keep their order.
                    for item in reversed(argvals):
                        obj.insert(0, item)
                    return obj
                elif member == 'pop':
                    assertion(isinstance(obj, list), 'must be applied on a list')
                    assertion(not argvals, 'does not take any arguments')
                    if not obj:
                        return
                    return obj.pop()
                elif member == 'push':
                    assertion(argvals, 'takes one or more arguments')
                    obj.extend(argvals)
                    return obj
                elif member == 'forEach':
                    assertion(argvals, 'takes one or more arguments')
                    assertion(len(argvals) <= 2, 'takes at-most 2 arguments')
                    # The callback gets (item, index, list) as its arguments
                    # and the optional second argument as keyword "this".
                    f, this = (argvals + [''])[:2]
                    return [f((item, idx, obj), this=this) for idx, item in enumerate(obj)]
                elif member == 'indexOf':
                    assertion(argvals, 'takes one or more arguments')
                    assertion(len(argvals) <= 2, 'takes at-most 2 arguments')
                    idx, start = (argvals + [0])[:2]
                    try:
                        return obj.index(idx, start)
                    except ValueError:
                        return -1

                # Otherwise call the member itself, passing the arguments as
                # one sequence; list members are addressed by integer index.
                if isinstance(obj, list):
                    member = int(member)
                    nl.member = member
                return obj[member](argvals)

            if remaining:
                # Chained access: store the intermediate result under a
                # generated name and evaluate the rest against it.
                return self.interpret_expression(
                    self._named_object(local_vars, eval_method()) + remaining,
                    local_vars, allow_recursion)
            else:
                return eval_method()

        # Plain function call whose arguments are integer literals or names.
        m = re.match(r'^(?P<func>%s)\((?P<args>[a-zA-Z0-9_$,]*)\)$' % _NAME_RE, expr)
        if m:
            fname = m.group('func')
            argvals = tuple([
                int(v) if v.isdigit() else local_vars[v]
                for v in self._separate(m.group('args'))])
            if fname in local_vars:
                return local_vars[fname](argvals)
            elif fname not in self._functions:
                # Extract from the JS source on first use and cache it.
                self._functions[fname] = self.extract_function(fname)
            return self._functions[fname](argvals)

        if expr:
            raise ExtractorError('Unsupported JS expression %r' % expr)
486
487     def extract_object(self, objname):
488         _FUNC_NAME_RE = r'''(?:[a-zA-Z$0-9]+|"[a-zA-Z$0-9]+"|'[a-zA-Z$0-9]+')'''
489         obj = {}
490         obj_m = re.search(
491             r'''(?x)
492                 (?<!this\.)%s\s*=\s*{\s*
493                     (?P<fields>(%s\s*:\s*function\s*\(.*?\)\s*{.*?}(?:,\s*)?)*)
494                 }\s*;
495             ''' % (re.escape(objname), _FUNC_NAME_RE),
496             self.code)
497         fields = obj_m.group('fields')
498         # Currently, it only supports function definitions
499         fields_m = re.finditer(
500             r'''(?x)
501                 (?P<key>%s)\s*:\s*function\s*\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}
502             ''' % _FUNC_NAME_RE,
503             fields)
504         for f in fields_m:
505             argnames = f.group('args').split(',')
506             obj[remove_quotes(f.group('key'))] = self.build_function(argnames, f.group('code'))
507
508         return obj
509
510     def extract_function_code(self, funcname):
511         """ @returns argnames, code """
512         func_m = re.search(
513             r'''(?x)
514                 (?:function\s+%(f_n)s|[{;,]\s*%(f_n)s\s*=\s*function|var\s+%(f_n)s\s*=\s*function)\s*
515                 \((?P<args>[^)]*)\)\s*
516                 (?P<code>\{(?:(?!};)[^"]|"([^"]|\\")*")+\})''' % {'f_n': re.escape(funcname), },
517             self.code)
518         code, _ = self._separate_at_paren(func_m.group('code'), '}')  # refine the match
519         if func_m is None:
520             raise ExtractorError('Could not find JS function %r' % funcname)
521         return func_m.group('args').split(','), code
522
523     def extract_function(self, funcname):
524         return self.extract_function_from_code(*self.extract_function_code(funcname))
525
526     def extract_function_from_code(self, argnames, code, *global_stack):
527         local_vars = {}
528         while True:
529             mobj = re.search(r'function\((?P<args>[^)]*)\)\s*{', code)
530             if mobj is None:
531                 break
532             start, body_start = mobj.span()
533             body, remaining = self._separate_at_paren(code[body_start - 1:], '}')
534             name = self._named_object(
535                 local_vars,
536                 self.extract_function_from_code(
537                     [x.strip() for x in mobj.group('args').split(',')],
538                     body, local_vars, *global_stack))
539             code = code[:start] + name + remaining
540         return self.build_function(argnames, code, local_vars, *global_stack)
541
542     def call_function(self, funcname, *args):
543         return self.extract_function(funcname)(args)
544
545     def build_function(self, argnames, code, *global_stack):
546         global_stack = list(global_stack) or [{}]
547         local_vars = global_stack.pop(0)
548
549         def resf(args, **kwargs):
550             local_vars.update(dict(zip(argnames, args)))
551             local_vars.update(kwargs)
552             var_stack = LocalNameSpace(local_vars, *global_stack)
553             for stmt in self._separate(code.replace('\n', ''), ';'):
554                 ret, should_abort = self.interpret_statement(stmt, var_stack)
555                 if should_abort:
556                     break
557             return ret
558         return resf