asedeno.scripts.mit.edu Git - youtube-dl.git/blobdiff - youtube_dl/extractor/instagram.py
Return the item itself if playlist has one entry
[youtube-dl.git] / youtube_dl / extractor / instagram.py
index b061850a187567e0d7ed18b5e64ce10e08927244..12e10143cdc100877f3f0233243f80d224599863 100644 (file)
@@ -12,6 +12,7 @@ from ..compat import (
 )
 from ..utils import (
     ExtractorError,
+    float_or_none,
     get_element_by_attribute,
     int_or_none,
     lowercase_escape,
@@ -22,7 +23,7 @@ from ..utils import (
 
 
 class InstagramIE(InfoExtractor):
-    _VALID_URL = r'(?P<url>https?://(?:www\.)?instagram\.com/(?:p|tv)/(?P<id>[^/?#&]+))'
+    _VALID_URL = r'(?P<url>https?://(?:www\.)?instagram\.com/(?:p|tv|reel)/(?P<id>[^/?#&]+))'
     _TESTS = [{
         'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc',
         'md5': '0d2da106a9d2631273e192b372806516',
@@ -32,10 +33,11 @@ class InstagramIE(InfoExtractor):
             'title': 'Video by naomipq',
             'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8',
             'thumbnail': r're:^https?://.*\.jpg',
+            'duration': 0,
             'timestamp': 1371748545,
             'upload_date': '20130620',
             'uploader_id': 'naomipq',
-            'uploader': 'Naomi Leonor Phan-Quang',
+            'uploader': 'B E A U T Y  F O R  A S H E S',
             'like_count': int,
             'comment_count': int,
             'comments': list,
@@ -48,6 +50,7 @@ class InstagramIE(InfoExtractor):
             'ext': 'mp4',
             'title': 'Video by britneyspears',
             'thumbnail': r're:^https?://.*\.jpg',
+            'duration': 0,
             'timestamp': 1453760977,
             'upload_date': '20160125',
             'uploader_id': 'britneyspears',
@@ -86,6 +89,24 @@ class InstagramIE(InfoExtractor):
             'title': 'Post by instagram',
             'description': 'md5:0f9203fc6a2ce4d228da5754bcf54957',
         },
+    }, {
+        # IGTV
+        'url': 'https://www.instagram.com/tv/BkfuX9UB-eK/',
+        'info_dict': {
+            'id': 'BkfuX9UB-eK',
+            'ext': 'mp4',
+            'title': 'Fingerboarding Tricks with @cass.fb',
+            'thumbnail': r're:^https?://.*\.jpg',
+            'duration': 53.83,
+            'timestamp': 1530032919,
+            'upload_date': '20180626',
+            'uploader_id': 'instagram',
+            'uploader': 'Instagram',
+            'like_count': int,
+            'comment_count': int,
+            'comments': list,
+            'description': 'Meet Cass Hirst (@cass.fb), a fingerboarding pro who can perform tiny ollies and kickflips while blindfolded.',
+        }
     }, {
         'url': 'https://instagram.com/p/-Cmh1cukG2/',
         'only_matching': True,
@@ -95,6 +116,9 @@ class InstagramIE(InfoExtractor):
     }, {
         'url': 'https://www.instagram.com/tv/aye83DjauH/',
         'only_matching': True,
+    }, {
+        'url': 'https://www.instagram.com/reel/CDUMkliABpa/',
+        'only_matching': True,
     }]
 
     @staticmethod
@@ -122,9 +146,9 @@ class InstagramIE(InfoExtractor):
 
         webpage = self._download_webpage(url, video_id)
 
-        (video_url, description, thumbnail, timestamp, uploader,
+        (media, video_url, description, thumbnail, timestamp, uploader,
          uploader_id, like_count, comment_count, comments, height,
-         width) = [None] * 11
+         width) = [None] * 12
 
         shared_data = self._parse_json(
             self._search_regex(
@@ -137,59 +161,80 @@ class InstagramIE(InfoExtractor):
                 (lambda x: x['entry_data']['PostPage'][0]['graphql']['shortcode_media'],
                  lambda x: x['entry_data']['PostPage'][0]['media']),
                 dict)
-            if media:
-                video_url = media.get('video_url')
-                height = int_or_none(media.get('dimensions', {}).get('height'))
-                width = int_or_none(media.get('dimensions', {}).get('width'))
-                description = try_get(
-                    media, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'],
-                    compat_str) or media.get('caption')
-                thumbnail = media.get('display_src')
-                timestamp = int_or_none(media.get('taken_at_timestamp') or media.get('date'))
-                uploader = media.get('owner', {}).get('full_name')
-                uploader_id = media.get('owner', {}).get('username')
-
-                def get_count(key, kind):
-                    return int_or_none(try_get(
+        # _sharedData.entry_data.PostPage is empty when authenticated (see
+        # https://github.com/ytdl-org/youtube-dl/pull/22880)
+        if not media:
+            additional_data = self._parse_json(
+                self._search_regex(
+                    r'window\.__additionalDataLoaded\s*\(\s*[^,]+,\s*({.+?})\s*\)\s*;',
+                    webpage, 'additional data', default='{}'),
+                video_id, fatal=False)
+            if additional_data:
+                media = try_get(
+                    additional_data, lambda x: x['graphql']['shortcode_media'],
+                    dict)
+        if media:
+            video_url = media.get('video_url')
+            height = int_or_none(media.get('dimensions', {}).get('height'))
+            width = int_or_none(media.get('dimensions', {}).get('width'))
+            description = try_get(
+                media, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'],
+                compat_str) or media.get('caption')
+            title = media.get('title')
+            thumbnail = media.get('display_src') or media.get('display_url')
+            duration = float_or_none(media.get('video_duration'))
+            timestamp = int_or_none(media.get('taken_at_timestamp') or media.get('date'))
+            uploader = media.get('owner', {}).get('full_name')
+            uploader_id = media.get('owner', {}).get('username')
+
+            def get_count(keys, kind):
+                if not isinstance(keys, (list, tuple)):
+                    keys = [keys]
+                for key in keys:
+                    count = int_or_none(try_get(
                         media, (lambda x: x['edge_media_%s' % key]['count'],
                                 lambda x: x['%ss' % kind]['count'])))
-                like_count = get_count('preview_like', 'like')
-                comment_count = get_count('to_comment', 'comment')
-
-                comments = [{
-                    'author': comment.get('user', {}).get('username'),
-                    'author_id': comment.get('user', {}).get('id'),
-                    'id': comment.get('id'),
-                    'text': comment.get('text'),
-                    'timestamp': int_or_none(comment.get('created_at')),
-                } for comment in media.get(
-                    'comments', {}).get('nodes', []) if comment.get('text')]
-                if not video_url:
-                    edges = try_get(
-                        media, lambda x: x['edge_sidecar_to_children']['edges'],
-                        list) or []
-                    if edges:
-                        entries = []
-                        for edge_num, edge in enumerate(edges, start=1):
-                            node = try_get(edge, lambda x: x['node'], dict)
-                            if not node:
-                                continue
-                            node_video_url = url_or_none(node.get('video_url'))
-                            if not node_video_url:
-                                continue
-                            entries.append({
-                                'id': node.get('shortcode') or node['id'],
-                                'title': 'Video %d' % edge_num,
-                                'url': node_video_url,
-                                'thumbnail': node.get('display_url'),
-                                'width': int_or_none(try_get(node, lambda x: x['dimensions']['width'])),
-                                'height': int_or_none(try_get(node, lambda x: x['dimensions']['height'])),
-                                'view_count': int_or_none(node.get('video_view_count')),
-                            })
-                        return self.playlist_result(
-                            entries, video_id,
-                            'Post by %s' % uploader_id if uploader_id else None,
-                            description)
+                    if count is not None:
+                        return count
+            like_count = get_count('preview_like', 'like')
+            comment_count = get_count(
+                ('preview_comment', 'to_comment', 'to_parent_comment'), 'comment')
+
+            comments = [{
+                'author': comment.get('user', {}).get('username'),
+                'author_id': comment.get('user', {}).get('id'),
+                'id': comment.get('id'),
+                'text': comment.get('text'),
+                'timestamp': int_or_none(comment.get('created_at')),
+            } for comment in media.get(
+                'comments', {}).get('nodes', []) if comment.get('text')]
+            if not video_url:
+                edges = try_get(
+                    media, lambda x: x['edge_sidecar_to_children']['edges'],
+                    list) or []
+                if edges:
+                    entries = []
+                    for edge_num, edge in enumerate(edges, start=1):
+                        node = try_get(edge, lambda x: x['node'], dict)
+                        if not node:
+                            continue
+                        node_video_url = url_or_none(node.get('video_url'))
+                        if not node_video_url:
+                            continue
+                        entries.append({
+                            'id': node.get('shortcode') or node['id'],
+                            'title': node.get('title') or 'Video %d' % edge_num,
+                            'url': node_video_url,
+                            'thumbnail': node.get('display_url'),
+                            'duration': float_or_none(node.get('video_duration')),
+                            'width': int_or_none(try_get(node, lambda x: x['dimensions']['width'])),
+                            'height': int_or_none(try_get(node, lambda x: x['dimensions']['height'])),
+                            'view_count': int_or_none(node.get('video_view_count')),
+                        })
+                    return self.playlist_result(
+                        entries, video_id,
+                        'Post by %s' % uploader_id if uploader_id else None,
+                        description)
 
         if not video_url:
             video_url = self._og_search_video_url(webpage, secure=False)
@@ -218,8 +263,9 @@ class InstagramIE(InfoExtractor):
             'id': video_id,
             'formats': formats,
             'ext': 'mp4',
-            'title': 'Video by %s' % uploader_id,
+            'title': title or 'Video by %s' % uploader_id,
             'description': description,
+            'duration': duration,
             'thumbnail': thumbnail,
             'timestamp': timestamp,
             'uploader_id': uploader_id,