asedeno.scripts.mit.edu Git - youtube-dl.git/blob - youtube_dl/extractor/googlepodcasts.py
[googlepodcasts] Add new extractor
[youtube-dl.git] / youtube_dl / extractor / googlepodcasts.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import json
5 import re
6
7 from .common import InfoExtractor
8 from ..utils import (
9     clean_podcast_url,
10     int_or_none,
11     try_get,
12     urlencode_postdata,
13 )
14
15
class GooglePodcastsBaseIE(InfoExtractor):
    """Shared plumbing for the podcasts.google.com extractors.

    Provides the common URL prefix, a helper that issues one ``batchexecute``
    request, and a helper that maps one positional episode record onto an
    info dict.
    """

    # Common prefix of every URL handled by the subclasses.
    _VALID_URL_BASE = r'https?://podcasts\.google\.com/feed/'

    def _batch_execute(self, func_id, video_id, params):
        """Call one ``batchexecute`` function and return its decoded payload.

        ``func_id`` and the JSON-serialised ``params`` are wrapped in a triply
        nested list and posted as the ``f.req`` form field. ``video_id`` is
        only forwarded to ``_download_json``.

        The response body is reduced to its widest ``[...]`` span before JSON
        parsing. Element ``[0][2]`` of the parsed response is itself a JSON
        string and is decoded a second time; that result is returned.
        """
        request_body = json.dumps([[[func_id, json.dumps(params), None, '1']]])

        def _bracketed_part(text):
            # Keep only the outermost bracketed span of the raw response
            # (greedy match, DOTALL) so it can be parsed as JSON.
            return self._search_regex(r'(?s)(\[.+\])', text, 'data')

        envelope = self._download_json(
            'https://podcasts.google.com/_/PodcastsUi/data/batchexecute',
            video_id,
            data=urlencode_postdata({'f.req': request_body}),
            transform_source=_bracketed_part)
        # The inner payload is delivered as a JSON-encoded string.
        return json.loads(envelope[0][2])

    def _extract_episode(self, episode):
        """Build an info dict from one positional ``episode`` record.

        Fields are read by fixed index. Only ``creator`` (index 14) is looked
        up through ``try_get``; every other index is accessed directly, so a
        record too short for them raises.
        """
        episode_id = episode[4][3]
        title = episode[8]
        media_url = clean_podcast_url(episode[13])
        thumbnail = episode[2]
        description = episode[9]
        creator = try_get(episode, lambda x: x[14])
        timestamp = int_or_none(episode[11])
        duration = int_or_none(episode[12])
        series = episode[1]

        return {
            'id': episode_id,
            'title': title,
            'url': media_url,
            'thumbnail': thumbnail,
            'description': description,
            'creator': creator,
            'timestamp': timestamp,
            'duration': duration,
            'series': series,
        }
38
39
class GooglePodcastsIE(GooglePodcastsBaseIE):
    """Extractor for a single episode URL (``/feed/<feed>/episode/<id>``)."""

    IE_NAME = 'google:podcasts'
    _VALID_URL = GooglePodcastsBaseIE._VALID_URL_BASE + r'(?P<feed_url>[^/]+)/episode/(?P<id>[^/?&#]+)'
    _TEST = {
        'url': 'https://podcasts.google.com/feed/aHR0cHM6Ly9mZWVkcy5ucHIub3JnLzM0NDA5ODUzOS9wb2RjYXN0LnhtbA/episode/MzBlNWRlN2UtOWE4Yy00ODcwLTk2M2MtM2JlMmUyNmViOTRh',
        'md5': 'fa56b2ee8bd0703e27e42d4b104c4766',
        'info_dict': {
            'id': '30e5de7e-9a8c-4870-963c-3be2e26eb94a',
            'ext': 'mp3',
            'title': 'WWDTM New Year 2021',
            'description': 'We say goodbye to 2020 with Christine Baranksi, Doug Jones, Jonna Mendez, and Kellee Edwards.',
            'upload_date': '20210102',
            'timestamp': 1609606800,
            'duration': 2901,
            'series': "Wait Wait... Don't Tell Me!",
        }
    }

    def _real_extract(self, url):
        """Fetch one episode record and convert it into an info dict.

        The feed and episode path segments of ``url`` are passed through to
        the ``oNjqVe`` batchexecute function unchanged; the episode segment
        also serves as the download's ``video_id``. Element ``[1]`` of the
        decoded payload is the episode record.
        """
        mobj = re.match(self._VALID_URL, url)
        feed_segment = mobj.group('feed_url')
        episode_segment = mobj.group('id')

        payload = self._batch_execute(
            'oNjqVe', episode_segment, [feed_segment, episode_segment])
        return self._extract_episode(payload[1])
63
64
class GooglePodcastsFeedIE(GooglePodcastsBaseIE):
    """Extractor for a whole feed URL (``/feed/<id>``), returned as a playlist."""

    IE_NAME = 'google:podcasts:feed'
    _VALID_URL = GooglePodcastsBaseIE._VALID_URL_BASE + r'(?P<id>[^/?&#]+)/?(?:[?#&]|$)'
    _TEST = {
        'url': 'https://podcasts.google.com/feed/aHR0cHM6Ly9mZWVkcy5ucHIub3JnLzM0NDA5ODUzOS9wb2RjYXN0LnhtbA',
        'info_dict': {
            'title': "Wait Wait... Don't Tell Me!",
            'description': "NPR's weekly current events quiz. Have a laugh and test your news knowledge while figuring out what's real and what we've made up.",
        },
        'playlist_mincount': 20,
    }

    def _real_extract(self, url):
        """Fetch a feed via the ``ncqJEe`` function and build a playlist.

        Episode records are read from ``[1][0]`` of the decoded payload and
        feed metadata from ``[3]`` (title at ``[0]``, description at ``[2]``).
        Each lookup goes through ``try_get``, so a missing part yields an
        empty playlist or ``None`` metadata rather than an error.
        """
        feed_segment = self._match_id(url)
        payload = self._batch_execute('ncqJEe', feed_segment, [feed_segment])

        episode_records = try_get(payload, lambda x: x[1][0]) or []
        entries = [self._extract_episode(record) for record in episode_records]

        feed_meta = try_get(payload, lambda x: x[3]) or []
        feed_title = try_get(feed_meta, lambda x: x[0])
        feed_description = try_get(feed_meta, lambda x: x[2])

        return self.playlist_result(
            entries,
            playlist_title=feed_title,
            playlist_description=feed_description)