Skip to content

Commit

Permalink
[ie/southpark] Fix extractor
Browse files Browse the repository at this point in the history
detached from MTVServicesInfoExtractor
  • Loading branch information
kclauhk committed Jan 21, 2025
1 parent fa4e174 commit eb850bf
Show file tree
Hide file tree
Showing 2 changed files with 184 additions and 41 deletions.
1 change: 1 addition & 0 deletions yt_dlp/extractor/_extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -1964,6 +1964,7 @@
SouthParkIE,
SouthParkLatIE,
SouthParkNlIE,
SouthParkSeasonsIE,
)
from .sovietscloset import (
SovietsClosetIE,
Expand Down
224 changes: 183 additions & 41 deletions yt_dlp/extractor/southpark.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,42 @@
from .mtv import MTVServicesInfoExtractor
import codecs
import re
import urllib.parse

from .common import InfoExtractor
from ..utils import (
ExtractorError,
determine_ext,
float_or_none,
int_or_none,
join_nonempty,
merge_dicts,
parse_duration,
str_or_none,
traverse_obj,
unified_strdate,
url_or_none,
)

class SouthParkIE(MTVServicesInfoExtractor):
IE_NAME = 'southpark.cc.com'
_VALID_URL = r'https?://(?:www\.)?(?P<url>southpark(?:\.cc|studios)\.com/((?:video-)?clips|(?:full-)?episodes|collections)/(?P<id>.+?)(\?|#|$))'

_FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed'

class SouthParkIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?southpark(?:\.cc|studios)\.com/((?:video-)?clips|(?:full-)?episodes|collections)/(?P<id>.+?)(\?|#|$)'
_TESTS = [{
'url': 'https://southpark.cc.com/video-clips/d7wr06/south-park-you-all-agreed-to-counseling',
'info_dict': {
'id': '31929ad5-8269-11eb-8774-70df2f866ace',
'ext': 'mp4',
'title': 'You All Agreed to Counseling',
'description': 'Kenny, Cartman, Stan, and Kyle visit Mr. Mackey and ask for his help getting Mrs. Nelson to come back. Mr. Mackey reveals the only way to get things back to normal is to get the teachers vaccinated.',
'thumbnail': 'https://images.paramount.tech/uri/mgid:arc:imageassetref:shared.southpark.global:c65bada8-b837-4c1a-82e4-fba7935c7c42',
'timestamp': 1615352400,
'upload_date': '20210310',
'release_date': '20210310',
'season': 'Season 24',
'episode': 'Episode 2',
'season_number': 24,
'episode_number': 2,
'tags': 'count:24',
'duration': 134.552,
},
}, {
'url': 'http://southpark.cc.com/collections/7758/fan-favorites/1',
Expand All @@ -24,14 +46,63 @@ class SouthParkIE(MTVServicesInfoExtractor):
'only_matching': True,
}]

def _get_feed_query(self, uri):
return {
'accountOverride': 'intl.mtvi.com',
'arcEp': 'shared.southpark.global',
'ep': '90877963',
'imageEp': 'shared.southpark.global',
'mgid': uri,
}
def _entries(self, video_data, video_id):
chapters, formats, subtitles = None, [], {}
if service_url := codecs.decode(video_data.get('videoServiceUrl', ''), 'unicode-escape'):
mica_json = self._download_json(service_url.split('?')[0] + '?clientPlatform=desktop', video_id)
if source := traverse_obj(mica_json, ('stitchedstream', 'source', {url_or_none})):
if determine_ext(source) == 'm3u8':
formats, subtitles = self._extract_m3u8_formats_and_subtitles(
source, video_id, 'mp4', fatal=False, m3u8_id='hls')
else:
formats.append({'url': source})
chapter_data = traverse_obj(mica_json, ('content', 0, 'chapters'), default=[])
if len(chapter_data) > 1:
chapters = traverse_obj(chapter_data, (..., {
'start_time': ({lambda v: parse_duration(v['contentoffset'])}),
'end_time': ({lambda v: parse_duration(v['contentoffset']) + parse_duration(v['duration'])}),
}))
return merge_dicts(traverse_obj(video_data, {
'id': ('id', {str}),
'title': ('title', {str}),
'description': (('fullDescription', 'description'), {str_or_none}),
'thumbnail': ('images', 0, 'url',
{lambda v: url_or_none(codecs.decode(v, 'unicode-escape')).split('?')[0]}),
'timestamp': ('publishDate', 'timestamp', {int_or_none}),
'upload_date': ('publishDate', 'dateString', {unified_strdate}),
'release_date': ('airDate', 'dateString', {unified_strdate}),
'season_number': ('seasonNumber', {int_or_none}),
'episode_number': ('episodeAiringOrder', {int_or_none}),
'tags': ('keywords', {lambda v: re.split(r',\s*', v)}),
'genres': ('genres'),
'duration': ('duration', 'milliseconds', {lambda v: float_or_none(v, 1000)}),
}, get_all=False), {
'id': video_id,
'chapters': chapters,
'formats': formats,
'subtitles': subtitles,
})

def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
page_data = self._search_json(r'window\.__DATA__\s*=\s*', webpage, 'Page Data', video_id,
end_pattern=r';\n')
video_data = traverse_obj(
page_data, ('children', ..., 'handleTVEAuthRedirection',
(('videoPlaylist', 'videos'), ('videoDetail', {lambda v: [v]}))),
default=[], get_all=False)

if len(video_data) == 1:
return self._entries(video_data[0], video_id)
elif len(video_data) > 1:
return self.playlist_result((self._entries(video, video_id) for video in video_data),
**traverse_obj(page_data, ('children', ..., 'handleTVEAuthRedirection', 'videoPlaylist', {
'id': ('id', {str}),
'title': ('seoInformation', 'title', {str}),
'description': ('seoInformation', 'description', {str_or_none}),
}), get_all=False))
raise ExtractorError('Unable to extract video data')


class SouthParkEsIE(SouthParkIE): # XXX: Do not subclass from concrete IE
Expand Down Expand Up @@ -69,16 +140,36 @@ class SouthParkDeIE(SouthParkIE): # XXX: Do not subclass from concrete IE
'id': 'e99d45ea-ed00-11e0-aca6-0026b9414f30',
'ext': 'mp4',
'title': 'Tooth Fairy Cartman',
'description': 'md5:db02e23818b4dc9cb5f0c5a7e8833a68',
'description': 'md5:11656c34e92f2ab9491e01de43200baa',
'thumbnail': 'https://images.paramount.tech/uri/mgid:arc:imageassetref:shared.southpark.gsa.en:5e4fe2b3-ed07-49ec-9c10-320cb97b7d9a',
'timestamp': 954990360,
'upload_date': '20000406',
'release_date': '20000406',
'season': 'Season 4',
'episode': 'Episode 1',
'season_number': 4,
'episode_number': 1,
'tags': 'count:16',
'duration': 93.26,
},
}, {
# episode
'url': 'https://www.southpark.de/en/episodes/yy0vjs/south-park-the-pandemic-special-season-24-ep-1',
'info_dict': {
'id': 'f5fbd823-04bc-11eb-9b1b-0e40cf2fc285',
'id': '230a4f02-f583-11ea-834d-70df2f866ace',
'ext': 'mp4',
'title': 'South Park',
'title': 'The Pandemic Special',
'description': 'md5:ae0d875eff169dcbed16b21531857ac1',
'thumbnail': 'https://images.paramount.tech/uri/mgid:arc:imageassetref:shared.southpark.gsa.en:31f62b92-5658-4a86-9d47-86b21b4a2abb',
'timestamp': 1601932260,
'upload_date': '20201005',
'release_date': '20201005',
'season': 'Season 24',
'episode': 'Episode 1',
'season_number': 24,
'episode_number': 1,
'genres': ['Comedy'],
'duration': 2724.0,
},
}, {
# clip
Expand All @@ -88,26 +179,39 @@ class SouthParkDeIE(SouthParkIE): # XXX: Do not subclass from concrete IE
'ext': 'mp4',
'title': 'Zahnfee Cartman',
'description': 'md5:b917eec991d388811d911fd1377671ac',
'thumbnail': 'https://images.paramount.tech/uri/mgid:arc:imageassetref:shared.southpark.gsa.de:5e4fe2b3-ed07-49ec-9c10-320cb97b7d9a',
'timestamp': 954990360,
'upload_date': '20000406',
'release_date': '20000406',
'season': 'Season 4',
'episode': 'Episode 1',
'season_number': 4,
'episode_number': 1,
'tags': 'count:16',
'duration': 93.26,
},
}, {
# episode
'url': 'https://www.southpark.de/folgen/242csn/south-park-her-mit-dem-hirn-staffel-1-ep-7',
'url': 'https://www.southpark.de/folgen/scexjh/south-park-ein-fettwanst-in-aethiopien-staffel-1-ep-8',
'info_dict': {
'id': '607115f3-496f-40c3-8647-2b0bcff486c0',
'id': '5fe739ee-ecfd-11e0-aca6-0026b9414f30',
'ext': 'mp4',
'title': 'md5:South Park | Pink Eye | E 0107 | HDSS0107X deu | Version: 634312 | Comedy Central S1',
'title': 'Ein Fettwanst in Äthiopien',
'description': 'md5:f7fd28383b451bf6fc94f10581fc7648',
'thumbnail': 'https://images.paramount.tech/uri/mgid:arc:imageassetref:shared.southpark.gsa.de:bc418f83-7342-11ea-a59c-0a7527021758',
'timestamp': 879915600,
'upload_date': '19971119',
'release_date': '19971119',
'chapters': 'count:4',
'season': 'Season 1',
'episode': 'Episode 8',
'season_number': 1,
'episode_number': 8,
'genres': ['Comedy'],
'duration': 1319.0,
},
}]

def _get_feed_url(self, uri, url=None):
video_id = self._id_from_uri(uri)
config = self._download_json(
f'http://media.mtvnservices.com/pmt/e1/access/index.html?uri={uri}&configtype=edge&ref={url}', video_id)
return self._remove_template_parameter(config['feedWithQueryParams'])

def _get_feed_query(self, uri):
return


class SouthParkLatIE(SouthParkIE): # XXX: Do not subclass from concrete IE
IE_NAME = 'southpark.lat'
Expand Down Expand Up @@ -141,25 +245,15 @@ class SouthParkLatIE(SouthParkIE): # XXX: Do not subclass from concrete IE
},
}]

def _get_feed_url(self, uri, url=None):
video_id = self._id_from_uri(uri)
config = self._download_json(
f'http://media.mtvnservices.com/pmt/e1/access/index.html?uri={uri}&configtype=edge&ref={url}',
video_id)
return self._remove_template_parameter(config['feedWithQueryParams'])

def _get_feed_query(self, uri):
return


class SouthParkNlIE(SouthParkIE): # XXX: Do not subclass from concrete IE
IE_NAME = 'southpark.nl'
_VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.nl/(?:clips|(?:full-)?episodes|collections)/(?P<id>.+?)(\?|#|$))'
_FEED_URL = 'http://www.southpark.nl/feeds/video-player/mrss/'

_TESTS = [{
'url': 'http://www.southpark.nl/full-episodes/s18e06-freemium-isnt-free',
'info_dict': {
'id': '123',
'title': 'Freemium Isn\'t Free',
'description': 'Stan is addicted to the new Terrance and Phillip mobile game.',
},
Expand All @@ -170,7 +264,6 @@ class SouthParkNlIE(SouthParkIE): # XXX: Do not subclass from concrete IE
class SouthParkDkIE(SouthParkIE): # XXX: Do not subclass from concrete IE
IE_NAME = 'southparkstudios.dk'
_VALID_URL = r'https?://(?:www\.)?(?P<url>southparkstudios\.(?:dk|nu)/(?:clips|full-episodes|collections)/(?P<id>.+?)(\?|#|$))'
_FEED_URL = 'http://www.southparkstudios.dk/feeds/video-player/mrss/'

_TESTS = [{
'url': 'http://www.southparkstudios.dk/full-episodes/s18e07-grounded-vindaloop',
Expand All @@ -186,3 +279,52 @@ class SouthParkDkIE(SouthParkIE): # XXX: Do not subclass from concrete IE
'url': 'http://www.southparkstudios.nu/collections/2476/superhero-showdown/1',
'only_matching': True,
}]


class SouthParkSeasonsIE(InfoExtractor):
IE_NAME = 'SouthPark:Seasons'
_VALID_URL = r'(?P<domain>https?://(?:www\.)?southpark(?:\.cc|studios)\.com)/seasons/(?P<series>[^/]+)/?(?P<id>.+?)?(\?|#|$)'

_TESTS = [{
'url': 'https://www.southparkstudios.com/seasons/south-park/lrnlos/season-25',
'info_dict': {
'id': 'f7f6dc67-7d50-11ec-a4f1-70df2f866ace',
'title': 'South Park - Season 25',
},
'playlist_mincount': 6,
}]

def _real_extract(self, url):
video_id, series, domain = self._match_valid_url(url).group('id', 'series', 'domain')
webpage = self._download_webpage(url, video_id)
page_data = self._search_json(r'window\.__DATA__\s*=\s*', webpage, 'Page Data', video_id,
end_pattern=r';\n')
if mgids := traverse_obj(page_data, (
'children', lambda _, v: v['type'] == 'MainContainer', 'children', lambda _, v: v['type'] == 'LineList',
lambda _, v: v['type'] == 'video-guide', 'items', 0, 'meta', {
'season_mgid': ('seasonMgid', {str}),
'series_mgid': ('seriesMgid', {str}),
}), get_all=False):
if mgid := mgids.get('season_mgid') if video_id else mgids.get('series_mgid'):
api_url = domain + '/api/context/' + urllib.parse.quote_plus(mgid) + '/episode/0/10000'
episodes = self._download_json(api_url, video_id)
if items := episodes.get('items'):
return self.playlist_result((
self.url_result(domain + item['url'], **traverse_obj(item, {
'id': ('id', {str}),
'title': ('meta', 'subHeader', {str}),
'description': ('meta', 'description', {str_or_none}),
'thumbnail': ('media', 'image', 'url', {url_or_none}),
'release_date': ('meta', 'date', {lambda v: v.split('/')},
{lambda v: v[2] + v[0] + v[1]}),
'season_number': ('meta', 'header', 'title', 'text', {lambda v: v.split(' • ')[0]},
{lambda v: int(v.strip('S')) if re.match(r'^S\d+$', v) else None}),
'episode_number': ('meta', 'header', 'title', 'text', {lambda v: v.split(' • ')[-1]},
{lambda v: int(v.strip('E')) if re.match(r'^E\d+$', v) else None}),
'duration': ('media', 'duration', {parse_duration}),
})) for item in items),
playlist_id=mgid.split(':')[-1],
playlist_title=join_nonempty(series.replace('-', ' ').title(),
(video_id or '').split('/')[-1].replace('-', ' ').title(), delim=' - '),
)
raise ExtractorError('Unable to extract ' + ('season' if video_id else 'series') + ' data')

0 comments on commit eb850bf

Please sign in to comment.