[bot] AutoMerging: merge all upstream's changes:

* https://github.com/ytdl-org/youtube-dl: [shahid] fix format extraction (closes ytdl-org#28383); [lbry] add support for channel filters (closes ytdl-org#28385); [bandcamp] Extract release_timestamp; [lbry] Extract release_timestamp (closes ytdl-org#28386); Introduce release_timestamp meta field (refs ytdl-org#28386); [pornhub] Detect flagged videos; [pornhub] Extract formats from get_media end point (ytdl-org#28395)
hellopony · Mar 10, 2021 · 4bcf4ab · 4bcf4ab
2 parents 5bfecbc + fc2c6d5
commit 4bcf4ab
Showing 6 changed files with 103 additions and 40 deletions.
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
@@ -1511,14 +1511,18 @@ def sanitize_numeric_fields(info):
         if 'display_id' not in info_dict and 'id' in info_dict:
             info_dict['display_id'] = info_dict['id']
 
-        if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None:
-            # Working around out-of-range timestamp values (e.g. negative ones on Windows,
-            # see http://bugs.python.org/issue1646728)
-            try:
-                upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp'])
-                info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
-            except (ValueError, OverflowError, OSError):
-                pass
+        for ts_key, date_key in (
+                ('timestamp', 'upload_date'),
+                ('release_timestamp', 'release_date'),
+        ):
+            if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None:
+                # Working around out-of-range timestamp values (e.g. negative ones on Windows,
+                # see http://bugs.python.org/issue1646728)
+                try:
+                    upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key])
+                    info_dict[date_key] = upload_date.strftime('%Y%m%d')
+                except (ValueError, OverflowError, OSError):
+                    pass
 
         # Auto generate title fields corresponding to the *_number fields when missing
         # in order to always have clean titles. This is very common for TV series.

diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py
@@ -49,6 +49,7 @@ class BandcampIE(InfoExtractor):
             'uploader': 'Ben Prunty',
             'timestamp': 1396508491,
             'upload_date': '20140403',
+            'release_timestamp': 1396483200,
             'release_date': '20140403',
             'duration': 260.877,
             'track': 'Lanius (Battle)',
@@ -69,6 +70,7 @@ class BandcampIE(InfoExtractor):
             'uploader': 'Mastodon',
             'timestamp': 1322005399,
             'upload_date': '20111122',
+            'release_timestamp': 1076112000,
             'release_date': '20040207',
             'duration': 120.79,
             'track': 'Hail to Fire',
@@ -197,7 +199,7 @@ def _real_extract(self, url):
             'thumbnail': thumbnail,
             'uploader': artist,
             'timestamp': timestamp,
-            'release_date': unified_strdate(tralbum.get('album_release_date')),
+            'release_timestamp': unified_timestamp(tralbum.get('album_release_date')),
             'duration': duration,
             'track': track,
             'track_number': track_number,

diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
@@ -230,8 +230,10 @@ class InfoExtractor(object):
     uploader:       Full name of the video uploader.
     license:        License name the video is licensed under.
     creator:        The creator of the video.
+    release_timestamp: UNIX timestamp of the moment the video was released.
     release_date:   The date (YYYYMMDD) when the video was released.
-    timestamp:      UNIX timestamp of the moment the video became available.
+    timestamp:      UNIX timestamp of the moment the video became available
+                    (uploaded).
     upload_date:    Video upload date (YYYYMMDD).
                     If not explicitly set, calculated from timestamp.
     uploader_id:    Nickname or id of the video uploader.

diff --git a/youtube_dl/extractor/lbry.py b/youtube_dl/extractor/lbry.py
@@ -6,8 +6,10 @@
 
 from .common import InfoExtractor
 from ..compat import (
+    compat_parse_qs,
     compat_str,
     compat_urllib_parse_unquote,
+    compat_urllib_parse_urlparse,
 )
 from ..utils import (
     determine_ext,
@@ -60,6 +62,7 @@ def _parse_stream(self, stream, url):
             'description': stream_value.get('description'),
             'license': stream_value.get('license'),
             'timestamp': int_or_none(stream.get('timestamp')),
+            'release_timestamp': int_or_none(stream_value.get('release_time')),
             'tags': stream_value.get('tags'),
             'duration': int_or_none(media.get('duration')),
             'channel': try_get(signing_channel, lambda x: x['value']['title']),
@@ -92,6 +95,8 @@ class LBRYIE(LBRYBaseIE):
             'description': 'md5:f6cb5c704b332d37f5119313c2c98f51',
             'timestamp': 1595694354,
             'upload_date': '20200725',
+            'release_timestamp': 1595340697,
+            'release_date': '20200721',
             'width': 1280,
             'height': 720,
         }
@@ -106,6 +111,8 @@ class LBRYIE(LBRYBaseIE):
             'description': 'md5:661ac4f1db09f31728931d7b88807a61',
             'timestamp': 1591312601,
             'upload_date': '20200604',
+            'release_timestamp': 1591312421,
+            'release_date': '20200604',
             'tags': list,
             'duration': 2570,
             'channel': 'The LBRY Foundation',
@@ -181,17 +188,18 @@ class LBRYChannelIE(LBRYBaseIE):
     }]
     _PAGE_SIZE = 50
 
-    def _fetch_page(self, claim_id, url, page):
+    def _fetch_page(self, claim_id, url, params, page):
         page += 1
+        page_params = {
+            'channel_ids': [claim_id],
+            'claim_type': 'stream',
+            'no_totals': True,
+            'page': page,
+            'page_size': self._PAGE_SIZE,
+        }
+        page_params.update(params)
         result = self._call_api_proxy(
-            'claim_search', claim_id, {
-                'channel_ids': [claim_id],
-                'claim_type': 'stream',
-                'no_totals': True,
-                'page': page,
-                'page_size': self._PAGE_SIZE,
-                'stream_types': self._SUPPORTED_STREAM_TYPES,
-            }, 'page %d' % page)
+            'claim_search', claim_id, page_params, 'page %d' % page)
         for item in (result.get('items') or []):
             stream_claim_name = item.get('name')
             stream_claim_id = item.get('claim_id')
@@ -212,8 +220,31 @@ def _real_extract(self, url):
         result = self._resolve_url(
             'lbry://' + display_id, display_id, 'channel')
         claim_id = result['claim_id']
+        qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+        content = qs.get('content', [None])[0]
+        params = {
+            'fee_amount': qs.get('fee_amount', ['>=0'])[0],
+            'order_by': {
+                'new': ['release_time'],
+                'top': ['effective_amount'],
+                'trending': ['trending_group', 'trending_mixed'],
+            }[qs.get('order', ['new'])[0]],
+            'stream_types': [content] if content in ['audio', 'video'] else self._SUPPORTED_STREAM_TYPES,
+        }
+        duration = qs.get('duration', [None])[0]
+        if duration:
+            params['duration'] = {
+                'long': '>=1200',
+                'short': '<=240',
+            }[duration]
+        language = qs.get('language', ['all'])[0]
+        if language != 'all':
+            languages = [language]
+            if language == 'en':
+                languages.append('none')
+            params['any_languages'] = languages
         entries = OnDemandPagedList(
-            functools.partial(self._fetch_page, claim_id, url),
+            functools.partial(self._fetch_page, claim_id, url, params),
             self._PAGE_SIZE)
         result_value = result.get('value') or {}
         return self.playlist_result(

diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py
@@ -167,6 +167,7 @@ class PornHubIE(PornHubBaseIE):
         'params': {
             'skip_download': True,
         },
+        'skip': 'Video has been flagged for verification in accordance with our trust and safety policy',
     }, {
         # subtitles
         'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5af5fef7c2aa7',
@@ -265,7 +266,8 @@ def dl_webpage(platform):
         webpage = dl_webpage('pc')
 
         error_msg = self._html_search_regex(
-            r'(?s)<div[^>]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P<error>.+?)</div>',
+            (r'(?s)<div[^>]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P<error>.+?)</div>',
+             r'(?s)<section[^>]+class=["\']noVideo["\'][^>]*>(?P<error>.+?)</section>'),
             webpage, 'error message', default=None, group='error')
         if error_msg:
             error_msg = re.sub(r'\s+', ' ', error_msg)
@@ -394,6 +396,21 @@ def parse_quality_items(quality_items):
 
         upload_date = None
         formats = []
+
+        def add_format(format_url, height=None):
+            tbr = None
+            mobj = re.search(r'(?P<height>\d+)[pP]?_(?P<tbr>\d+)[kK]', format_url)
+            if mobj:
+                if not height:
+                    height = int(mobj.group('height'))
+                tbr = int(mobj.group('tbr'))
+            formats.append({
+                'url': format_url,
+                'format_id': '%dp' % height if height else None,
+                'height': height,
+                'tbr': tbr,
+            })
+
         for video_url, height in video_urls:
             if not upload_date:
                 upload_date = self._search_regex(
@@ -410,18 +427,19 @@ def parse_quality_items(quality_items):
                     video_url, video_id, 'mp4', entry_protocol='m3u8_native',
                     m3u8_id='hls', fatal=False))
                 continue
-            tbr = None
-            mobj = re.search(r'(?P<height>\d+)[pP]?_(?P<tbr>\d+)[kK]', video_url)
-            if mobj:
-                if not height:
-                    height = int(mobj.group('height'))
-                tbr = int(mobj.group('tbr'))
-            formats.append({
-                'url': video_url,
-                'format_id': '%dp' % height if height else None,
-                'height': height,
-                'tbr': tbr,
-            })
+            if '/video/get_media' in video_url:
+                medias = self._download_json(video_url, video_id, fatal=False)
+                if isinstance(medias, list):
+                    for media in medias:
+                        if not isinstance(media, dict):
+                            continue
+                        video_url = url_or_none(media.get('videoUrl'))
+                        if not video_url:
+                            continue
+                        height = int_or_none(media.get('quality'))
+                        add_format(video_url, height)
+                continue
+            add_format(video_url)
         self._sort_formats(formats)
 
         video_uploader = self._html_search_regex(

diff --git a/youtube_dl/extractor/shahid.py b/youtube_dl/extractor/shahid.py
@@ -51,13 +51,16 @@ class ShahidIE(ShahidBaseIE):
     _NETRC_MACHINE = 'shahid'
     _VALID_URL = r'https?://shahid\.mbc\.net/ar/(?:serie|show|movie)s/[^/]+/(?P<type>episode|clip|movie)-(?P<id>\d+)'
     _TESTS = [{
-        'url': 'https://shahid.mbc.net/ar/shows/%D9%85%D8%AC%D9%84%D8%B3-%D8%A7%D9%84%D8%B4%D8%A8%D8%A7%D8%A8-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D9%83%D9%84%D9%8A%D8%A8-1/clip-275286',
+        'url': 'https://shahid.mbc.net/ar/shows/%D9%85%D8%AA%D8%AD%D9%81-%D8%A7%D9%84%D8%AF%D8%AD%D9%8A%D8%AD-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D9%83%D9%84%D9%8A%D8%A8-1/clip-816924',
         'info_dict': {
-            'id': '275286',
+            'id': '816924',
             'ext': 'mp4',
-            'title': 'مجلس الشباب الموسم 1 كليب 1',
-            'timestamp': 1506988800,
-            'upload_date': '20171003',
+            'title': 'متحف الدحيح الموسم 1 كليب 1',
+            'timestamp': 1602806400,
+            'upload_date': '20201016',
+            'description': 'برومو',
+            'duration': 22,
+            'categories': ['كوميديا'],
         },
         'params': {
             # m3u8 download
@@ -109,12 +112,15 @@ def _real_extract(self, url):
             page_type = 'episode'
 
         playout = self._call_api(
-            'playout/url/' + video_id, video_id)['playout']
+            'playout/new/url/' + video_id, video_id)['playout']
 
         if playout.get('drm'):
             raise ExtractorError('This video is DRM protected.', expected=True)
 
-        formats = self._extract_m3u8_formats(playout['url'], video_id, 'mp4')
+        formats = self._extract_m3u8_formats(re.sub(
+            # https://docs.aws.amazon.com/mediapackage/latest/ug/manifest-filtering.html
+            r'aws\.manifestfilter=[\w:;,-]+&?',
+            '', playout['url']), video_id, 'mp4')
         self._sort_formats(formats)
 
         # video = self._call_api(