# Static (progressive download) renditions, best first. Streaming types such
# as "hls" / "dash" are intentionally not listed: they need a separate
# download-and-merge step.
#
# "tiny" is placed last: the CDN URLs encode it as `type=4`, while "lowest",
# "low" and "medium" are `type=0`, `type=1` and `type=2`. In OK.ru's numbering
# type 4 is the smallest (mobile) rendition, so it must never win over
# "low" / "lowest".
VIDEO_QUALITY_ORDER: Sequence[str] = (
    "ultra_hd",
    "quad_hd",
    "full_hd",
    "high",
    "medium",
    "low",
    "lowest",
    "tiny",
)
Models @@ -182,6 +196,169 @@ class PostDataFile(TypedDict): url: str # NOTE: url is https://cdn.boosty.to/file/00000000-0000-0000-0000-0000000000000 need migrate to https://cdn.boosty.to/file/00000000-0000-0000-0000-0000000000000?user_id=0000000&content_id=00000000-0000-0000-0000-0000000000000&expire_time=0000000000&sign=0000000000000000000000000000000000000000000000000000000000000000&is_migrated=true +type PostDataVideoURLsType = Literal[ + "live_cmaf", + "live_playback_dash", + "ultra_hd", + "live_playback_hls", + "hls", + "live_ondemand_hls", + "lowest", + "tiny", + "high", + "medium", + "full_hd", + "live_dash", + "dash_uni", + "low", + "live_hls", + "quad_hd", + "dash", +] + + +class PostDataVideoURLs(TypedDict): + """ + NOTE: seems like when video does not have specified type (quality preset), url = "" + + Regular static .mp4 ~720p upload have this types: ["hls", "lowest", "tiny', "medium", "low", "dash"] + + URL look like this: + https://vd123.okcdn.ru/video.m3u8?cmd=videoPlayerCdn&expires=0000000000000&srcIp=1.2.3.4&pr=12&srcAg=UNKNOWN&ms=1.2.3.4&type=2&sig=00000000000&ct=8&urls=1.2.3.4&clientType=18&id=0000000000000 + https://vd123.okcdn.ru/?expires=0000000000000&srcIp=1.2.3.4&pr=42&srcAg=UNKNOWN&ms=1.2.3.4&type=0&sig=00000000000&ct=0&urls=1.2.3.4&clientType=18&id=0000000000000 + + Where: + * expires - unix timestamp + * srcIp, ms, urls - 3 different IPs + * pr - seems always = 42 + * ct - seems always = 0 for static types and 8 for hls, 6 for dash + * clientType - seems always = 18 + * srcAg - seems always = UNKNOWN + * type - is integer representation of text type + * hls = 2 + * lowest = 0 + * tiny = 4 + * medium = 2 + * low = 1 + * dash = 1 + + Approximate priority: + ultra_hd > quad_hd > full_hd > high > medium > tiny > low > lowest + hls and dash seems to use highest quality + """ + + type: PostDataVideoURLsType + url: str + + +class PostDataVideo(TypedDict): + """ + Example JSON: + + ```json + { + "height":480, + "showViewsCounter":true, + 
"viewsCounter":0, + "timeCode":5, + "complete":true, + "uploadStatus": "ok", + "playerUrls": [ + {"type": "live_cmaf", "url": ""}, + {"url": "", "type": "live_playback_dash"}, + {"type": "ultra_hd", "url": ""}, + {"url": "", "type": "live_playback_hls"}, + {"type": "hls", "url": "https://vd123.okcdn.ru/video.m3u8?cmd=videoPlayerCdn&expires=0000000000000&srcIp=1.2.3.4&pr=42&srcAg=UNKNOWN&ms=1.2.3.4&type=2&sig=00000000000&ct=8&urls=1.2.3.4&clientType=18&id=0000000000000"}, + {"type": "live_ondemand_hls", "url": ""}, + {"url": "https://vd123.okcdn.ru/?expires=0000000000000&srcIp=1.2.3.4&pr=42&srcAg=UNKNOWN&ms=1.2.3.4&type=0&sig=00000000000&ct=0&urls=1.2.3.4&clientType=18&id=0000000000000", "type": "lowest"}, + {"url": "https://vd123.okcdn.ru/?expires=0000000000000&srcIp=1.2.3.4&pr=42&srcAg=UNKNOWN&ms=1.2.3.4&type=4&sig=00000000000&ct=0&urls=1.2.3.4&clientType=18&id=0000000000000", "type": "tiny"}, + {"type": "high", "url": ""}, + {"type": "medium", "url": "https://vd123.okcdn.ru/?expires=0000000000000&srcIp=1.2.3.4&pr=42&srcAg=UNKNOWN&ms=1.2.3.4&type=2&sig=00000000000&ct=0&urls=1.2.3.4&clientType=18&id=0000000000000"}, + {"url": "", "type": "full_hd"}, + {"type": "live_dash", "url": ""}, + {"type": "dash_uni", "url": ""}, + {"type": "low", "url": "https://vd123.okcdn.ru/?expires=0000000000000&srcIp=1.2.3.4&pr=42&srcAg=UNKNOWN&ms=1.2.3.4&type=1&sig=00000000000&ct=0&urls=1.2.3.4&clientType=18&id=0000000000000"}, + {"type": "live_hls", "url": ""}, + {"type": "quad_hd", "url": ""}, + {"url": "https://vd123.okcdn.ru/?expires=0000000000000&srcIp=1.2.3.4&pr=42&srcAg=UNKNOWN&ms=1.2.3.4&type=1&sig=00000000000&ct=6&urls=1.2.3.4&clientType=18&id=0000000000000", "type": "dash"} + ], + "duration":34, + "vid": "0000000000000", + "width": 754, + "failoverHost": "vd234.okcdn.ru", + "preview": "https://i.okcdn.ru/videoPreview?id=0000000000000&type=39&idx=13&tkn=0000000000000000-0000000000", + "defaultPreview": 
"https://i.okcdn.ru/videoPreview?id=0000000000000&type=39&idx=13&tkn=0000000000000000-0000000000", + "url": "", + "title": "Title", + "type": "ok_video", + "id": "00000000-0000-0000-0000-000000000000" + } + ``` + """ # noqa: E501 + + type: Literal["ok_video"] + complete: bool + id: str + height: int + showViewsCounter: bool + viewsCounter: NotRequired[int] + timeCode: int + uploadStatus: Literal["ok"] | str # noqa: PYI051 # TODO: other statuses? + playerUrls: Sequence[PostDataVideoURLs] + duration: NotRequired[int] + vid: str + width: int + failoverHost: NotRequired[str] + preview: str + defaultPreview: str + url: NotRequired[str] + title: str + + +class PostDataAudio(TypedDict): + """ + Example JSON: + + ```json + { + "uploadStatus": "", + "timeCode": 0, + "complete": True, + "size": 7526176, + "viewsCounter": 0, + "showViewsCounter": True, + "track": "Day and Night", + "duration": 188, + "url": "https://cdn.boosty.to/audio/b46f05b1-c6d1-4a10-8791-6a65077aa1c2", + "album": "Shiki Original Soundtrack Mini Album ''Rouge''", + "title": "Day and Night.mp3", + "artist": "Yasuharu Takanashi", + "id": "b46f05b1-c6d1-4a10-8791-6a65077aa1c2", + "isMigrated": True, + "fileType": "MP3", + "type": "audio_file" + } + ``` + """ + + type: Literal["audio_file"] + complete: bool + id: str + isMigrated: bool + uploadStatus: str + timeCode: int + size: int + viewsCounter: NotRequired[int] + showViewsCounter: bool + track: NotRequired[str] # ? from file metadata + duration: int + url: str + album: NotRequired[str] # ? from file metadata + title: str # ? filename + artist: NotRequired[str] # ? from file metadata + fileType: Literal["MP3"] | str # noqa: PYI051 # TODO: other types? 
# ? From this attempt on, retries go to the failover host (when one is known)
_FAILOVER_AFTER_RETRIES = 5
# ? Hard cap on server-error retries, so a permanently broken URL cannot loop forever
_MAX_SERVER_ERROR_RETRIES = 10
_RETRY_DELAY_SECONDS = 5.0


def best_video(
    urls: dict[PostDataVideoURLsType, str],
    /,
    *,
    order: Sequence[str] | None = None,
) -> tuple[str, str] | tuple[None, None]:
    """
    Return the `(quality, url)` pair of the best available video rendition.

    Qualities are tried in `order` (defaults to `VIDEO_QUALITY_ORDER`), best first.
    Entries whose URL is empty are skipped, because the API reports renditions
    that do not exist with `url = ""`.

    Returns `(None, None)` when none of the known qualities has a URL.
    """

    for quality in VIDEO_QUALITY_ORDER if order is None else order:
        url = urls.get(quality)
        if url:
            return (quality, url)

    return (None, None)


def _failover_url(url: str, failover_host: str) -> str:
    """Return `url` with the host of its `https://` origin replaced by `failover_host`."""

    # ? callable replacement: the host is inserted verbatim, never parsed as a regex template
    return re.sub(r"(?<=https:\/\/)[\w.\-]+(?=/)", lambda _: failover_host, url, count=1)


def _entry_archived(db_conn: sqlite3.Connection, entry: str) -> bool:
    """Return True when `entry` is already recorded in the archive DB (False on any DB error)."""

    # NOTE(review): `entry` is interpolated into the SQL text. CHECK_ENTRY is defined
    # outside of this block, so the template is kept; switch to a bound parameter there.
    with db_conn as cur, suppress(ValueError, sqlite3.Error):
        [[check]] = cur.execute(CHECK_ENTRY.format(entry=entry))
        return bool(check)

    return False


def _record_entry(db_conn: sqlite3.Connection, entry: str) -> None:
    """Record `entry` as downloaded in the archive DB; DB errors are ignored (best effort)."""

    with db_conn as cur, suppress(sqlite3.Error):
        cur.execute(INSERT_ENTRY.format(entry=entry))
        cur.commit()


def _save_stream(
    stream: httpx.Response,
    first_chunk: bytes,
    chunks: Iterator[bytes],
    path: Path,
    ctx: ProgressContext,
) -> None:
    """
    Write `first_chunk` followed by `chunks` to `path`, driving the download progress bar.

    Data goes to a sibling `*.part` file that is renamed onto `path` only after the
    whole body was received, so an interrupted download never looks like a finished one.
    The progress task is always reset, even when the transfer fails.
    """

    # ? Content-Length is absent for chunked responses
    content_length = stream.headers.get("Content-Length")
    total = int(content_length) if content_length and content_length.isdigit() else None
    part_path = path.with_name(f"{path.name}.part")

    ctx.progress_download.start_task(ctx.download)
    ctx.progress_download.update(ctx.download, total=total, visible=True)
    try:
        with part_path.open("wb") as f:
            if first_chunk:
                f.write(first_chunk)
            for chunk in chunks:
                f.write(chunk)
                ctx.progress_download.update(ctx.download, completed=stream.num_bytes_downloaded)
        part_path.replace(path)
    except BaseException:
        # ? do not leave a truncated file behind
        part_path.unlink(missing_ok=True)
        raise
    finally:
        ctx.progress_download.update(ctx.download, completed=0, visible=False)
        ctx.progress_download.stop_task(ctx.download)


def handle_video(
    *,
    client: httpx.Client,
    headers: dict[str, str],
    int_id: int,
    title: str,
    incremental_id: int,
    urls: dict[PostDataVideoURLsType, str],
    failover_host: str | None = None,
    filename: str,
    user: str,
    output_dir: Path,
    width: int | None = None,
    height: int | None = None,
    duration: int | None = None,
    ctx: ProgressContext,
    force_redownload: bool = False,
    db_conn: sqlite3.Connection | None = None,
    retry: int = 0,
) -> None:
    """
    Handle video downloads. Extension is not known, so it has to be guessed with magic

    The best static rendition from `urls` is streamed to
    `{int_id}_{title}_{incremental_id}_{filename}.{quality}.{extension}` inside `output_dir`.

    * `width` / `height` of None mean the video is gone from the CDN: nothing is downloaded
    * with `db_conn`, an already recorded entry is skipped (unless `force_redownload`);
      without it, an existing non-empty file is skipped instead
    * server errors (5xx) are retried every 5 seconds, switching to `failover_host` after
      `_FAILOVER_AFTER_RETRIES` attempts and giving up after `_MAX_SERVER_ERROR_RETRIES`
    * `retry` is the number of attempts already made (kept for backward compatibility)
    * other HTTP errors and httpx transport errors are reported and end the download
    """

    if width is None or height is None:  # or duration is None:
        ctx.progress.print("[yellow]Skipping downloading deleted video from CDN:[/yellow]", urls)
        return

    entry = f"boosty_{user}_{int_id}_{incremental_id}"
    summary = f"{width}x{height} {duration or 0}s"

    if not force_redownload and db_conn and _entry_archived(db_conn, entry):
        ctx.progress.print(f"[yellow]Skipping downloaded video ({summary}):[/yellow]", urls, "(DB)")
        return

    quality, url = best_video(urls)
    if not url or not quality:
        ctx.progress.print("[red]Not found supported video quality[/red]", urls)
        return

    attempt = retry
    while True:
        attempt_url = url
        # ? retry using failover host
        if attempt >= _FAILOVER_AFTER_RETRIES and failover_host:
            attempt_url = _failover_url(url, failover_host)
            ctx.progress.print("Trying downloading video using failover host:", failover_host)

        server_error: int | None = None
        try:
            with client.stream("GET", attempt_url, headers=headers, timeout=60.0) as stream:
                if stream.is_server_error:
                    # ? retry below, after this response has been closed
                    server_error = stream.status_code
                elif not stream.is_success:
                    rich.inspect(stream, title="Downloading video error")
                    return
                else:
                    chunks: Iterator[bytes] = stream.iter_bytes(chunk_size=16_384)

                    # ? the first chunk is enough for magic; the default covers an empty body
                    first_chunk: bytes = next(chunks, b"")
                    mime_type: str = magic.from_buffer(first_chunk, mime=True)
                    extension: str = MIME_TO_EXTENSION.get(mime_type, "mp4")

                    path = output_dir / f"{int_id}_{title}_{incremental_id}_{filename}.{quality}.{extension}"

                    if not force_redownload and not db_conn and path.exists() and path.stat().st_size > 0:
                        ctx.progress.print(f"[yellow]Skipping downloaded video ({summary}):[/yellow]", attempt_url)
                        return

                    ctx.progress.print(f"[green]Downloading ({summary}):[/green]", attempt_url)
                    _save_stream(stream, first_chunk, chunks, path, ctx)
        except httpx.TimeoutException:
            ctx.progress.print(f"[red italic]Timeout exception: {attempt_url}[/red italic]")
            return
        except httpx.NetworkError as e:
            rich.inspect(e, title=f"Network error: {attempt_url}")
            return
        except httpx.ProtocolError as e:
            rich.inspect(e, title=f"Protocol error: {attempt_url}")
            return
        except httpx.StreamError as e:
            rich.inspect(e, title=f"Streaming video error: {attempt_url}")
            return

        if server_error is None:
            break

        attempt += 1
        if attempt >= _MAX_SERVER_ERROR_RETRIES:
            ctx.progress.print(f"[red]Giving up after {attempt} failed attempts:[/red]", attempt_url)
            return

        ctx.progress.print("Get server error:", server_error, "Retrying after 5 seconds...")
        sleep(_RETRY_DELAY_SECONDS)

    if db_conn:
        _record_entry(db_conn, entry)


def handle_audio(
    *,
    client: httpx.Client,
    headers: dict[str, str],
    int_id: int,
    title: str,
    incremental_id: int,
    url: str,
    filename: str,
    filename_fallback: str,
    file_type: str | None = None,
    size: int | None = None,
    is_migrated: bool,
    user: str,
    output_dir: Path,
    signed_query: str,
    ctx: ProgressContext,
    force_redownload: bool = False,
    db_conn: sqlite3.Connection | None = None,
) -> None:
    """
    Handle audio downloads. Extension is within a title, however if not, there is file_type in CAPS from API.

    The file is streamed to `{int_id}_{title}_{incremental_id}_{filename}` inside `output_dir`.

    * `size` of None means the audio is gone from the CDN: nothing is downloaded
    * an empty `filename` falls back to `{filename_fallback}.{file_type}` (lower-cased, "mp3" by default)
    * with `db_conn`, an already recorded entry is skipped (unless `force_redownload`);
      without it, an existing file of exactly `size` bytes is skipped instead
    * server errors (5xx) are retried every 5 seconds, up to `_MAX_SERVER_ERROR_RETRIES` attempts
    * other HTTP errors and httpx transport errors are reported and end the download
    """

    if size is None:
        ctx.progress.print("[yellow]Skipping downloading deleted audio from CDN:[/yellow]", url)
        return

    if not filename:
        extension = (file_type or "mp3").lower()
        filename = f"{filename_fallback}.{extension}"
    # ? the title comes from the API: a path separator in it must not create sub-paths
    filename = re.sub(r"[\\/]", "_", filename)

    final_url = f"{url}{signed_query}&is_migrated={str(is_migrated).lower()}"
    path = output_dir / f"{int_id}_{title}_{incremental_id}_{filename}"
    entry = f"boosty_{user}_{int_id}_{incremental_id}"

    if not force_redownload and db_conn and _entry_archived(db_conn, entry):
        ctx.progress.print(f"[yellow]Skipping downloaded audio ({size:_} B):[/yellow]", url, "(DB)")
        return

    # ? same no-DB shortcut as for images: a file of the expected size is already complete
    if not force_redownload and not db_conn and path.exists() and path.stat().st_size == size:
        ctx.progress.print(f"[yellow]Skipping downloaded audio ({size:_} B):[/yellow]", url)
        return

    attempt = 0
    while True:
        server_error: int | None = None
        try:
            with client.stream("GET", final_url, headers=headers, timeout=60.0) as stream:
                if stream.is_server_error:
                    # ? retry below, after this response has been closed
                    server_error = stream.status_code
                elif not stream.is_success:
                    rich.inspect(stream, title="Downloading audio error")
                    # rich.inspect(stream.request)
                    return
                else:
                    ctx.progress.print(f"[green]Downloading ({size:_} B):[/green]", final_url)
                    _save_stream(stream, b"", stream.iter_bytes(), path, ctx)
        except httpx.TimeoutException:
            ctx.progress.print(f"[red italic]Timeout exception: {final_url}[/red italic]")
            return
        except httpx.NetworkError as e:
            rich.inspect(e, title=f"Network error: {final_url}")
            return
        except httpx.ProtocolError as e:
            rich.inspect(e, title=f"Protocol error: {final_url}")
            return
        except httpx.StreamError as e:
            rich.inspect(e, title=f"Streaming audio error: {final_url}")
            return

        if server_error is None:
            break

        attempt += 1
        if attempt >= _MAX_SERVER_ERROR_RETRIES:
            ctx.progress.print(f"[red]Giving up after {attempt} failed attempts:[/red]", final_url)
            return

        ctx.progress.print("Get server error:", server_error, "Retrying after 5 seconds...")
        sleep(_RETRY_DELAY_SECONDS)

    if db_conn:
        _record_entry(db_conn, entry)
signed_query=signed_query, - db_conn=conn, + db_conn=conn if use_db else None, ctx=ctx, ) @@ -763,7 +1223,7 @@ def archive_user( force_redownload=force_redownload, all_links=all_links, signed_query=signed_query, - db_conn=conn, + db_conn=conn if use_db else None, ctx=ctx, )