feat: include metadata in parsers

Parsers now parse metadata as well and include it in the exported data.
QUT-Digital-Observatory · May 16, 2023 · d5d79fe · d5d79fe
1 parent ce0188a
commit d5d79fe
Show file tree

Hide file tree

Showing 6 changed files with 56 additions and 11 deletions.
diff --git a/src/youte/_typing.py b/src/youte/_typing.py
@@ -21,7 +21,7 @@ class SearchResult(TypedDict):
     pageInfo: dict
     items: List[dict]
     related_to_video_id: NotRequired[str]
-    collection_time: datetime.datetime
+    _youte: NotRequired[dict]
 
 
 class StandardResult(TypedDict):
@@ -30,7 +30,7 @@ class StandardResult(TypedDict):
     nextPageToken: str
     pageInfo: dict
     items: List[dict]
-    collection_time: datetime.datetime
+    _youte: NotRequired[dict]
 
 
 class VideoChannelResult(TypedDict):
@@ -41,7 +41,7 @@ class VideoChannelResult(TypedDict):
     prevPageToken: str
     pageInfo: dict
     items: List[dict]
-    collection_time: datetime.datetime
+    _youte: NotRequired[dict]
 
 
 APIResponse = Union[SearchResult, StandardResult, VideoChannelResult]
diff --git a/src/youte/cli.py b/src/youte/cli.py
@@ -6,7 +6,6 @@
 
 import logging
 import sys
-from datetime import datetime
 from json.decoder import JSONDecodeError
 from pathlib import Path
 from typing import IO, Callable, Literal
@@ -120,6 +119,7 @@ def _validate_select_values(ctx, param, value: str) -> str:
 
 def _check_file_overwrite(ctx, param, value: str) -> Path:
     value = Path(value)
+
     if value.exists():
         try:
             if click.confirm(
@@ -841,6 +841,7 @@ def dehydrate(infile: Path, output: IO) -> None:
     help="Name of SQLite database to store output (must have .db ending)",
     type=click.Path(),
     callback=_check_file_overwrite,
+    required=True,
 )
 @default_options
 @click.option(
@@ -998,7 +999,6 @@ def full_archive(
     if you want to archive the replies, both 'thread' and 'reply' will have to be
     specified.
     """
-
     _check_compatibility(select)
 
     api_key = key if key else _get_api_key(name=name)
@@ -1031,6 +1031,8 @@ def full_archive(
         )
     ]
 
+    click.echo(out_db)
+
     searches = parser.parse_searches(results)
 
     engine = database.set_up_database(out_db)

diff --git a/src/youte/collector.py b/src/youte/collector.py
@@ -12,7 +12,7 @@
 from youte._typing import APIResponse, SearchOrder
 from youte.exceptions import APIError, CommentsDisabled, InvalidRequest, MaxQuotaReached
 from youte.utilities import create_utc_datetime_string
-from youte.version import version, user_agent
+from youte.version import user_agent, version
 
 logger = logging.getLogger(__name__)
 

diff --git a/src/youte/common.py b/src/youte/common.py
@@ -15,7 +15,7 @@ def to_json(self, filepath: Path | str, pretty: bool = False) -> None:
         json_array: list = []
         indent: Optional[int] = 4 if pretty else None
         for item in self.items:
-            json_array.append(asdict(item))
+            json_array.append(_flatten_json(asdict(item)))
             with open(filepath, mode="w", encoding="utf-8") as f:
                 f.write(
                     json.dumps(
@@ -25,8 +25,27 @@ def to_json(self, filepath: Path | str, pretty: bool = False) -> None:
 
     def to_csv(self, filepath: Path | str, encoding: str = "utf-8") -> None:
         with open(filepath, "w", newline="", encoding=encoding) as csvfile:
-            fieldnames = asdict(self.items[0]).keys()
+            fieldnames = _flatten_json(asdict(self.items[0])).keys()
             writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
             writer.writeheader()
             for item in self.items:
-                writer.writerow(asdict(item))
+                writer.writerow(_flatten_json(asdict(item)))
+
+
+def _flatten_json(obj: dict[str, Any]) -> dict[str, Any]:
+    """Flatten a nested dict/list structure into a single-level dict.
+
+    Nested keys are joined with "_" and list items are keyed by their
+    position, e.g. ``{"a": {"b": 1}, "c": [2, 3]}`` becomes
+    ``{"a_b": 1, "c_0": 2, "c_1": 3}``.
+
+    Empty dicts and lists contribute no keys, and two paths that join to
+    the same name overwrite each other (the later one wins).
+
+    Args:
+        obj: The mapping to flatten.
+
+    Returns:
+        A new dict mapping each joined path to its leaf value.
+    """
+    out: dict[str, Any] = {}
+
+    def flatten(x: Any, name: str = "") -> None:
+        # isinstance (not an exact type check) so dict/list subclasses are
+        # flattened too instead of being emitted as nested leaf values.
+        if isinstance(x, dict):
+            for key, value in x.items():
+                flatten(value, f"{name}{key}_")
+        elif isinstance(x, list):
+            for index, value in enumerate(x):
+                flatten(value, f"{name}{index}_")
+        else:
+            # Drop the trailing "_" separator added by the parent level.
+            out[name[:-1]] = x
+
+    flatten(obj)
+    return out
diff --git a/src/youte/parser.py b/src/youte/parser.py
@@ -3,7 +3,7 @@
 import html
 import logging
 from datetime import datetime
-from typing import Iterable, Iterator, Optional
+from typing import Callable, Iterable, Iterator, Optional
 
 from pydantic import ValidationError
 
@@ -176,6 +176,10 @@ def _parse_search(input_: SearchResult) -> Iterator[Search]:
         )
 
     items: list[dict] = input_["items"]
+    if "_youte" in input_:
+        meta: dict = input_["_youte"]
+    else:
+        meta: dict = {}
 
     for item in items:
         snippet = item["snippet"]
@@ -197,6 +201,7 @@ def _parse_search(input_: SearchResult) -> Iterator[Search]:
             channel_title=snippet.get("channelTitle"),
             live_broadcast_content=snippet["liveBroadcastContent"],
             channel_id=snippet["channelId"],
+            meta=meta,
         )
         yield search
 
@@ -208,6 +213,10 @@ def _parse_video(input_: VideoChannelResult) -> Iterator[Video]:
         )
 
     items: list[dict] = input_["items"]
+    if "_youte" in input_:
+        meta: dict = input_["_youte"]
+    else:
+        meta: dict = {}
 
     for item in items:
         snippet = item["snippet"]
@@ -272,6 +281,7 @@ def _parse_video(input_: VideoChannelResult) -> Iterator[Video]:
             live_streaming_concurrent_viewers=int(live_stream["concurrentViewers"])
             if "concurrentViewers" in live_stream
             else None,
+            meta=meta,
         )
         yield search
 
@@ -281,6 +291,10 @@ def _parse_channel(input_: VideoChannelResult) -> Iterator[Channel]:
         raise ValueError("Object passed to input is not a channelListResponse")
 
     items: list[dict] = input_["items"]
+    if "_youte" in input_:
+        meta: dict = input_["_youte"]
+    else:
+        meta: dict = {}
 
     for item in items:
         snippet = item["snippet"]
@@ -317,6 +331,7 @@ def _parse_channel(input_: VideoChannelResult) -> Iterator[Channel]:
                 made_for_kids=status.get("madeForKids"),
                 branding_keywords=_list(branding["channel"].get("keywords")),
                 moderated_comments=branding["channel"].get("moderatedComments"),
+                meta=meta,
             )
         except ValidationError:
             print(branding["channel"].get("keywords"))
@@ -330,6 +345,10 @@ def _parse_comment(input_: StandardResult) -> Iterable[Comment]:
         raise ValueError("Object passed to input is not a comment")
 
     items: list[dict] = input_["items"]
+    if "_youte" in input_:
+        meta: dict = input_["_youte"]
+    else:
+        meta: dict = {}
 
     for item in items:
         can_reply: Optional[bool] = None
@@ -364,6 +383,7 @@ def _parse_comment(input_: StandardResult) -> Iterable[Comment]:
             can_reply=can_reply,
             is_public=is_public,
             total_reply_count=total_reply_count,
+            meta=meta,
         )
         yield comment
 

diff --git a/src/youte/resources.py b/src/youte/resources.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from datetime import datetime
-from typing import List, Literal, Optional
+from typing import Any, List, Literal, Optional
 
 from pydantic.dataclasses import dataclass
 
@@ -21,6 +21,7 @@ class Search:
     thumbnail_height: int
     channel_title: str
     live_broadcast_content: str
+    meta: dict[str, Any]
 
 
 @dataclass
@@ -67,6 +68,7 @@ class Video:
     live_streaming_start_scheduled: Optional[datetime]
     live_streaming_end_scheduled: Optional[datetime]
     live_streaming_concurrent_viewers: Optional[int]
+    meta: dict[str, Any]
 
 
 @dataclass
@@ -99,6 +101,7 @@ class Channel:
     made_for_kids: Optional[bool]
     branding_keywords: Optional[List[str]]
     moderated_comments: Optional[bool]
+    meta: dict[str, Any]
 
 
 @dataclass
@@ -125,6 +128,7 @@ class Comment:
     like_count: int
     published_at: datetime
     updated_at: datetime
+    meta: dict[str, Any]
 
 
 @dataclass