Skip to content

Commit

Permalink
feat: include metadata in parsers
Browse files Browse the repository at this point in the history
Parsers now parse metadata as well and include them in exported data.
  • Loading branch information
boyd-nguyen committed May 16, 2023
1 parent ce0188a commit d5d79fe
Show file tree
Hide file tree
Showing 6 changed files with 56 additions and 11 deletions.
6 changes: 3 additions & 3 deletions src/youte/_typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class SearchResult(TypedDict):
pageInfo: dict
items: List[dict]
related_to_video_id: NotRequired[str]
collection_time: datetime.datetime
_youte: NotRequired[dict]


class StandardResult(TypedDict):
Expand All @@ -30,7 +30,7 @@ class StandardResult(TypedDict):
nextPageToken: str
pageInfo: dict
items: List[dict]
collection_time: datetime.datetime
_youte: NotRequired[dict]


class VideoChannelResult(TypedDict):
Expand All @@ -41,7 +41,7 @@ class VideoChannelResult(TypedDict):
prevPageToken: str
pageInfo: dict
items: List[dict]
collection_time: datetime.datetime
_youte: NotRequired[dict]


APIResponse = Union[SearchResult, StandardResult, VideoChannelResult]
6 changes: 4 additions & 2 deletions src/youte/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@

import logging
import sys
from datetime import datetime
from json.decoder import JSONDecodeError
from pathlib import Path
from typing import IO, Callable, Literal
Expand Down Expand Up @@ -120,6 +119,7 @@ def _validate_select_values(ctx, param, value: str) -> str:

def _check_file_overwrite(ctx, param, value: str) -> Path:
value = Path(value)

if value.exists():
try:
if click.confirm(
Expand Down Expand Up @@ -841,6 +841,7 @@ def dehydrate(infile: Path, output: IO) -> None:
help="Name of SQLite database to store output (must have .db ending)",
type=click.Path(),
callback=_check_file_overwrite,
required=True,
)
@default_options
@click.option(
Expand Down Expand Up @@ -998,7 +999,6 @@ def full_archive(
if you want to archive the replies, both 'thread' and 'reply' will have to be
specified.
"""

_check_compatibility(select)

api_key = key if key else _get_api_key(name=name)
Expand Down Expand Up @@ -1031,6 +1031,8 @@ def full_archive(
)
]

click.echo(out_db)

searches = parser.parse_searches(results)

engine = database.set_up_database(out_db)
Expand Down
2 changes: 1 addition & 1 deletion src/youte/collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from youte._typing import APIResponse, SearchOrder
from youte.exceptions import APIError, CommentsDisabled, InvalidRequest, MaxQuotaReached
from youte.utilities import create_utc_datetime_string
from youte.version import version, user_agent
from youte.version import user_agent, version

logger = logging.getLogger(__name__)

Expand Down
25 changes: 22 additions & 3 deletions src/youte/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def to_json(self, filepath: Path | str, pretty: bool = False) -> None:
json_array: list = []
indent: Optional[int] = 4 if pretty else None
for item in self.items:
json_array.append(asdict(item))
json_array.append(_flatten_json(asdict(item)))
with open(filepath, mode="w", encoding="utf-8") as f:
f.write(
json.dumps(
Expand All @@ -25,8 +25,27 @@ def to_json(self, filepath: Path | str, pretty: bool = False) -> None:

def to_csv(self, filepath: Path | str, encoding: str = "utf-8") -> None:
with open(filepath, "w", newline="", encoding=encoding) as csvfile:
fieldnames = asdict(self.items[0]).keys()
fieldnames = _flatten_json(asdict(self.items[0])).keys()
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
for item in self.items:
writer.writerow(asdict(item))
writer.writerow(_flatten_json(asdict(item)))


def _flatten_json(obj: dict[str, Any]) -> dict[str, Any]:
    """Flatten a nested dict/list structure into a single-level dict.

    The key of every leaf value is the path leading to it, joined with
    underscores; list elements contribute their positional index. For
    example ``{"a": {"b": 1}, "c": [2, 3]}`` becomes
    ``{"a_b": 1, "c_0": 2, "c_1": 3}``.

    Empty dicts and empty lists contain no leaves and therefore produce no
    keys in the result. If two different paths join to the same key, the
    one visited later overwrites the earlier one.

    Args:
        obj: The mapping to flatten.

    Returns:
        A new dict mapping joined key paths to leaf values. ``obj`` itself
        is not modified.
    """
    out: dict[str, Any] = {}

    def flatten(node: Any, prefix: str = "") -> None:
        # isinstance rather than an exact type comparison, so dict/list
        # subclasses are flattened too instead of being stored as leaves.
        if isinstance(node, dict):
            for key, value in node.items():
                flatten(value, f"{prefix}{key}_")
        elif isinstance(node, list):
            for index, value in enumerate(node):
                flatten(value, f"{prefix}{index}_")
        else:
            # Every path segment appends "_", so strip the trailing one.
            out[prefix[:-1]] = node

    flatten(obj)
    return out
22 changes: 21 additions & 1 deletion src/youte/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import html
import logging
from datetime import datetime
from typing import Iterable, Iterator, Optional
from typing import Callable, Iterable, Iterator, Optional

from pydantic import ValidationError

Expand Down Expand Up @@ -176,6 +176,10 @@ def _parse_search(input_: SearchResult) -> Iterator[Search]:
)

items: list[dict] = input_["items"]
if "_youte" in input_:
meta: dict = input_["_youte"]
else:
meta: dict = {}

for item in items:
snippet = item["snippet"]
Expand All @@ -197,6 +201,7 @@ def _parse_search(input_: SearchResult) -> Iterator[Search]:
channel_title=snippet.get("channelTitle"),
live_broadcast_content=snippet["liveBroadcastContent"],
channel_id=snippet["channelId"],
meta=meta,
)
yield search

Expand All @@ -208,6 +213,10 @@ def _parse_video(input_: VideoChannelResult) -> Iterator[Video]:
)

items: list[dict] = input_["items"]
if "_youte" in input_:
meta: dict = input_["_youte"]
else:
meta: dict = {}

for item in items:
snippet = item["snippet"]
Expand Down Expand Up @@ -272,6 +281,7 @@ def _parse_video(input_: VideoChannelResult) -> Iterator[Video]:
live_streaming_concurrent_viewers=int(live_stream["concurrentViewers"])
if "concurrentViewers" in live_stream
else None,
meta=meta,
)
yield search

Expand All @@ -281,6 +291,10 @@ def _parse_channel(input_: VideoChannelResult) -> Iterator[Channel]:
raise ValueError("Object passed to input is not a channelListResponse")

items: list[dict] = input_["items"]
if "_youte" in input_:
meta: dict = input_["_youte"]
else:
meta: dict = {}

for item in items:
snippet = item["snippet"]
Expand Down Expand Up @@ -317,6 +331,7 @@ def _parse_channel(input_: VideoChannelResult) -> Iterator[Channel]:
made_for_kids=status.get("madeForKids"),
branding_keywords=_list(branding["channel"].get("keywords")),
moderated_comments=branding["channel"].get("moderatedComments"),
meta=meta,
)
except ValidationError:
print(branding["channel"].get("keywords"))
Expand All @@ -330,6 +345,10 @@ def _parse_comment(input_: StandardResult) -> Iterable[Comment]:
raise ValueError("Object passed to input is not a comment")

items: list[dict] = input_["items"]
if "_youte" in input_:
meta: dict = input_["_youte"]
else:
meta: dict = {}

for item in items:
can_reply: Optional[bool] = None
Expand Down Expand Up @@ -364,6 +383,7 @@ def _parse_comment(input_: StandardResult) -> Iterable[Comment]:
can_reply=can_reply,
is_public=is_public,
total_reply_count=total_reply_count,
meta=meta,
)
yield comment

Expand Down
6 changes: 5 additions & 1 deletion src/youte/resources.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations

from datetime import datetime
from typing import List, Literal, Optional
from typing import Any, List, Literal, Optional

from pydantic.dataclasses import dataclass

Expand All @@ -21,6 +21,7 @@ class Search:
thumbnail_height: int
channel_title: str
live_broadcast_content: str
meta: dict[str, Any]


@dataclass
Expand Down Expand Up @@ -67,6 +68,7 @@ class Video:
live_streaming_start_scheduled: Optional[datetime]
live_streaming_end_scheduled: Optional[datetime]
live_streaming_concurrent_viewers: Optional[int]
meta: dict[str, Any]


@dataclass
Expand Down Expand Up @@ -99,6 +101,7 @@ class Channel:
made_for_kids: Optional[bool]
branding_keywords: Optional[List[str]]
moderated_comments: Optional[bool]
meta: dict[str, Any]


@dataclass
Expand All @@ -125,6 +128,7 @@ class Comment:
like_count: int
published_at: datetime
updated_at: datetime
meta: dict[str, Any]


@dataclass
Expand Down

0 comments on commit d5d79fe

Please sign in to comment.