Skip to content

Commit

Permalink
web-add: improve audio metadata extraction
Browse files Browse the repository at this point in the history
  • Loading branch information
chapmanjacobd committed Jan 23, 2025
1 parent 1b17260 commit fe0230e
Show file tree
Hide file tree
Showing 7 changed files with 95 additions and 73 deletions.
2 changes: 1 addition & 1 deletion SECURITY.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Security Policy
# Coordinated Disclosure Form

## Reporting a Vulnerability

Expand Down
115 changes: 58 additions & 57 deletions library/createdb/av.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
import math
from datetime import datetime

from library.createdb import subtitle
from library.mediafiles import media_check
from library.utils import consts, file_utils, iterables, nums, objects, path_utils, processes, strings
from library.utils import consts, date_utils, file_utils, iterables, nums, objects, path_utils, processes, strings
from library.utils.consts import DBType
from library.utils.log_utils import log

Expand Down Expand Up @@ -37,52 +36,64 @@ def get_subtitle_tags(path, streams, scan_subtitles=False) -> dict:


def parse_tags(mu: dict, ti: dict) -> dict:
mu = objects.dumbcopy(mu)
ti = objects.dumbcopy(ti)

tags = {
"mood": strings.combine(
mu.get("albummood"),
mu.get("MusicMatch_Situation"),
mu.get("Songs-DB_Occasion"),
mu.get("albumgrouping"),
mu.pop("albummood", None),
mu.pop("MusicMatch_Situation", None),
mu.pop("Songs-DB_Occasion", None),
mu.pop("albumgrouping", None),
),
"genre": strings.combine(mu.pop("genre", None), ti.pop("genre", None), mu.pop("albumgenre", None)),
"time_created": date_utils.specific_date(
mu.pop("originalyear", None),
mu.pop("TDOR", None),
mu.pop("TORY", None),
mu.pop("date", None),
mu.pop("TDRC", None),
mu.pop("TDRL", None),
ti.pop("year", None),
),
"bpm": nums.safe_int(
iterables.safe_unpack(mu.pop("fBPM", None), mu.pop("bpm", None), mu.pop("bpm_start", None))
),
"genre": strings.combine(mu.get("genre"), ti.get("genre"), mu.get("albumgenre")),
"year": strings.combine(
mu.get("originalyear"),
mu.get("TDOR"),
mu.get("TORY"),
mu.get("date"),
mu.get("TDRC"),
mu.get("TDRL"),
ti.get("year"),
"key": iterables.safe_unpack(
mu.pop("TIT1", None), mu.pop("key", None), mu.pop("TKEY", None), mu.pop("key_start", None)
),
"bpm": nums.safe_int(iterables.safe_unpack(mu.get("fBPM"), mu.get("bpm"), mu.get("bpm_start"))),
"key": iterables.safe_unpack(mu.get("TIT1"), mu.get("key"), mu.get("TKEY"), mu.get("key_start")),
"decade": iterables.safe_unpack(mu.get("Songs-DB_Custom1")),
"categories": iterables.safe_unpack(mu.get("Songs-DB_Custom2")),
"city": iterables.safe_unpack(mu.get("Songs-DB_Custom3")),
"decade": iterables.safe_unpack(mu.pop("Songs-DB_Custom1", None)),
"categories": iterables.safe_unpack(mu.pop("Songs-DB_Custom2", None)),
"city": iterables.safe_unpack(mu.pop("Songs-DB_Custom3", None)),
"country": strings.combine(
mu.get("Songs-DB_Custom4"),
mu.get("MusicBrainz Album Release Country"),
mu.get("musicbrainz album release country"),
mu.get("language"),
mu.pop("Songs-DB_Custom4", None),
mu.pop("MusicBrainz Album Release Country", None),
mu.pop("musicbrainz album release country", None),
mu.pop("language", None),
),
"description": strings.combine(
mu.get("description"),
mu.get("lyrics"),
ti.get("comment"),
mu.pop("description", None),
mu.pop("synopsis", None),
mu.pop("lyrics", None),
mu.pop("publisher", None),
mu.pop("comment", None),
ti.pop("comment", None),
),
"album": iterables.safe_unpack(ti.get("album"), mu.get("album")),
"title": iterables.safe_unpack(ti.get("title"), mu.get("title")),
"album": iterables.safe_unpack(ti.pop("album", None), mu.pop("album", None)),
"title": iterables.safe_unpack(ti.pop("title", None), mu.pop("title", None)),
"artist": strings.combine(
ti.get("artist"),
mu.get("artist"),
mu.get("artists"),
ti.get("albumartist"),
ti.get("composer"),
ti.pop("artist", None),
mu.pop("artist", None),
mu.pop("artists", None),
ti.pop("albumartist", None),
ti.pop("composer", None),
),
}

# print(mutagen)
# breakpoint()
mu = {k: v for k, v in mu.items() if not (k in consts.MEDIA_KNOWN_KEYS or v is None)}
if mu != {}:
log.debug("Extra av data %s", mu)
# breakpoint()

return tags

Expand Down Expand Up @@ -126,11 +137,15 @@ def munge_av_tags(args, media) -> dict:
raise e
elif e.errno == 5: # IO Error
raise e
raise e
raise
except processes.UnplayableFile as e:
log.error(f"Failed reading header. {path}")
log.debug(e)
if getattr(args, "delete_unplayable", False) and not file_utils.is_file_open(path):
if (
getattr(args, "delete_unplayable", False)
and not path.startswith("http")
and not file_utils.is_file_open(path)
):
file_utils.trash(args, path, detach=False)
media["time_deleted"] = consts.APPLICATION_START
media["error"] = "Metadata check failed"
Expand Down Expand Up @@ -193,26 +208,11 @@ def munge_av_tags(args, media) -> dict:

tags = format_.pop("tags", None)
if tags:
upload_date = tags.get("DATE")
upload_time = None
if upload_date:
try:
upload_time = nums.to_timestamp(datetime.strptime(upload_date, "%Y%m%d"))
except Exception:
upload_time = None

tags = objects.dict_filter_bool(
{
"title": tags.get("title"),
"webpath": tags.get("PURL"),
"description": strings.combine(
tags.get("DESCRIPTION"),
tags.get("SYNOPSIS"),
tags.get("ARTIST"),
tags.get("COMMENT"),
tags.get("comment"),
),
"time_uploaded": upload_time,
"title": tags.pop("title", None),
"webpath": tags.pop("PURL", None),
**{k: v for k, v in parse_tags(tags, tags).items() if v},
},
)

Expand Down Expand Up @@ -275,8 +275,9 @@ def munge_av_tags(args, media) -> dict:
if objects.is_profile(args, DBType.video):
video_tags = get_subtitle_tags(path, streams, scan_subtitles=getattr(args, "scan_subtitles", False))
media = {**media, **video_tags}
elif objects.is_profile(args, DBType.audio):
elif objects.is_profile(args, DBType.audio) and not str(path).startswith("http"):
stream_tags = get_audio_tags(path)
stream_tags = {k: v for k, v in stream_tags.items() if v}
media = {**media, **stream_tags}

return media
30 changes: 19 additions & 11 deletions library/createdb/web_add.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def parse_args(action, **kwargs):
arggroups.selenium_post(args)

if not args.profiles:
if args.size:
if args.sizes:
args.profiles = [DBType.filesystem]
else:
args.profiles = []
Expand Down Expand Up @@ -206,31 +206,39 @@ def spider(args, paths: list):
media = [consolidate_media(args, k) | (v or {}) for k, v in new_paths.items()]
new_media_count += len(media)

# get basic metadata
if DBType.filesystem in args.profiles or args.hash:
with concurrent.futures.ThreadPoolExecutor(max_workers=args.threads) as executor:
enriched_media = []
with concurrent.futures.ThreadPoolExecutor(
max_workers=1 if args.verbose >= consts.LOG_DEBUG else args.threads
) as executor:
gen_media = (f.result() for f in [executor.submit(add_basic_metadata, args, m) for m in media])
for i, m in enumerate(gen_media):
media[i] = m
enriched_media.append(m)
printing.print_overwrite(
f"Pages to scan {len(paths)} link scan: {new_media_count} new [{len(known_paths)} known]; basic metadata {i + 1} of {len(media)}"
)
media = enriched_media
if media:
add_media(args, media)

# get extra_metadata
if args.sizes:
extra_metadata = [d for d in media if d.get("size") is None or args.sizes(d["size"])]
else:
extra_metadata = media
media = [d for d in media if d.get("size") is None or args.sizes(d["size"])]

with concurrent.futures.ThreadPoolExecutor(max_workers=args.threads) as executor:
gen_media = (f.result() for f in [executor.submit(add_extra_metadata, args, m) for m in extra_metadata])
enriched_media = []
with concurrent.futures.ThreadPoolExecutor(
max_workers=1 if args.verbose >= consts.LOG_DEBUG else args.threads
) as executor:
gen_media = (f.result() for f in [executor.submit(add_extra_metadata, args, m) for m in media])
for i, m in enumerate(gen_media):
extra_metadata[i] = m
enriched_media.append(m)
printing.print_overwrite(
f"Pages to scan {len(paths)} link scan: {new_media_count} new [{len(known_paths)} known]; extra metadata {i + 1} of {len(media)}"
)
if extra_metadata:
add_media(args, extra_metadata)
media = enriched_media
if media:
add_media(args, media)

printing.print_overwrite(
f"Pages to scan {len(paths)} link scan: {new_media_count} new [{len(known_paths)} known]"
Expand Down
2 changes: 1 addition & 1 deletion library/playback/torrents_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ def shorten(s, width):
qbt_client = torrents_start.start_qBittorrent(args)
torrents = qbt_client.torrents_info()

error_torrents = [t for t in torrents if t.state in ["missingFiles", "error"]]
error_torrents = [t for t in torrents if t.state_enum.is_errored]
error_torrents = sorted(
error_torrents, key=lambda t: (t.amount_left == t.total_size, t.eta, t.amount_left), reverse=True
)
Expand Down
3 changes: 2 additions & 1 deletion library/usage.py
Original file line number Diff line number Diff line change
Expand Up @@ -1949,6 +1949,7 @@ def play(action) -> str:
Stop incomplete downloads
library torrents --time-unseeded=+90days --time-active=+60days --time-stalled=+30days --stop
library torrents --time-active=+45days --inactive --progress=0 --stop
Move files
Expand All @@ -1964,7 +1965,7 @@ def play(action) -> str:
When --mark-deleted is provided, the torrents are tagged with 'delete' in qBittorrent
When --delete-rows is provided, the metadata is removed from qBittorrent
When --delete-files is provided, the downloaded files are deleted
When --delete-incomplete 80%% is provided, any files that were downloaded less than 80%% are deleted
When --delete-incomplete 80% is provided, any files that were downloaded less than 80% are deleted
"""


Expand Down
14 changes: 13 additions & 1 deletion library/utils/date_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,18 @@
from library.utils import iterables, nums


def specific_date(*dates):
    """Return the timestamp of the earliest plausible (past) date among *dates*.

    Each argument is a free-form date string (or falsy value, which is
    skipped) as typically found in audio-tag metadata. Strings that
    dateutil cannot parse are ignored instead of aborting the whole call.

    Returns:
        float | None: timestamp of the chosen date, or None when no
        argument parses to a date in the past.
    """
    now = datetime.datetime.now()  # hoisted: one consistent "now" for all comparisons

    valid_dates = []
    for s in dates:
        if not s:
            continue
        try:
            d = dateutil.parser.parse(str(s), fuzzy=True)
        except (ValueError, OverflowError):
            continue  # unparseable tag value; skip rather than crash metadata extraction
        if d.tzinfo is not None:
            # normalize to naive so comparison with naive now() cannot raise TypeError
            d = d.replace(tzinfo=None)
        valid_dates.append(d)

    past_dates = [d for d in valid_dates if d < now]
    if not past_dates:
        return None

    # Prefer dates with month/day specified, then the earliest date.
    # NOTE(review): parsed datetimes always have month/day >= 1, so the
    # bool(...) specificity keys are currently always True — confirm whether
    # a dateutil default-sentinel trick was intended here.
    earliest_specific_date = sorted(
        past_dates, key=lambda d: (bool(d.month), bool(d.day), -d.timestamp()), reverse=True
    )[0]
    return nums.to_timestamp(earliest_specific_date)


def tube_date(v):
upload_date = iterables.safe_unpack(
v.pop("release_date", None),
Expand All @@ -23,7 +35,7 @@ def tube_date(v):
upload_date = nums.to_timestamp(upload_date)
else:
try:
upload_date = nums.to_timestamp(dateutil.parser.parse(upload_date))
upload_date = nums.to_timestamp(dateutil.parser.parse(str(upload_date)))
except Exception:
upload_date = None
return upload_date
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ deluxe = [
"pdf2image",
"pillow",
"PyExifTool",
"pymcdm<1.3", # dominant_alts.size > 0 ValueError should be a warning instead...
"pymcdm",
"pyvirtualdisplay",
"qbittorrent-api",
"scikit-learn",
Expand Down

0 comments on commit fe0230e

Please sign in to comment.