fix add_actor category restriction, fix duplicate actor_cat on reinsert, add quality metric cli commands top tags without actor and top actors without jur (#41)
graphsense · Feb 28, 2023 · 9b2bd10 · 9b2bd10
1 parent dc453a7
commit 9b2bd10
Show file tree

Hide file tree

Showing 4 changed files with 200 additions and 8 deletions.
diff --git a/src/tagpack/cli.py b/src/tagpack/cli.py
@@ -374,7 +374,7 @@ def get_label(actor_row):
 
                 return [(x["id"], get_label(x)) for x in res]
 
-            category_filter = strip_empty(args.only_categories.split(","))
+            category_filter = strip_empty(args.categories.split(","))
             updated = tagpack.add_actors(
                 find_actor_candidates,
                 only_categories=category_filter if len(category_filter) > 0 else None,
@@ -884,6 +884,56 @@ def sync_repos(args):
         print_fail(f"Repos to sync file {args.repos} does not exist.")
 
 
+def list_low_quality_actors(args):
+    """Print actors that have no jurisdiction, as CSV or as a table.
+
+    Reads ``url``, ``schema``, ``category``, ``max``, ``not_used`` and
+    ``csv`` from the parsed command-line namespace ``args``. With ``csv``
+    set the result is written as comma-separated text, otherwise it is
+    rendered with tabulate's "psql" table format.
+    """
+    store = TagStore(args.url, args.schema)
+    actors = store.get_actors_with_jurisdictions(
+        category=args.category,
+        max_results=args.max,
+        include_not_used=args.not_used,
+    )
+    table = pd.DataFrame(actors)
+
+    print_line("Actors without Jurisdictions")
+
+    if args.csv:
+        print(table.to_csv(header=True, sep=",", index=True))
+        return
+
+    # Lift pandas' row/column display limits while the table is rendered.
+    display_opts = ("display.max_rows", None, "display.max_columns", None)
+    with pd.option_context(*display_opts):
+        rendered = tabulate(
+            table,
+            headers=table.columns,
+            tablefmt="psql",
+            maxcolwidths=[None, None, 10, 10, 60, 10],
+        )
+        print(rendered)
+
+
+def list_top_labels_without_actor(args):
+    """Print the most frequent tag labels that have no actor assigned.
+
+    Reads ``url``, ``schema``, ``category``, ``max`` and ``csv`` from the
+    parsed command-line namespace ``args``. With ``csv`` set the result is
+    written as comma-separated text, otherwise it is rendered with
+    tabulate's "psql" table format.
+    """
+    store = TagStore(args.url, args.schema)
+    labels = store.top_labels_without_actor(
+        category=args.category,
+        max_results=args.max,
+    )
+    table = pd.DataFrame(labels)
+
+    print_line("Top labels without actor")
+
+    if args.csv:
+        print(table.to_csv(header=True, sep=",", index=True))
+        return
+
+    # Lift pandas' row/column display limits while the table is rendered.
+    display_opts = ("display.max_rows", None, "display.max_columns", None)
+    with pd.option_context(*display_opts):
+        rendered = tabulate(
+            table,
+            headers=table.columns,
+            tablefmt="psql",
+            maxcolwidths=[None, None, 10, 50],
+        )
+        print(rendered)
+
+
 def main():
     if sys.version_info < (3, 7):
         sys.exit("This program requires python version 3.7 or later")
@@ -1104,7 +1154,7 @@ def print_help_subparser(subparser, args):
         help="Limits the number of results",
     )
     ptp_add_actor.add_argument(
-        "--only_categories",
+        "--categories",
         default="",
         help="Only edit tags of a certain category (multiple possible with semi-colon)",
     )
@@ -1435,6 +1485,60 @@ def print_help_subparser(subparser, args):
     pqp_l.add_argument("--csv", action="store_true", help="Show csv output.")
     pqp_l.set_defaults(func=low_quality_addresses, url=def_url)
 
+    # parser for actors missing Jur
+    pqp_j = pqp.add_parser(
+        "list-actors-without-jur", help="actors without jurisdictions."
+    )
+    pqp_j.add_argument(
+        "--category", default="", help="List actors of a specific category"
+    )
+    pqp_j.add_argument(
+        "--max",
+        default=5,
+        help="Limits the number of results",
+    )
+    pqp_j.add_argument(
+        "--schema",
+        default=_DEFAULT_SCHEMA,
+        metavar="DB_SCHEMA",
+        help="PostgreSQL schema for quality measures tables",
+    )
+    pqp_j.add_argument(
+        "-u", "--url", help="postgresql://user:password@db_host:port/database"
+    )
+    pqp_j.add_argument("--csv", action="store_true", help="Show csv output.")
+    pqp_j.add_argument(
+        "--not_used",
+        action="store_true",
+        help="Include actors that are not used in tags.",
+    )
+    pqp_j.set_defaults(func=list_low_quality_actors, url=def_url)
+
+    # parser top labels with no actor
+    pqp_j = pqp.add_parser(
+        "list-labels-without-actor",
+        help="List the top labels used in tags without actors.",
+    )
+    pqp_j.add_argument(
+        "--category", default="", help="List actors of a specific category"
+    )
+    pqp_j.add_argument(
+        "--max",
+        default=5,
+        help="Limits the number of results",
+    )
+    pqp_j.add_argument(
+        "--schema",
+        default=_DEFAULT_SCHEMA,
+        metavar="DB_SCHEMA",
+        help="PostgreSQL schema for quality measures tables",
+    )
+    pqp_j.add_argument(
+        "-u", "--url", help="postgresql://user:password@db_host:port/database"
+    )
+    pqp_j.add_argument("--csv", action="store_true", help="Show csv output.")
+    pqp_j.set_defaults(func=list_top_labels_without_actor, url=def_url)
+
     # parser for quality measures show
     pqp_s = pqp.add_parser("show", help="show average quality measures")
     pqp_s.add_argument(

diff --git a/src/tagpack/db/tagstore_schema.sql b/src/tagpack/db/tagstore_schema.sql
@@ -61,13 +61,15 @@ CREATE TABLE actor (
 CREATE TABLE actor_categories (
     id                  SERIAL      PRIMARY KEY,
     actor_id            VARCHAR     REFERENCES actor(id) ON DELETE CASCADE,
-    category_id         VARCHAR     REFERENCES concept(id) ON DELETE CASCADE
+    category_id         VARCHAR     REFERENCES concept(id) ON DELETE CASCADE,
+    CONSTRAINT unique_category UNIQUE (actor_id, category_id)
 );
 
 CREATE TABLE actor_jurisdictions (
     id                  SERIAL      PRIMARY KEY,
     actor_id            VARCHAR     REFERENCES actor(id) ON DELETE CASCADE,
-    country_id          VARCHAR     REFERENCES concept(id) ON DELETE CASCADE
+    country_id          VARCHAR     REFERENCES concept(id) ON DELETE CASCADE,
+    CONSTRAINT unique_jurisdiction UNIQUE (actor_id, country_id)
 );
 
 

diff --git a/src/tagpack/tagpack.py b/src/tagpack/tagpack.py
@@ -443,13 +443,13 @@ def get_user_choice_cached(hl, cache):
             # Continue if tag is not of a selected category
             if (
                 only_categories is not None
-                and tag.all_fields.get("category") in only_categories
+                and tag.all_fields.get("category") not in only_categories
             ):
                 continue
 
             if "label" in tag.explicit_fields and "actor" not in tag.explicit_fields:
                 tl = tag.explicit_fields.get("label")
-                # candidates = find_actor_candidates(tl)
+                print("Working on tag: \n", tag)
                 actor = get_user_choice_cached(tl, user_choice_cache)
                 if actor:
                     tag.contents["actor"] = actor

diff --git a/src/tagpack/tagstore.py b/src/tagpack/tagstore.py
@@ -221,12 +221,12 @@ def insert_actorpack(
         act_cat_sql = (
             "INSERT INTO actor_categories (actor_id, category_id) "
             "VALUES (%(actor_id)s, %(category_id)s) "
-            "ON CONFLICT DO NOTHING;"
+            "ON CONFLICT (actor_id, category_id) DO NOTHING;"
         )
         act_jur_sql = (
             "INSERT INTO actor_jurisdictions (actor_id, country_id) "
             "VALUES (%(actor_id)s, %(country_id)s) "
-            "ON CONFLICT DO NOTHING;"
+            "ON CONFLICT (actor_id, country_id) DO NOTHING;"
         )
 
         actor_data = []
@@ -306,6 +306,92 @@ def find_actors_for(
             {k: v for k, v in zip(fields_output, x)} for x in self.cursor.fetchall()
         ]
 
+    def get_actors_with_jurisdictions(
+        self, category="", max_results=5, include_not_used=False
+    ) -> list[dict]:
+        """
+        Return actors that have at least one category but no jurisdiction.
+
+        Despite its name, the query keeps only rows whose
+        actor_jurisdictions.country_id is NULL, so the "jurisdictions" value
+        of every returned row is None.
+
+        :param category: if non-empty, only actors having this category id
+            are returned (surrounding whitespace is stripped).
+        :param max_results: upper bound on the number of rows (SQL LIMIT).
+        :param include_not_used: if True, actors that are not referenced by
+            any tag are included too (LEFT OUTER JOIN on tag instead of
+            INNER JOIN).
+        :return: one dict per actor with the keys "actor.id", "actor.label",
+            "actor.uri", "actor.context", "categories", "jurisdictions" and
+            "#tags".
+        """
+        fields = ["actor.id", "actor.label", "actor.uri", "actor.context"]
+        fields_str = ",".join(fields)
+        fields_output = fields + ["categories", "jurisdictions", "#tags"]
+        params = {
+            "max_results": max_results,
+        }
+
+        cat_clause = ""
+        # Truthiness check so that None is treated like "no category filter".
+        if category:
+            params["category"] = category.strip()
+            cat_clause = "and actor_categories.category_id = %(category)s "
+
+        actor_join = "LEFT OUTER JOIN " if include_not_used else "INNER JOIN"
+
+        # NOTE(review): there is no ORDER BY, so which rows survive the LIMIT
+        # is left to the database.
+        query = (
+            f"SELECT {fields_str} "
+            ", string_agg(actor_categories.category_id, ', ') as categories  "
+            ", string_agg(actor_jurisdictions.country_id, ', ') as jurisdictions  "
+            ", count(distinct tag.id) as nr_tags  "
+            "FROM actor "
+            f"{actor_join} tag on actor.id = tag.actor "
+            "INNER JOIN actor_categories on actor.id = actor_categories.actor_id "
+            "LEFT OUTER JOIN "
+            "actor_jurisdictions on actor.id = actor_jurisdictions.actor_id "
+            "WHERE "
+            "actor_jurisdictions.country_id is NULL "
+            f"{cat_clause} "
+            "GROUP BY actor.id "
+            "LIMIT %(max_results)s"
+        )
+
+        self.cursor.execute(query, params)
+
+        def format_value(k, v):
+            # The joins repeat each category once per joined tag row, so the
+            # aggregated strings are de-duplicated here. The membership test
+            # is grouped explicitly: "a or b and v" would skip the None check
+            # for "categories". dict.fromkeys keeps first-seen order, which
+            # makes the output reproducible (a set would not).
+            if k in ("categories", "jurisdictions") and v:
+                return ", ".join(dict.fromkeys(v.split(", ")))
+            return v
+
+        return [
+            {k: format_value(k, v) for k, v in zip(fields_output, x)}
+            for x in self.cursor.fetchall()
+        ]
+
+    def top_labels_without_actor(self, category="", max_results=5) -> list[dict]:
+        """
+        Return the most frequently used tag labels that have no actor set.
+
+        :param category: if non-empty, only tags of this category are
+            counted (surrounding whitespace is stripped).
+        :param max_results: upper bound on the number of rows (SQL LIMIT).
+        :return: one dict per label, ordered by descending tag count, with
+            the keys "tag.label", "count" and "tagpacks" (the distinct
+            tagpack URIs joined by ", ").
+        """
+        fields = ["tag.label"]
+        fields_str = ",".join(fields)
+        fields_output = fields + ["count", "tagpacks"]
+        params = {
+            "max_results": max_results,
+        }
+
+        cat_clause = ""
+        # Truthiness check so that None is treated like "no category filter".
+        if category:
+            params["category"] = category.strip()
+            cat_clause = "and tag.category = %(category)s "
+
+        query = (
+            f"SELECT {fields_str}, "
+            "count(tag.id) as count, "
+            "string_agg(tagpack.uri,', ') as tagpacks "
+            "FROM tag "
+            "INNER JOIN tagpack on tagpack.id = tag.tagpack "
+            "WHERE actor is NULL "
+            f"{cat_clause} "
+            "GROUP BY tag.label "
+            "ORDER BY count DESC "
+            "LIMIT %(max_results)s"
+        )
+
+        self.cursor.execute(query, params)
+
+        def format_value(k, v):
+            # string_agg yields one URI per tag row, so the same tagpack can
+            # appear many times. dict.fromkeys removes the repeats while
+            # keeping first-seen order, which makes the output reproducible
+            # (a set would not).
+            if k == "tagpacks" and v:
+                return ", ".join(dict.fromkeys(v.split(", ")))
+            return v
+
+        return [
+            {k: format_value(k, v) for k, v in zip(fields_output, x)}
+            for x in self.cursor.fetchall()
+        ]
+
     def low_quality_address_labels(self, th=0.25, currency="", category="") -> dict:
         """
         This function returns a list of addresses having a quality meassure