Skip to content

Commit

Permalink
fix add_actor category restriction, fix duplicate actor_cat on reinse…
Browse files Browse the repository at this point in the history
…rt, add quality metric cli commands top tags without actor and top actors without jur (#41)
  • Loading branch information
soad003 committed Feb 28, 2023
1 parent dc453a7 commit 9b2bd10
Show file tree
Hide file tree
Showing 4 changed files with 200 additions and 8 deletions.
108 changes: 106 additions & 2 deletions src/tagpack/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -374,7 +374,7 @@ def get_label(actor_row):

return [(x["id"], get_label(x)) for x in res]

category_filter = strip_empty(args.only_categories.split(","))
category_filter = strip_empty(args.categories.split(","))
updated = tagpack.add_actors(
find_actor_candidates,
only_categories=category_filter if len(category_filter) > 0 else None,
Expand Down Expand Up @@ -884,6 +884,56 @@ def sync_repos(args):
print_fail(f"Repos to sync file {args.repos} does not exist.")


def list_low_quality_actors(args):
    """Print actors that have no jurisdiction assigned.

    Fetches the rows from the tagstore addressed by ``args.url`` and
    ``args.schema`` and prints them either as CSV or as a psql-style table.

    Args:
        args: Parsed CLI namespace. Reads ``url``, ``schema``, ``category``,
            ``max``, ``not_used`` and ``csv``.
    """
    tagstore = TagStore(args.url, args.schema)

    res = tagstore.get_actors_with_jurisdictions(
        category=args.category, max_results=args.max, include_not_used=args.not_used
    )
    df = pd.DataFrame(res)

    print_line("Actors without Jurisdictions")
    if args.csv:
        print(df.to_csv(header=True, sep=",", index=True))
        return

    # tabulate formats the frame on its own, so pandas display options
    # (max_rows / max_columns) have no influence on this output.
    print(
        tabulate(
            df,
            headers=df.columns,
            tablefmt="psql",
            # per-column wrap widths; None leaves a column unwrapped
            maxcolwidths=[None, None, 10, 10, 60, 10],
        )
    )


def list_top_labels_without_actor(args):
    """Print the most frequent tag labels that have no actor assigned.

    Fetches the rows from the tagstore addressed by ``args.url`` and
    ``args.schema`` and prints them either as CSV or as a psql-style table.

    Args:
        args: Parsed CLI namespace. Reads ``url``, ``schema``, ``category``,
            ``max`` and ``csv``.
    """
    tagstore = TagStore(args.url, args.schema)

    res = tagstore.top_labels_without_actor(
        category=args.category, max_results=args.max
    )
    df = pd.DataFrame(res)

    print_line("Top labels without actor")
    if args.csv:
        print(df.to_csv(header=True, sep=",", index=True))
        return

    # tabulate formats the frame on its own, so pandas display options
    # (max_rows / max_columns) have no influence on this output.
    print(
        tabulate(
            df,
            headers=df.columns,
            tablefmt="psql",
            # per-column wrap widths; None leaves a column unwrapped
            maxcolwidths=[None, None, 10, 50],
        )
    )


def main():
if sys.version_info < (3, 7):
sys.exit("This program requires python version 3.7 or later")
Expand Down Expand Up @@ -1104,7 +1154,7 @@ def print_help_subparser(subparser, args):
help="Limits the number of results",
)
ptp_add_actor.add_argument(
"--only_categories",
"--categories",
default="",
help="Only edit tags of a certain category (multiple possible with semi-colon)",
)
Expand Down Expand Up @@ -1435,6 +1485,60 @@ def print_help_subparser(subparser, args):
pqp_l.add_argument("--csv", action="store_true", help="Show csv output.")
pqp_l.set_defaults(func=low_quality_addresses, url=def_url)

# parser for actors missing jurisdictions
pqp_j = pqp.add_parser(
    "list-actors-without-jur", help="actors without jurisdictions."
)
pqp_j.add_argument(
    "--category", default="", help="List actors of a specific category"
)
pqp_j.add_argument(
    "--max",
    # parse as int so invalid values are rejected by argparse and the
    # value reaches the SQL LIMIT as a number, matching the int default
    type=int,
    default=5,
    help="Limits the number of results",
)
pqp_j.add_argument(
    "--schema",
    default=_DEFAULT_SCHEMA,
    metavar="DB_SCHEMA",
    help="PostgreSQL schema for quality measures tables",
)
pqp_j.add_argument(
    "-u", "--url", help="postgresql://user:password@db_host:port/database"
)
pqp_j.add_argument("--csv", action="store_true", help="Show csv output.")
pqp_j.add_argument(
    "--not_used",
    action="store_true",
    help="Include actors that are not used in tags.",
)
pqp_j.set_defaults(func=list_low_quality_actors, url=def_url)

# parser for the top labels used in tags that have no actor
pqp_j = pqp.add_parser(
    "list-labels-without-actor",
    help="List the top labels used in tags without actors.",
)
pqp_j.add_argument(
    "--category", default="", help="Only consider tags of a specific category"
)
pqp_j.add_argument(
    "--max",
    # parse as int so invalid values are rejected by argparse and the
    # value reaches the SQL LIMIT as a number, matching the int default
    type=int,
    default=5,
    help="Limits the number of results",
)
pqp_j.add_argument(
    "--schema",
    default=_DEFAULT_SCHEMA,
    metavar="DB_SCHEMA",
    help="PostgreSQL schema for quality measures tables",
)
pqp_j.add_argument(
    "-u", "--url", help="postgresql://user:password@db_host:port/database"
)
pqp_j.add_argument("--csv", action="store_true", help="Show csv output.")
pqp_j.set_defaults(func=list_top_labels_without_actor, url=def_url)

# parser for quality measures show
pqp_s = pqp.add_parser("show", help="show average quality measures")
pqp_s.add_argument(
Expand Down
6 changes: 4 additions & 2 deletions src/tagpack/db/tagstore_schema.sql
Original file line number Diff line number Diff line change
Expand Up @@ -61,13 +61,15 @@ CREATE TABLE actor (
CREATE TABLE actor_categories (
id SERIAL PRIMARY KEY,
actor_id VARCHAR REFERENCES actor(id) ON DELETE CASCADE,
category_id VARCHAR REFERENCES concept(id) ON DELETE CASCADE
category_id VARCHAR REFERENCES concept(id) ON DELETE CASCADE,
CONSTRAINT unique_category UNIQUE (actor_id, category_id)
);

CREATE TABLE actor_jurisdictions (
id SERIAL PRIMARY KEY,
actor_id VARCHAR REFERENCES actor(id) ON DELETE CASCADE,
country_id VARCHAR REFERENCES concept(id) ON DELETE CASCADE
country_id VARCHAR REFERENCES concept(id) ON DELETE CASCADE,
CONSTRAINT unique_jurisdiction UNIQUE (actor_id, country_id)
);


Expand Down
4 changes: 2 additions & 2 deletions src/tagpack/tagpack.py
Original file line number Diff line number Diff line change
Expand Up @@ -443,13 +443,13 @@ def get_user_choice_cached(hl, cache):
# Continue if tag is not of a selected category
if (
only_categories is not None
and tag.all_fields.get("category") in only_categories
and tag.all_fields.get("category") not in only_categories
):
continue

if "label" in tag.explicit_fields and "actor" not in tag.explicit_fields:
tl = tag.explicit_fields.get("label")
# candidates = find_actor_candidates(tl)
print("Working on tag: \n", tag)
actor = get_user_choice_cached(tl, user_choice_cache)
if actor:
tag.contents["actor"] = actor
Expand Down
90 changes: 88 additions & 2 deletions src/tagpack/tagstore.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,12 +221,12 @@ def insert_actorpack(
act_cat_sql = (
"INSERT INTO actor_categories (actor_id, category_id) "
"VALUES (%(actor_id)s, %(category_id)s) "
"ON CONFLICT DO NOTHING;"
"ON CONFLICT (actor_id, category_id) DO NOTHING;"
)
act_jur_sql = (
"INSERT INTO actor_jurisdictions (actor_id, country_id) "
"VALUES (%(actor_id)s, %(country_id)s) "
"ON CONFLICT DO NOTHING;"
"ON CONFLICT (actor_id, country_id) DO NOTHING;"
)

actor_data = []
Expand Down Expand Up @@ -306,6 +306,92 @@ def find_actors_for(
{k: v for k, v in zip(fields_output, x)} for x in self.cursor.fetchall()
]

def get_actors_with_jurisdictions(
    self, category="", max_results=5, include_not_used=False
) -> list[dict]:
    """Return actors that have no jurisdiction assigned.

    Note that, despite the method name, the query keeps only actors whose
    ``actor_jurisdictions.country_id`` is NULL.

    Args:
        category: If non-empty (after stripping whitespace), only actors
            having this category id are returned.
        max_results: Upper bound on the number of returned rows.
        include_not_used: If true, actors are LEFT OUTER JOINed to ``tag``
            so actors without any tag are included; otherwise an INNER JOIN
            is used.

    Returns:
        One dict per actor with the keys ``actor.id``, ``actor.label``,
        ``actor.uri``, ``actor.context``, ``categories``, ``jurisdictions``
        and ``#tags``, ordered by descending tag count.
    """
    fields = ["actor.id", "actor.label", "actor.uri", "actor.context"]
    fields_str = ",".join(fields)
    fields_output = fields + ["categories", "jurisdictions", "#tags"]
    params = {"max_results": max_results}

    # Normalise first so a whitespace-only (or None) category means "no
    # filter" instead of filtering on the empty string.
    category = category.strip() if category else ""
    cat_clause = ""
    if category:
        params["category"] = category
        cat_clause = "and actor_categories.category_id = %(category)s "

    actor_join = "LEFT OUTER JOIN" if include_not_used else "INNER JOIN"

    query = (
        f"SELECT {fields_str} "
        ", string_agg(actor_categories.category_id, ', ') as categories "
        ", string_agg(actor_jurisdictions.country_id, ', ') as jurisdictions "
        ", count(distinct tag.id) as nr_tags "
        "FROM actor "
        f"{actor_join} tag on actor.id = tag.actor "
        "INNER JOIN actor_categories on actor.id = actor_categories.actor_id "
        "LEFT OUTER JOIN "
        "actor_jurisdictions on actor.id = actor_jurisdictions.actor_id "
        "WHERE "
        "actor_jurisdictions.country_id is NULL "
        f"{cat_clause} "
        "GROUP BY actor.id "
        # Without an ORDER BY the LIMIT would pick an arbitrary subset;
        # actor.id breaks ties so the result is reproducible.
        "ORDER BY nr_tags DESC, actor.id "
        "LIMIT %(max_results)s"
    )

    self.cursor.execute(query, params)

    def format_value(k, v):
        # The joins multiply rows, so the aggregated strings can repeat
        # values; de-duplicate while keeping first-seen order. NULL
        # aggregates (None) are passed through untouched.
        if k in ("categories", "jurisdictions") and v:
            return ", ".join(dict.fromkeys(v.split(", ")))
        return v

    return [
        {k: format_value(k, v) for k, v in zip(fields_output, x)}
        for x in self.cursor.fetchall()
    ]

def top_labels_without_actor(self, category="", max_results=5) -> list[dict]:
    """Return the most frequent labels of tags that have no actor.

    Args:
        category: If non-empty (after stripping whitespace), only tags of
            this category are counted.
        max_results: Upper bound on the number of returned rows.

    Returns:
        One dict per label with the keys ``tag.label``, ``count`` and
        ``tagpacks`` (comma-separated, de-duplicated tagpack URIs), ordered
        by descending count.
    """
    fields = ["tag.label"]
    fields_str = ",".join(fields)
    fields_output = fields + ["count", "tagpacks"]
    params = {"max_results": max_results}

    # Normalise first so a whitespace-only (or None) category means "no
    # filter" instead of filtering on the empty string.
    category = category.strip() if category else ""
    cat_clause = ""
    if category:
        params["category"] = category
        cat_clause = "and tag.category = %(category)s "

    query = (
        f"SELECT {fields_str}, "
        "count(tag.id) as count, "
        "string_agg(tagpack.uri,', ') as tagpacks "
        "FROM tag "
        "INNER JOIN tagpack on tagpack.id = tag.tagpack "
        "WHERE actor is NULL "
        f"{cat_clause} "
        "GROUP BY tag.label "
        "ORDER BY count DESC "
        "LIMIT %(max_results)s"
    )

    self.cursor.execute(query, params)

    def format_value(k, v):
        # One URI is aggregated per tag, so the same tagpack can repeat;
        # de-duplicate while keeping first-seen order so the output is
        # stable between runs. NULL aggregates (None) pass through.
        if k == "tagpacks" and v:
            return ", ".join(dict.fromkeys(v.split(", ")))
        return v

    return [
        {k: format_value(k, v) for k, v in zip(fields_output, x)}
        for x in self.cursor.fetchall()
    ]

def low_quality_address_labels(self, th=0.25, currency="", category="") -> dict:
"""
This function returns a list of addresses having a quality measure
Expand Down

0 comments on commit 9b2bd10

Please sign in to comment.