From c24fa1a9af33b847d3ddf846a368471e12d252f8 Mon Sep 17 00:00:00 2001 From: Gibran Gomez Date: Wed, 30 Nov 2022 00:07:01 +0100 Subject: [PATCH 1/7] Added new feature to list addresses with a quality below some threshold, to identify potential actors --- bin/tagpack-tool | 62 +++++++++++++++++++++++++++++++++++++++++++++ tagpack/tagstore.py | 43 ++++++++++++++++++++++++++++--- 2 files changed, 102 insertions(+), 3 deletions(-) diff --git a/bin/tagpack-tool b/bin/tagpack-tool index 6045ed0..dca1208 100755 --- a/bin/tagpack-tool +++ b/bin/tagpack-tool @@ -158,6 +158,48 @@ def insert_taxonomy(args, remote=False): print_line("Aborted insert", "fail") +def low_quality_addresses(args): + print_line("Addresses with low quality") + tagstore = TagStore(args.url, args.schema) + + try: + la = tagstore.low_quality_address_labels(args.threshold, args.currency) + if la: + c = args.currency if args.currency else 'all' + print(f"List of {c} addresses and labels ({len(la)}):") + intersections = [] + for (currency, address), labels in la.items(): + print(f"\t{currency}\t{address}\t{labels}") + + if not args.cluster: + continue + + # Produce clusters of addresses based on tag intersections + seen = set() + for i, (e, n) in enumerate(intersections): + seen = e.intersection(labels) + if seen: + e.update(labels) + n += 1 + intersections[i] = (e, n) + break + if not seen: + intersections.append((set(labels), 1)) + + if args.cluster: + print("\nSets of tags appearing in several addresses:") + s_int = sorted(intersections, key=lambda x: x[1], reverse=True) + for (k, v) in s_int: + if v > 1: + print(f"\t{v}: {', '.join(k)}") + else: + print("\tNone") + + except Exception as e: + print_fail(e) + print_line("Operation failed", 'fail') + + def show_quality_measures(args): print_line("Show quality measures") tagstore = TagStore(args.url, args.schema) @@ -722,6 +764,26 @@ def main(): ) pqp_i.set_defaults(func=calc_quality_measures, url=def_url) + # parser for quality measures list + pqp_l = 
pqp.add_parser('list', help='list low quality addresses') + pqp_l.add_argument( + '--currency', default='', + choices=['BCH', 'BTC', 'ETH', 'LTC', 'ZEC'], + help="Show low quality addresses of a specific crypto-currency") + pqp_l.add_argument( + '--threshold', default=0.25, + help="List addresses having a quality lower than this threshold") + pqp_l.add_argument( + '-c', '--cluster', action='store_true', + help="Cluster addresses having intersections of similar tags") + pqp_l.add_argument( + '--schema', default=_DEFAULT_SCHEMA, metavar='DB_SCHEMA', + help="PostgreSQL schema for quality measures tables") + pqp_l.add_argument( + '-u', '--url', + help="postgresql://user:password@db_host:port/database") + pqp_l.set_defaults(func=low_quality_addresses, url=def_url) + # parser for quality measures show pqp_s = pqp.add_parser("show", help="show average quality measures") pqp_s.add_argument( diff --git a/tagpack/tagstore.py b/tagpack/tagstore.py index e30538f..c0bd4e0 100644 --- a/tagpack/tagstore.py +++ b/tagpack/tagstore.py @@ -109,6 +109,43 @@ def insert_tagpack( self.conn.commit() + def low_quality_address_labels(self, th=0.25, currency='') -> dict: + ''' + This function returns a dict of addresses having a quality measure + equal to or lower than a threshold value, along with the corresponding + tags for each address. 
+ ''' + currency = currency.upper() + if currency not in ['', 'BCH', 'BTC', 'ETH', 'LTC', 'ZEC']: + raise ValidationError(f"Currency not supported: {currency}") + + if not currency: + currency = '%' + + msg = "Threshold must be a float number between 0 and 1" + try: + th = float(th) + if th < 0 or th > 1: + raise ValidationError(msg) + except ValueError: + raise ValidationError(msg) + + q = "SELECT j.currency, j.address, array_agg(j.label) labels \ + FROM ( \ + SELECT q.currency, q.address, t.label \ + FROM address_quality q, tag t \ + WHERE q.currency::text LIKE %s \ + AND q.address=t.address \ + AND q.quality <= %s \ + ) as j \ + GROUP BY j.currency, j.address" + + self.cursor.execute(q, (currency, th, )) + + return { + (row[0], row[1]): row[2] for row in self.cursor.fetchall() + } + def remove_duplicates(self): self.cursor.execute( """ @@ -206,7 +243,7 @@ def get_ingested_tagpacks(self) -> list: self.cursor.execute("SELECT id from tagpack") return [i[0] for i in self.cursor.fetchall()] - def get_quality_measures(self, currency="") -> float: + def get_quality_measures(self, currency="") -> dict: """ This function returns a dict with the quality measures (count, avg, and stddev) for a specific currency, or for all if currency is not @@ -214,7 +251,7 @@ def get_quality_measures(self, currency="") -> float: """ currency = currency.upper() if currency not in ["", "BCH", "BTC", "ETH", "LTC", "ZEC"]: - raise ValidationError("Currency not supported: {currency}") + raise ValidationError(f"Currency not supported: {currency}") query = "SELECT COUNT(quality), AVG(quality), STDDEV(quality)" query += " FROM address_quality" @@ -227,7 +264,7 @@ def get_quality_measures(self, currency="") -> float: keys = ["count", "avg", "stddev"] return {keys[i]: v for row in self.cursor.fetchall() for i, v in enumerate(row)} - def calculate_quality_measures(self) -> float: + def calculate_quality_measures(self) -> dict: self.cursor.execute("CALL calculate_quality()") 
self.cursor.execute("CALL insert_address_quality()") self.conn.commit() From 8d8777a85acd2f27db743b7899d84daa52baf753 Mon Sep 17 00:00:00 2001 From: Gibran Gomez Date: Fri, 2 Dec 2022 16:17:26 +0100 Subject: [PATCH 2/7] First draft for supporting ActorPacks --- MANIFEST.in | 3 +- bin/tagpack-tool | 213 +++++++++++++++++++++++- tagpack/__init__.py | 13 ++ tagpack/actorpack.py | 205 +++++++++++++++++++++++ tagpack/actorpack_schema.py | 98 +++++++++++ tagpack/conf/actorpack_schema.yaml | 37 +++++ tagpack/db/countries.csv | 250 +++++++++++++++++++++++++++++ tagpack/db/tagstore_schema.sql | 33 ++++ tagpack/tagpack.py | 14 +- tagpack/tagstore.py | 102 ++++++++++++ tagpack/taxonomy.py | 9 +- tests/test_actorpack_schema.py | 141 ++++++++++++++++ 12 files changed, 1098 insertions(+), 20 deletions(-) create mode 100644 tagpack/actorpack.py create mode 100644 tagpack/actorpack_schema.py create mode 100644 tagpack/conf/actorpack_schema.yaml create mode 100644 tagpack/db/countries.csv create mode 100644 tests/test_actorpack_schema.py diff --git a/MANIFEST.in b/MANIFEST.in index cfc3bd1..568d227 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,5 @@ include tagpack/db/*.cql include tagpack/db/*.csv include tagpack/conf/tagpack_schema.yaml -include tagpack/conf/confidence.csv \ No newline at end of file +include tagpack/conf/actorpack_schema.yaml +include tagpack/conf/confidence.csv diff --git a/bin/tagpack-tool b/bin/tagpack-tool index dca1208..e25c3b5 100755 --- a/bin/tagpack-tool +++ b/bin/tagpack-tool @@ -29,6 +29,8 @@ from tagpack.tagpack import ( get_uri_for_tagpack, ) from tagpack.tagpack_schema import TagPackSchema, ValidationError +from tagpack.actorpack_schema import ActorPackSchema +from tagpack.actorpack import ActorPack from tagpack.tagstore import TagStore from tagpack.taxonomy import Taxonomy @@ -48,6 +50,7 @@ DEFAULT_CONFIG = { "entity": f"{TAXONOMY_URL}/DW-VA-Taxonomy/assets/data/entities.csv", "abuse": f"{TAXONOMY_URL}/DW-VA-Taxonomy/assets/data/abuses.csv", 
"confidence": "tagpack/db/confidence.csv", + "country": "tagpack/db/countries.csv" } } @@ -55,12 +58,17 @@ DEFAULT_CONFIG = { _DEFAULT_SCHEMA = "tagstore" +def _solve_remote(taxonomy): + # Actually we work local files for confidence and country taxonomies, but + # this may change in the future + return not (taxonomy == "confidence" or taxonomy == "country") + def _load_taxonomies(config): if "taxonomies" not in config: return None taxonomies = {} for key in config["taxonomies"]: - remote = not (key == "confidence") + remote = _solve_remote(key) taxonomy = _load_taxonomy(config, key, remote=remote) taxonomies[key] = taxonomy return taxonomies @@ -101,7 +109,7 @@ def show_taxonomy_concepts(args, remote=False): return print_line("Showing concepts of taxonomy {}".format(args.taxonomy)) - remote = not (args.taxonomy == "confidence") + remote = _solve_remote(args.taxonomy) uri = config["taxonomies"][args.taxonomy] print(f"{'Remote' if remote else 'Local'} URI: {uri}\n") taxonomy = _load_taxonomy(config, args.taxonomy, remote=remote) @@ -142,7 +150,7 @@ def insert_taxonomy(args, remote=False): print(f"Taxonomy: {t}") try: # TODO this should change when having local taxonomies - remote = not (t == "confidence") + remote = _solve_remote(t) taxonomy = _load_taxonomy(config, t, remote=remote) tagstore.insert_taxonomy(taxonomy) @@ -507,6 +515,127 @@ def show_tagstore_composition(args): print(tabulate(df, headers=headers, tablefmt="psql")) +def validate_actorpack(args): + config = _load_config(args.config) + + t0 = time.time() + print_line("ActorPack validation starts") + print(f"Path: {args.path}") + + taxonomies = _load_taxonomies(config) + taxonomy_keys = taxonomies.keys() + print(f"Loaded taxonomies: {taxonomy_keys}") + + schema = ActorPackSchema() + print(f"Loaded schema: {schema.definition}") + + actorpack_files = collect_tagpack_files(args.path) + n_actorpacks = len([f for fs in actorpack_files.values() for f in fs]) + print_info(f"Collected {n_actorpacks} ActorPack 
files\n") + + no_passed = 0 + try: + for headerfile_dir, files in actorpack_files.items(): + for actorpack_file in files: + actorpack = ActorPack.load_from_file( + '', actorpack_file, schema, taxonomies, headerfile_dir + ) + + print(f"{actorpack_file}: ", end="") + + actorpack.validate() + print_success("PASSED") + + no_passed += 1 + except (ValidationError, TagPackFileError) as e: + print_fail("FAILED", e) + + status = "fail" if no_passed < n_actorpacks else "success" + + duration = round(time.time() - t0, 2) + msg = f"{no_passed}/{n_actorpacks} ActorPacks passed in {duration}s" + print_line(msg, status) + + +def insert_actorpacks(args): + t0 = time.time() + print_line("ActorPack insert starts") + print(f"Path: {args.path}") + + if args.no_git: + base_url = args.path + print_line("No repository detection done.") + else: + base_url = get_repository(args.path) + print_line(f"Detected repository root in {base_url}") + + tagstore = TagStore(args.url, args.schema) + + schema = ActorPackSchema() + print_info(f"Loaded ActorPack schema definition: {schema.definition}") + + config = _load_config(args.config) + taxonomies = _load_taxonomies(config) + taxonomy_keys = taxonomies.keys() + print(f"Loaded taxonomies: {taxonomy_keys}") + + actorpack_files = collect_tagpack_files(args.path) + + # resolve backlinks to remote repository and relative paths + # For the URI we use the same logic for ActorPacks than for TagPacks + scheck, nogit = not args.no_strict_check, args.no_git + prepared_packs = [ + (m, h, n[0], n[1]) + for m, h, n in [ + (a, h, get_uri_for_tagpack(base_url, a, scheck, nogit)) + for h, fs in actorpack_files.items() + for a in fs + ] + ] + + prefix = config.get("prefix", "") + if args.add_new: # don't re-insert existing tagpacks + print_info("Checking which ActorPacks are new to the tagstore:") + prepared_packs = [ + (t, h, u, r) + for (t, h, u, r) in prepared_packs + if not tagstore.actorpack_exists(prefix, r) + ] + + n_ppacks = len(prepared_packs) + 
print_info(f"Collected {n_ppacks} ActorPack files\n") + + no_passed = 0 + no_actors = 0 + public, force = args.public, args.force + supported = tagstore.supported_currencies + for i, pack in enumerate(sorted(prepared_packs), start=1): + actorpack_file, headerfile_dir, uri, relpath = pack + + actorpack = ActorPack.load_from_file( + uri, actorpack_file, schema, taxonomies, headerfile_dir + ) + + print(f"{i} {actorpack_file}: ", end="") + try: + tagstore.insert_actorpack(actorpack, public, force, prefix, relpath) + print_success(f"PROCESSED {len(actorpack.actors)} Actors") + no_passed += 1 + no_actors += len(actorpack.actors) + except Exception as e: + print_fail("FAILED", e) + + status = "fail" if no_passed < n_ppacks else "success" + + duration = round(time.time() - t0, 2) + msg = "Processed {}/{} ActorPacks with {} Tags in {}s." + print_line(msg.format(no_passed, n_ppacks, no_actors, duration), status) +# msg = "Don't forget to run 'tagstore refresh_views' soon to keep the database" +# msg += " consistent!" 
+# print_info(msg) + + + def main(): parser = ArgumentParser( description="GraphSense TagPack validation and insert tool", @@ -609,6 +738,84 @@ def main(): ) ptp_i.set_defaults(func=insert_tagpack, url=def_url) + + + # parsers for actorpack command + parser_ap = subparsers.add_parser("actorpack", help="actorpack commands") + + app = parser_ap.add_subparsers(title="ActorPack commands") + + # TODO parser for list command +# app_l = app.add_parser("list", help="list ActorPacks") + + # parser for validate command + app_v = app.add_parser("validate", help="validate ActorPacks") + app_v.add_argument( + "path", + nargs="?", + metavar="PATH", + default=os.getcwd(), + help="ActorPack file or folder root path (current folder by default)", + ) + app_v.set_defaults(func=validate_actorpack) + + # parser for insert command + app_i = app.add_parser("insert", help="insert ActorPacks") + app_i.add_argument( + "path", + nargs="?", + metavar="PATH", + default=os.getcwd(), + help="ActorPacks file or folder root path", + ) + app_i.add_argument( + "--schema", + default=_DEFAULT_SCHEMA, + metavar="DB_SCHEMA", + help="PostgreSQL schema for actorpack tables", + ) + app_i.add_argument( + "-u", "--url", help="postgresql://user:password@db_host:port/database" + ) + app_i.add_argument( + "-b", + "--batch_size", + nargs="?", + type=int, + default=1000, + help="batch size for insert", + ) + app_i.add_argument( + "--public", + action="store_true", + help="By default, actorpacks are declared private in the database.\ + Use this switch to declare them public.", + ) + app_i.add_argument( + "--force", + action="store_true", + help="By default, actorpack insertion stops when an already inserted \ + actorpack exists in the database. Use this switch to force \ + re-insertion.", + ) + app_i.add_argument( + "--add_new", + action="store_true", + help="By default, actorpack insertion stops when an already inserted \ + actorpack exists in the database. 
Use this switch to insert \ + new actorpacks while skipping over existing ones.", + ) + app_i.add_argument( + "--no_strict_check", + action="store_true", + help="Disables check for local modifications in git repository", + ) + app_i.add_argument( + "--no_git", action="store_true", help="Disables check for local git repository" + ) + app_i.set_defaults(func=insert_actorpacks, url=def_url) + + # parser for taxonomy command parser_t = subparsers.add_parser("taxonomy", help="taxonomy commands") parser_t.set_defaults(func=list_taxonomies) diff --git a/tagpack/__init__.py b/tagpack/__init__.py index 5f1224c..ad2c631 100644 --- a/tagpack/__init__.py +++ b/tagpack/__init__.py @@ -1,6 +1,7 @@ """Module functions and classes for tagpack-tool""" from tagpack._version import __version__ +import yaml def get_version(): @@ -33,3 +34,15 @@ def __str__(self): if self.nested_exception: msg = msg + "\nError Details: " + str(self.nested_exception) return msg + + +# https://gist.github.com/pypt/94d747fe5180851196eb +class UniqueKeyLoader(yaml.FullLoader): + def construct_mapping(self, node, deep=False): + mapping = set() + for key_node, value_node in node.value: + key = self.construct_object(key_node, deep=deep) + if key in mapping: + raise ValidationError(f"Duplicate {key!r} key found in YAML.") + mapping.add(key) + return super().construct_mapping(node, deep) diff --git a/tagpack/actorpack.py b/tagpack/actorpack.py new file mode 100644 index 0000000..8a0ae03 --- /dev/null +++ b/tagpack/actorpack.py @@ -0,0 +1,205 @@ +"""ActorPack - A wrapper for ActorPack files""" +import os +import sys +import yaml +import json +from yamlinclude import YamlIncludeConstructor +from tagpack.cmd_utils import print_info +from tagpack import TagPackFileError, ValidationError, UniqueKeyLoader + + +class ActorPack(object): + """Represents an ActorPack""" + + def __init__(self, uri, contents, schema, taxonomies): + self.uri = uri + self.contents = contents + self.schema = schema + self.taxonomies = 
taxonomies + self._unique_actors = [] + self._duplicates = [] + + def load_from_file(uri, pathname, schema, taxonomies, header_dir=None): + YamlIncludeConstructor.add_to_loader_class( + loader_class=yaml.FullLoader, base_dir=header_dir + ) + + if not os.path.isfile(pathname): + sys.exit("This program requires {} to be a file".format(pathname)) + contents = yaml.load(open(pathname, "r"), UniqueKeyLoader) + + if "header" in contents.keys(): + for k, v in contents["header"].items(): + contents[k] = v + contents.pop("header") + return ActorPack(uri, contents, schema, taxonomies) + + @property + def all_header_fields(self): + """Returns all ActorPack header fields, including generic actor fields""" + try: + return {k: v for k, v in self.contents.items()} + except AttributeError: + raise TagPackFileError("Cannot extract ActorPack fields") + + @property + def header_fields(self): + """Returns only ActorPack header fields that are defined as such""" + try: + return { + k: v for k, v in self.contents.items() if k in self.schema.header_fields + } + except AttributeError: + raise TagPackFileError("Cannot extract ActorPack fields") + + @property + def actor_fields(self): + """Returns actor fields defined in the ActorPack header""" + try: + return { + k: v + for k, v in self.contents.items() + if k != "actors" and k in self.schema.actor_fields + } + except AttributeError: + raise TagPackFileError("Cannot extract ActorPack fields") + + @property + def actors(self): + """Returns all actors defined in a ActorPack's body""" + try: + return [ + Actor.from_contents(actor, self) + for actor in self.contents["actors"] + ] + except AttributeError: + raise TagPackFileError("Cannot extract actors from ActorPack") + + def get_unique_actors(self): + if self._unique_actors: + return self._unique_actors + + seen = set() + duplicates = [] + + for actor in self.actors: + # check if duplicate entry + t = tuple( + str(actor.all_fields.get(k)).lower() for k in ["id", "label"] + ) + if t in seen: + 
duplicates.append(t) + else: + seen.add(t) + self._unique_actors.append(actor) + + self._duplicates = duplicates + return self._unique_actors + + def validate(self): + """Validates an ActorPack against its schema and used taxonomies""" + + # check if mandatory header fields are used by an ActorPack + for schema_field in self.schema.mandatory_header_fields: + if schema_field not in self.header_fields: + msg = f"Mandatory header field {schema_field} missing" + raise ValidationError(msg) + + # check header fields' types, taxonomy and mandatory use + for field, value in self.all_header_fields.items(): + # check a field is defined + if field not in self.schema.all_fields: + raise ValidationError(f"Field {field} not allowed in header") + # check for None values + if value is None: + msg = f"Value of header field {field} must not be empty (None)" + raise ValidationError(msg) + + self.schema.check_type(field, value) + self.schema.check_taxonomies(field, value, self.taxonomies) + + if len(self.actors) < 1: + raise ValidationError("No actors found.") + + # iterate over all actors, check types, taxonomy and mandatory use + e2 = "Mandatory tag field {} missing in {}" + e3 = "Field {} not allowed in {}" + e4 = "Value of body field {} must not be empty (None) in {}" + for actor in self.get_unique_actors(): + # check if mandatory actor fields are defined + if not isinstance(actor, Actor): + raise ValidationError(f"Unknown actor type {type(actor)}") + + for schema_field in self.schema.mandatory_actor_fields: + if schema_field not in actor.explicit_fields \ + and schema_field not in self.actor_fields: + raise ValidationError(e2.format(schema_field, actor)) + + for field, value in actor.explicit_fields.items(): + # check whether field is defined as body field + if field not in self.schema.actor_fields: + raise ValidationError(e3.format(field, actor)) + + # check for None values + if value is None: + raise ValidationError(e4.format(field, actor)) + + # check types and taxonomy use + 
try: + self.schema.check_type(field, value) + self.schema.check_taxonomies(field, value, self.taxonomies) + except ValidationError as e: + raise ValidationError(f"{e} in {actor}") + + if self._duplicates: + msg = f"{len(self._duplicates)} duplicate(s) found, starting "\ + f"with {self._duplicates[0]}\n" + print_info(msg) + return True + + def to_json(self): + """Returns a JSON representation of an ActorPack's header""" + actorpack = {} + for k, v in self.header_fields.items(): + if k != "actors": + actorpack[k] = v + return json.dumps(actorpack, indent=4, sort_keys=True, default=str) + + def __str__(self): + """Returns a string serialization of the entire ActorPack""" + return str(self.contents) + + +class Actor(object): + """An actor""" + + def __init__(self, contents, actorpack): + self.contents = contents + self.actorpack = actorpack + + @staticmethod + def from_contents(contents, actorpack): + return Actor(contents, actorpack) + + @property + def explicit_fields(self): + """Return only explicitly defined actor fields""" + return {k: v for k, v in self.contents.items()} + + @property + def all_fields(self): + """Return all actor fields (explicit and generic)""" + return { + **self.actorpack.actor_fields, + **self.explicit_fields, + } + + def to_json(self): + """Returns a JSON serialization of all actor fields""" + actor = self.all_fields + return json.dumps(actor, indent=4, sort_keys=True, default=str) + + def __str__(self): + """Returns a string serialization of an Actor""" + return str(self.all_fields) + diff --git a/tagpack/actorpack_schema.py b/tagpack/actorpack_schema.py new file mode 100644 index 0000000..51531cd --- /dev/null +++ b/tagpack/actorpack_schema.py @@ -0,0 +1,98 @@ +"""ActorPack - A wrapper for the ActorPack Schema""" +import os +import json +import yaml +import datetime +#from json import JSONDecodeError +import pandas as pd +import importlib.resources as pkg_resources + +from . import conf +from . 
import db +from tagpack import ValidationError + +ACTORPACK_SCHEMA_FILE = "actorpack_schema.yaml" +COUNTRIES_FILE = "countries.csv" + +class ActorPackSchema(object): + """Defines the structure of an ActorPack and supports validation""" + + def __init__(self): + schema = pkg_resources.read_text(conf, ACTORPACK_SCHEMA_FILE) + self.schema = yaml.safe_load(schema) + countries = pkg_resources.open_text(db, COUNTRIES_FILE) + self.countries = pd.read_csv(countries, index_col="id") + self.definition = ACTORPACK_SCHEMA_FILE + + @property + def header_fields(self): + return {k: v for k, v in self.schema["header"].items()} + + @property + def mandatory_header_fields(self): + return {k: v for k, v in self.schema["header"].items() if v["mandatory"]} + + @property + def actor_fields(self): + return {k: v for k, v in self.schema["actor"].items()} + + @property + def mandatory_actor_fields(self): + return {k: v for k, v in self.actor_fields.items() if v["mandatory"]} + + @property + def all_fields(self): + """Returns all header and body fields""" + return {**self.header_fields, **self.actor_fields} + + def field_type(self, field): + return self.all_fields[field]["type"] + + def field_taxonomy(self, field): + try: + return self.all_fields[field].get("taxonomy") + except KeyError: + return None + + def check_type(self, field, value): + """Checks whether a field's type matches the definition""" + schema_type = self.field_type(field) + if schema_type == "text": + if not isinstance(value, str): + raise ValidationError("Field {} must be of type text".format(field)) + if len(value.strip()) == 0: + raise ValidationError("Empty value in text field {}".format(field)) + elif schema_type == "datetime": + if not isinstance(value, datetime.date): + raise ValidationError(f"Field {field} must be of type datetime") + elif schema_type == "boolean": + if not isinstance(value, bool): + raise ValidationError(f"Field {field} must be of type boolean") + elif schema_type == "list": + if not 
isinstance(value, list): + raise ValidationError(f"Field {field} must be of type list") + else: + raise ValidationError("Unsupported schema type {}".format(schema_type)) + return True + + def check_taxonomies(self, field, value, taxonomies): + """Checks whether a field uses values from given taxonomies""" + if not self.field_taxonomy(field): + # No taxonomy was requested + return True + elif not taxonomies: + raise ValidationError("No taxonomies loaded") + + expected_taxonomy_id = self.field_taxonomy(field) + expected_taxonomy = taxonomies.get(expected_taxonomy_id) + + if expected_taxonomy is None: + raise ValidationError(f"Unknown taxonomy {expected_taxonomy_id}") + + for v in value if isinstance(value, list) else [value]: + if v not in expected_taxonomy.concept_ids: + msg = f"Undefined concept {v} for {field} field" + raise ValidationError(msg) + + return True + diff --git a/tagpack/conf/actorpack_schema.yaml b/tagpack/conf/actorpack_schema.yaml new file mode 100644 index 0000000..7197453 --- /dev/null +++ b/tagpack/conf/actorpack_schema.yaml @@ -0,0 +1,37 @@ +header: + title: + type: text + mandatory: true + creator: + type: text + mandatory: true + description: + type: text + mandatory: false + is_public: + type: boolean + mandatory: false + actors: + type: list + mandatory: true +actor: + id: + type: text + mandatory: true + uri: + type: text + mandatory: true + label: + type: text + mandatory: true + lastmod: + type: datetime + mandatory: true + categories: + type: list + mandatory: true + taxonomy: entity + jurisdictions: + type: list + mandatory: false + taxonomy: country diff --git a/tagpack/db/countries.csv b/tagpack/db/countries.csv new file mode 100644 index 0000000..add2e7a --- /dev/null +++ b/tagpack/db/countries.csv @@ -0,0 +1,250 @@ +label,id,description +Afghanistan,AF,ISO-3166_AF +Åland Islands,AX,ISO-3166_AX +Albania,AL,ISO-3166_AL +Algeria,DZ,ISO-3166_DZ +American Samoa,AS,ISO-3166_AS +Andorra,AD,ISO-3166_AD +Angola,AO,ISO-3166_AO 
+Anguilla,AI,ISO-3166_AI +Antarctica,AQ,ISO-3166_AQ +Antigua and Barbuda,AG,ISO-3166_AG +Argentina,AR,ISO-3166_AR +Armenia,AM,ISO-3166_AM +Aruba,AW,ISO-3166_AW +Australia,AU,ISO-3166_AU +Austria,AT,ISO-3166_AT +Azerbaijan,AZ,ISO-3166_AZ +Bahamas,BS,ISO-3166_BS +Bahrain,BH,ISO-3166_BH +Bangladesh,BD,ISO-3166_BD +Barbados,BB,ISO-3166_BB +Belarus,BY,ISO-3166_BY +Belgium,BE,ISO-3166_BE +Belize,BZ,ISO-3166_BZ +Benin,BJ,ISO-3166_BJ +Bermuda,BM,ISO-3166_BM +Bhutan,BT,ISO-3166_BT +"Bolivia, Plurinational State of",BO,ISO-3166_BO +"Bonaire, Sint Eustatius and Saba",BQ,ISO-3166_BQ +Bosnia and Herzegovina,BA,ISO-3166_BA +Botswana,BW,ISO-3166_BW +Bouvet Island,BV,ISO-3166_BV +Brazil,BR,ISO-3166_BR +British Indian Ocean Territory,IO,ISO-3166_IO +Brunei Darussalam,BN,ISO-3166_BN +Bulgaria,BG,ISO-3166_BG +Burkina Faso,BF,ISO-3166_BF +Burundi,BI,ISO-3166_BI +Cambodia,KH,ISO-3166_KH +Cameroon,CM,ISO-3166_CM +Canada,CA,ISO-3166_CA +Cape Verde,CV,ISO-3166_CV +Cayman Islands,KY,ISO-3166_KY +Central African Republic,CF,ISO-3166_CF +Chad,TD,ISO-3166_TD +Chile,CL,ISO-3166_CL +China,CN,ISO-3166_CN +Christmas Island,CX,ISO-3166_CX +Cocos (Keeling) Islands,CC,ISO-3166_CC +Colombia,CO,ISO-3166_CO +Comoros,KM,ISO-3166_KM +Congo,CG,ISO-3166_CG +"Congo, the Democratic Republic of the",CD,ISO-3166_CD +Cook Islands,CK,ISO-3166_CK +Costa Rica,CR,ISO-3166_CR +Côte d'Ivoire,CI,ISO-3166_CI +Croatia,HR,ISO-3166_HR +Cuba,CU,ISO-3166_CU +Curaçao,CW,ISO-3166_CW +Cyprus,CY,ISO-3166_CY +Czech Republic,CZ,ISO-3166_CZ +Denmark,DK,ISO-3166_DK +Djibouti,DJ,ISO-3166_DJ +Dominica,DM,ISO-3166_DM +Dominican Republic,DO,ISO-3166_DO +Ecuador,EC,ISO-3166_EC +Egypt,EG,ISO-3166_EG +El Salvador,SV,ISO-3166_SV +Equatorial Guinea,GQ,ISO-3166_GQ +Eritrea,ER,ISO-3166_ER +Estonia,EE,ISO-3166_EE +Ethiopia,ET,ISO-3166_ET +Falkland Islands (Malvinas),FK,ISO-3166_FK +Faroe Islands,FO,ISO-3166_FO +Fiji,FJ,ISO-3166_FJ +Finland,FI,ISO-3166_FI +France,FR,ISO-3166_FR +French Guiana,GF,ISO-3166_GF +French Polynesia,PF,ISO-3166_PF 
+French Southern Territories,TF,ISO-3166_TF +Gabon,GA,ISO-3166_GA +Gambia,GM,ISO-3166_GM +Georgia,GE,ISO-3166_GE +Germany,DE,ISO-3166_DE +Ghana,GH,ISO-3166_GH +Gibraltar,GI,ISO-3166_GI +Greece,GR,ISO-3166_GR +Greenland,GL,ISO-3166_GL +Grenada,GD,ISO-3166_GD +Guadeloupe,GP,ISO-3166_GP +Guam,GU,ISO-3166_GU +Guatemala,GT,ISO-3166_GT +Guernsey,GG,ISO-3166_GG +Guinea,GN,ISO-3166_GN +Guinea-Bissau,GW,ISO-3166_GW +Guyana,GY,ISO-3166_GY +Haiti,HT,ISO-3166_HT +Heard Island and McDonald Islands,HM,ISO-3166_HM +Holy See (Vatican City State),VA,ISO-3166_VA +Honduras,HN,ISO-3166_HN +Hong Kong,HK,ISO-3166_HK +Hungary,HU,ISO-3166_HU +Iceland,IS,ISO-3166_IS +India,IN,ISO-3166_IN +Indonesia,ID,ISO-3166_ID +"Iran, Islamic Republic of",IR,ISO-3166_IR +Iraq,IQ,ISO-3166_IQ +Ireland,IE,ISO-3166_IE +Isle of Man,IM,ISO-3166_IM +Israel,IL,ISO-3166_IL +Italy,IT,ISO-3166_IT +Jamaica,JM,ISO-3166_JM +Japan,JP,ISO-3166_JP +Jersey,JE,ISO-3166_JE +Jordan,JO,ISO-3166_JO +Kazakhstan,KZ,ISO-3166_KZ +Kenya,KE,ISO-3166_KE +Kiribati,KI,ISO-3166_KI +"Korea, Democratic People's Republic of",KP,ISO-3166_KP +"Korea, Republic of",KR,ISO-3166_KR +Kuwait,KW,ISO-3166_KW +Kyrgyzstan,KG,ISO-3166_KG +Lao People's Democratic Republic,LA,ISO-3166_LA +Latvia,LV,ISO-3166_LV +Lebanon,LB,ISO-3166_LB +Lesotho,LS,ISO-3166_LS +Liberia,LR,ISO-3166_LR +Libya,LY,ISO-3166_LY +Liechtenstein,LI,ISO-3166_LI +Lithuania,LT,ISO-3166_LT +Luxembourg,LU,ISO-3166_LU +Macao,MO,ISO-3166_MO +"Macedonia, the Former Yugoslav Republic of",MK,ISO-3166_MK +Madagascar,MG,ISO-3166_MG +Malawi,MW,ISO-3166_MW +Malaysia,MY,ISO-3166_MY +Maldives,MV,ISO-3166_MV +Mali,ML,ISO-3166_ML +Malta,MT,ISO-3166_MT +Marshall Islands,MH,ISO-3166_MH +Martinique,MQ,ISO-3166_MQ +Mauritania,MR,ISO-3166_MR +Mauritius,MU,ISO-3166_MU +Mayotte,YT,ISO-3166_YT +Mexico,MX,ISO-3166_MX +"Micronesia, Federated States of",FM,ISO-3166_FM +"Moldova, Republic of",MD,ISO-3166_MD +Monaco,MC,ISO-3166_MC +Mongolia,MN,ISO-3166_MN +Montenegro,ME,ISO-3166_ME +Montserrat,MS,ISO-3166_MS 
+Morocco,MA,ISO-3166_MA +Mozambique,MZ,ISO-3166_MZ +Myanmar,MM,ISO-3166_MM +Namibia,NA,ISO-3166_NA +Nauru,NR,ISO-3166_NR +Nepal,NP,ISO-3166_NP +Netherlands,NL,ISO-3166_NL +New Caledonia,NC,ISO-3166_NC +New Zealand,NZ,ISO-3166_NZ +Nicaragua,NI,ISO-3166_NI +Niger,NE,ISO-3166_NE +Nigeria,NG,ISO-3166_NG +Niue,NU,ISO-3166_NU +Norfolk Island,NF,ISO-3166_NF +Northern Mariana Islands,MP,ISO-3166_MP +Norway,NO,ISO-3166_NO +Oman,OM,ISO-3166_OM +Pakistan,PK,ISO-3166_PK +Palau,PW,ISO-3166_PW +"Palestine, State of",PS,ISO-3166_PS +Panama,PA,ISO-3166_PA +Papua New Guinea,PG,ISO-3166_PG +Paraguay,PY,ISO-3166_PY +Peru,PE,ISO-3166_PE +Philippines,PH,ISO-3166_PH +Pitcairn,PN,ISO-3166_PN +Poland,PL,ISO-3166_PL +Portugal,PT,ISO-3166_PT +Puerto Rico,PR,ISO-3166_PR +Qatar,QA,ISO-3166_QA +Réunion,RE,ISO-3166_RE +Romania,RO,ISO-3166_RO +Russian Federation,RU,ISO-3166_RU +Rwanda,RW,ISO-3166_RW +Saint Barthélemy,BL,ISO-3166_BL +"Saint Helena, Ascension and Tristan da Cunha",SH,ISO-3166_SH +Saint Kitts and Nevis,KN,ISO-3166_KN +Saint Lucia,LC,ISO-3166_LC +Saint Martin (French part),MF,ISO-3166_MF +Saint Pierre and Miquelon,PM,ISO-3166_PM +Saint Vincent and the Grenadines,VC,ISO-3166_VC +Samoa,WS,ISO-3166_WS +San Marino,SM,ISO-3166_SM +Sao Tome and Principe,ST,ISO-3166_ST +Saudi Arabia,SA,ISO-3166_SA +Senegal,SN,ISO-3166_SN +Serbia,RS,ISO-3166_RS +Seychelles,SC,ISO-3166_SC +Sierra Leone,SL,ISO-3166_SL +Singapore,SG,ISO-3166_SG +Sint Maarten (Dutch part),SX,ISO-3166_SX +Slovakia,SK,ISO-3166_SK +Slovenia,SI,ISO-3166_SI +Solomon Islands,SB,ISO-3166_SB +Somalia,SO,ISO-3166_SO +South Africa,ZA,ISO-3166_ZA +South Georgia and the South Sandwich Islands,GS,ISO-3166_GS +South Sudan,SS,ISO-3166_SS +Spain,ES,ISO-3166_ES +Sri Lanka,LK,ISO-3166_LK +Sudan,SD,ISO-3166_SD +Suriname,SR,ISO-3166_SR +Svalbard and Jan Mayen,SJ,ISO-3166_SJ +Swaziland,SZ,ISO-3166_SZ +Sweden,SE,ISO-3166_SE +Switzerland,CH,ISO-3166_CH +Syrian Arab Republic,SY,ISO-3166_SY +"Taiwan, Province of China",TW,ISO-3166_TW 
+Tajikistan,TJ,ISO-3166_TJ +"Tanzania, United Republic of",TZ,ISO-3166_TZ +Thailand,TH,ISO-3166_TH +Timor-Leste,TL,ISO-3166_TL +Togo,TG,ISO-3166_TG +Tokelau,TK,ISO-3166_TK +Tonga,TO,ISO-3166_TO +Trinidad and Tobago,TT,ISO-3166_TT +Tunisia,TN,ISO-3166_TN +Turkey,TR,ISO-3166_TR +Turkmenistan,TM,ISO-3166_TM +Turks and Caicos Islands,TC,ISO-3166_TC +Tuvalu,TV,ISO-3166_TV +Uganda,UG,ISO-3166_UG +Ukraine,UA,ISO-3166_UA +United Arab Emirates,AE,ISO-3166_AE +United Kingdom,GB,ISO-3166_GB +United States,US,ISO-3166_US +United States Minor Outlying Islands,UM,ISO-3166_UM +Uruguay,UY,ISO-3166_UY +Uzbekistan,UZ,ISO-3166_UZ +Vanuatu,VU,ISO-3166_VU +"Venezuela, Bolivarian Republic of",VE,ISO-3166_VE +Viet Nam,VN,ISO-3166_VN +"Virgin Islands, British",VG,ISO-3166_VG +"Virgin Islands, U.S.",VI,ISO-3166_VI +Wallis and Futuna,WF,ISO-3166_WF +Western Sahara,EH,ISO-3166_EH +Yemen,YE,ISO-3166_YE +Zambia,ZM,ISO-3166_ZM +Zimbabwe,ZW,ISO-3166_ZW diff --git a/tagpack/db/tagstore_schema.sql b/tagpack/db/tagstore_schema.sql index edadecd..12d69c4 100644 --- a/tagpack/db/tagstore_schema.sql +++ b/tagpack/db/tagstore_schema.sql @@ -80,6 +80,39 @@ CREATE INDEX tag_label_index ON tag (label); CREATE INDEX tag_address_index ON tag (address); CREATE INDEX tag_is_cluster_definer_index ON tag (is_cluster_definer); +-- Actor and ActorPack tables + +CREATE TABLE actorpack ( + id VARCHAR PRIMARY KEY, + title VARCHAR NOT NULL, + creator VARCHAR NOT NULL, + description VARCHAR NOT NULL, + is_public BOOLEAN DEFAULT FALSE, + uri VARCHAR , + lastmod TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP +); + +CREATE TABLE actor ( + id VARCHAR PRIMARY KEY, + uri VARCHAR , + label VARCHAR NOT NULL, + lastmod TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, + actorpack VARCHAR REFERENCES actorpack(id) ON DELETE CASCADE, + CONSTRAINT unique_actor UNIQUE (id) +); + +CREATE TABLE actor_categories ( + id SERIAL PRIMARY KEY, + actor_id VARCHAR REFERENCES actor(id) ON DELETE CASCADE, + category_id VARCHAR REFERENCES concept(id) 
ON DELETE CASCADE +); + +CREATE TABLE actor_jurisdictions ( + id SERIAL PRIMARY KEY, + actor_id VARCHAR REFERENCES actor(id) ON DELETE CASCADE, + country_id VARCHAR REFERENCES concept(id) ON DELETE CASCADE +); + -- GraphSense mapping table CREATE TABLE address_cluster_mapping ( diff --git a/tagpack/tagpack.py b/tagpack/tagpack.py index 969fb4f..816106f 100644 --- a/tagpack/tagpack.py +++ b/tagpack/tagpack.py @@ -9,7 +9,7 @@ import pathlib import coinaddrvalidator from collections import defaultdict -from tagpack import TagPackFileError, ValidationError +from tagpack import TagPackFileError, ValidationError, UniqueKeyLoader from yamlinclude import YamlIncludeConstructor from tagpack.cmd_utils import print_info, print_warn @@ -134,18 +134,6 @@ def collect_tagpack_files(path): return tagpack_files -# https://gist.github.com/pypt/94d747fe5180851196eb -class UniqueKeyLoader(yaml.FullLoader): - def construct_mapping(self, node, deep=False): - mapping = set() - for key_node, value_node in node.value: - key = self.construct_object(key_node, deep=deep) - if key in mapping: - raise ValidationError(f"Duplicate {key!r} key found in YAML.") - mapping.add(key) - return super().construct_mapping(node, deep) - - class TagPack(object): """Represents a TagPack""" diff --git a/tagpack/tagstore.py b/tagpack/tagstore.py index c0bd4e0..67a0fcf 100644 --- a/tagpack/tagstore.py +++ b/tagpack/tagstore.py @@ -19,6 +19,7 @@ def __init__(self, url, schema): self.cursor.execute("SELECT unnest(enum_range(NULL::currency))") self.supported_currencies = [i[0] for i in self.cursor.fetchall()] self.existing_packs = None + self.existing_actorpacks = None def insert_taxonomy(self, taxonomy): if taxonomy.key == "confidence": @@ -109,6 +110,74 @@ def insert_tagpack( self.conn.commit() + def actorpack_exists(self, prefix, actorpack_name): + if not self.existing_actorpacks: + self.existing_actorpacks = self.get_ingested_actorpacks() + actorpack_id = self.create_actorpack_id(prefix, actorpack_name) + 
return actorpack_id in self.existing_actorpacks + + def create_actorpack_id(self, prefix, actorpack_name): + return ":".join([prefix, actorpack_name]) if prefix else actorpack_name + + def get_ingested_actorpacks(self) -> list: + self.cursor.execute("SELECT id from actorpack") + return [i[0] for i in self.cursor.fetchall()] + + def insert_actorpack(self, actorpack, is_public, force_insert, prefix, + rel_path, batch=1000): + print(f"Inserting: {rel_path}:{actorpack}") + actorpack_id = self.create_actorpack_id(prefix, rel_path) + h = _get_actor_header(actorpack, actorpack_id) + + if force_insert: + print(f"Evicting and re-inserting actorpack {actorpack_id}") + q = "DELETE FROM actorpack WHERE id = (%s)" + self.cursor.execute(q, (actorpack_id,)) + + q = "INSERT INTO actorpack \ + (id, title, creator, description, is_public, uri) \ + VALUES (%s,%s,%s,%s,%s,%s)" + v = ( + h.get("id"), + h.get("title"), + h.get("creator"), + h.get("description"), + is_public, + actorpack.uri, + ) + self.cursor.execute(q, v) + self.conn.commit() + + actor_sql = "INSERT INTO actor (id, label, uri, lastmod, actorpack) \ + VALUES (%s, %s, %s, %s, %s)" + act_cat_sql = "INSERT INTO actor_categories (actor_id, category_id) \ + VALUES (%s, %s)" + act_jur_sql = "INSERT INTO actor_jurisdictions (actor_id, country_id) \ + VALUES (%s, %s)" + + actor_data = [] + cat_data = [] + jur_data = [] + for actor in actorpack.get_unique_actors(): + actor_data.append(_get_actor(actor, actorpack_id)) + cat_data.extend(_get_actor_categories(actor)) + jur_data.extend(_get_actor_jurisdictions(actor)) + if len(actor_data) > batch: + execute_batch(self.cursor, actor_sql, actor_data) + execute_batch(self.cursor, act_cat_sql, cat_data) + execute_batch(self.cursor, act_jur_sql, jur_data) + + actor_data = [] + cat_data = [] + jur_data = [] + + # insert remaining items + execute_batch(self.cursor, actor_sql, actor_data) + execute_batch(self.cursor, act_cat_sql, cat_data) + execute_batch(self.cursor, act_jur_sql, jur_data) 
+ + self.conn.commit() + def low_quality_address_labels(self, th=0.25, currency='') -> dict: ''' This function returns a list of addresses having a quality meassure @@ -307,3 +376,36 @@ def _get_header(tagpack, tid): "creator": tc["creator"], "description": tc.get("description", "not provided"), } + +def _get_actor_header(actorpack, id): + ac = actorpack.contents + return { + "id": id, + "title": ac["title"], + "creator": ac["creator"], + "description": ac.get("description", "not provided"), + } + +def _get_actor(actor, actorpack_id): + return ( + actor.all_fields.get("id"), + actor.all_fields.get("label").strip(), + actor.all_fields.get("uri", None).strip(), + actor.all_fields.get("lastmod", datetime.now().isoformat()), + actorpack_id, + ) + +def _get_actor_categories(actor): + data = [] + actor_id = actor.all_fields.get("id") + for category in actor.all_fields.get("categories"): + data.append((actor_id, category)) + return data + +def _get_actor_jurisdictions(actor): + data = [] + actor_id = actor.all_fields.get("id") + for country in actor.all_fields.get("jurisdictions"): + data.append((actor_id, country)) + return data + diff --git a/tagpack/taxonomy.py b/tagpack/taxonomy.py index 2cd4ae9..e32c5eb 100644 --- a/tagpack/taxonomy.py +++ b/tagpack/taxonomy.py @@ -78,11 +78,14 @@ def load_from_remote(self): def load_from_local(self): with open(self.uri, "r") as f: csv_reader = csv.DictReader(f, delimiter=",") + uri = self.uri for row in csv_reader: + id = row["id"] + label = row["label"] if "label" in row else None level = row["level"] if "level" in row else None - concept = Concept( - self, row["id"], self.uri, row["label"], level, row["description"] - ) + desc = row["description"] if "description" in row else '' + + concept = Concept(self, id, uri, label, level, desc) self.concepts.append(concept) @property diff --git a/tests/test_actorpack_schema.py b/tests/test_actorpack_schema.py new file mode 100644 index 0000000..54c13ce --- /dev/null +++ 
b/tests/test_actorpack_schema.py @@ -0,0 +1,141 @@ +from datetime import date +import pytest + +from tagpack.tagpack_schema import ValidationError +from tagpack.actorpack_schema import ActorPackSchema +from tagpack.taxonomy import Taxonomy + + +field_types = { + "title": "text", + "creator": "text", + "description": "text", + "is_public": "boolean", + "actors": "list", + "id": "text", + "uri": "text", + "label": "text", + "lastmod": "datetime", + "categories": "list", + "jurisdictions": "list" +} + +field_values = { + "title": "some text string", + "creator": "some text string", + "description": "some text string", + "is_public": True, + "actors": [1, 2, 3], + "id": "some text string", + "uri": "some text string", + "label": "some text string", + "lastmod": date.fromisoformat("2022-01-01"), + "categories": [1, 2, 3], + "jurisdictions": [1, 2, 3] +} + + +@pytest.fixture +def schema(monkeypatch): + actorpack_schema = ActorPackSchema() + + return actorpack_schema + + +@pytest.fixture +def taxonomies(): + tax_entity = Taxonomy("entity", "http://example.com/entity") + tax_entity.add_concept("exchange", "Exchange", None, "Some description") + + tax_country = Taxonomy("country", "http://example.com/country") + tax_country.add_concept("MX", "Mexico", None, None) + + taxonomies = {"entity": tax_entity, "country": tax_country} + return taxonomies + + +def test_init(schema): + assert isinstance(schema, ActorPackSchema) + assert schema.definition == "actorpack_schema.yaml" + + +def test_header_fields(schema): + assert isinstance(schema.header_fields, dict) + fields = {"title", "creator", "description", "is_public", "actors"} + assert fields - set(schema.header_fields) == set() + for field in fields: + assert field in schema.header_fields + assert "type" in schema.header_fields[field] + assert "mandatory" in schema.header_fields[field] + + +def test_mandatory_header_fields(schema): + assert isinstance(schema.mandatory_header_fields, dict) + fields = ["title", "creator", 
"actors"] + for field in fields: + assert field in schema.mandatory_header_fields + assert schema.header_fields[field]["mandatory"] is True + + +def test_actor_fields(schema): + assert isinstance(schema.actor_fields, dict) + fields = {"id", "uri", "label", "lastmod", "categories", "jurisdictions"} + assert fields - set(schema.actor_fields) == set() + for field in fields: + assert field in schema.actor_fields + assert "type" in schema.actor_fields[field] + assert "mandatory" in schema.actor_fields[field] + + +def test_mandatory_actor_fields(schema): + assert isinstance(schema.mandatory_actor_fields, dict) + fields = ["id", "uri", "label", "lastmod", "categories"] + for field in fields: + assert field in schema.mandatory_actor_fields + assert schema.actor_fields[field]["mandatory"] is True + + +def test_field_type(schema): + for field, ftype in field_types.items(): + assert schema.field_type(field) == ftype + + +def test_field_taxonomy(schema): + assert schema.field_taxonomy("categories") == "entity" + assert schema.field_taxonomy("jurisdictions") == "country" + + +def test_field_no_taxonomy(schema): + assert schema.field_taxonomy("title") is None + + +def test_check_type(schema): + for field, value in field_values.items(): + assert schema.check_type(field, value) + with (pytest.raises(ValidationError)) as e: + assert schema.check_type(field, 5) + msg = f"Field {field} must be of type {field_types[field]}" + assert msg in str(e.value) + + +def test_check_taxonomies(schema, taxonomies): + schema.schema["actor"]["test"] = {"taxonomy": "nonexistent"} + with (pytest.raises(ValidationError)) as e: + assert schema.check_taxonomies("test", "invalid", None) + assert "No taxonomies loaded" in str(e.value) + + schema.schema["actor"]["invalidtax"] = {"taxonomy": "nonexistent"} + with (pytest.raises(ValidationError)) as e: + assert schema.check_taxonomies("invalidtax", "value", taxonomies) + assert "Unknown taxonomy nonexistent" in str(e.value) + + assert 
schema.check_taxonomies("categories", "exchange", taxonomies) + with (pytest.raises(ValidationError)) as e: + assert schema.check_taxonomies("categories", "test", taxonomies) + assert "Undefined concept test for categories field" in str(e.value) + + assert schema.check_taxonomies("jurisdictions", "MX", taxonomies) + with (pytest.raises(ValidationError)) as e: + assert schema.check_taxonomies("jurisdictions", "test", taxonomies) + assert "Undefined concept test for jurisdictions field" in str(e.value) + From 048c69381bdbc124a7a35c58ce0ba6f2d296082b Mon Sep 17 00:00:00 2001 From: Gibran Gomez Date: Mon, 5 Dec 2022 15:58:37 +0100 Subject: [PATCH 3/7] Some fixes for the actorpack feature --- bin/tagpack-tool | 2 +- tagpack/tagstore.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/bin/tagpack-tool b/bin/tagpack-tool index e25c3b5..cde90a6 100755 --- a/bin/tagpack-tool +++ b/bin/tagpack-tool @@ -831,7 +831,7 @@ def main(): pxp_s.add_argument( "taxonomy", metavar="TAXONOMY_KEY", - choices=["abuse", "entity", "confidence"], + choices=["abuse", "entity", "confidence", "country"], help="the selected taxonomy", ) pxp_s.add_argument("-v", "--verbose", action="store_true", help="verbose concepts") diff --git a/tagpack/tagstore.py b/tagpack/tagstore.py index 67a0fcf..07a83b8 100644 --- a/tagpack/tagstore.py +++ b/tagpack/tagstore.py @@ -125,7 +125,6 @@ def get_ingested_actorpacks(self) -> list: def insert_actorpack(self, actorpack, is_public, force_insert, prefix, rel_path, batch=1000): - print(f"Inserting: {rel_path}:{actorpack}") actorpack_id = self.create_actorpack_id(prefix, rel_path) h = _get_actor_header(actorpack, actorpack_id) From 14cdf9ac42be3c37a4acd473497b52bfafe1f8c3 Mon Sep 17 00:00:00 2001 From: Gibran Gomez Date: Tue, 6 Dec 2022 13:47:04 +0100 Subject: [PATCH 4/7] Include country among the taxonomy insert choices --- bin/tagpack-tool | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/tagpack-tool b/bin/tagpack-tool index 
cde90a6..0ed8ad5 100755 --- a/bin/tagpack-tool +++ b/bin/tagpack-tool @@ -843,7 +843,7 @@ def main(): "taxonomy", metavar="TAXONOMY_KEY", nargs="?", - choices=["abuse", "entity", "confidence"], + choices=["abuse", "entity", "confidence", "country"], default=None, help="the selected taxonomy", ) From 46b6b001cd453d7bf77a860d8389bda39a6f75a5 Mon Sep 17 00:00:00 2001 From: Gibran Gomez Date: Tue, 6 Dec 2022 14:47:32 +0100 Subject: [PATCH 5/7] Added ActorPack section to the README.md file, along with some other fixes --- README.md | 46 +++++++++++++++++++++++++++++++++------------- 1 file changed, 33 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index ae6d198..508b0d8 100644 --- a/README.md +++ b/README.md @@ -5,9 +5,10 @@ This repository provides a command line tool for managing [GraphSense TagPacks](https://github.com/graphsense/graphsense-tagpacks/wiki/GraphSense-TagPacks). It can be used for 1. [validating TagPacks against the TagPack schema](#validation) -2. [handling taxonomies and concepts](#taxonomies) -3. [ingesting TagPacks and related data into a TagStore](#tagstore) -4. [calculating the quality of the tags in the TagStore](#quality) +2. [validating ActorPacks against the ActorPack schema](#actorpack_validation) +3. [handling taxonomies and concepts](#taxonomies) +4. [ingesting TagPacks and related data into a TagStore](#tagstore) +5. [calculating the quality of the tags in the TagStore](#quality) Please note that the last feature requires (installation of) a [Postgresql](https://www.postgresql.org/) database. @@ -19,8 +20,7 @@ Please note that the last feature requires (installation of) a [Postgresql](http Validate a single TagPack file - tagpack-tool tagpack validate tests/testfiles/ex_addr_tagpack.yaml - tagpack-tool tagpack validate tests/testfiles/ex_entity_tagpack.yaml + tagpack-tool tagpack validate tests/testfiles/simple/ex_addr_tagpack.yaml Recursively validate all TagPacks in (a) given folder(s). 
@@ -28,7 +28,21 @@ Recursively validate all TagPacks in (a) given folder(s). Tagpacks are validated against the [tagpack schema](tagpack/conf/tagpack_schema.yaml). -Confidence settings are validated against a set of acceptable [confidence](tagpack/conf/confidence.csv) values. +Confidence settings are validated against a set of acceptable [confidence](tagpack/db/confidence.csv) values. + +## Validate an ActorPack + +Validate a single ActorPack file + + tagpack-tool actorpack validate tests/testfiles/actors/ex_actorpack.yaml + +Recursively validate all ActorPacks in (a) given folder(s). + + tagpack-tool actorpack validate tests/testfiles/actors/ + +Actorpacks are validated against the [actorpack schema](tagpack/conf/actorpack_schema.yaml). + +Values in the field jurisdictions are validated against a set of [country codes](tagpack/db/countries.csv). ## View available taxonomies and concepts @@ -36,7 +50,7 @@ List configured taxonomy keys and URIs tagpack-tool taxonomy list -Fetch and show concepts of a specific remote taxonomy (referenced by key) +Fetch and show concepts of a specific remote/local taxonomy (referenced by key: abuse, entity, confidence, country) tagpack-tool taxonomy show entity @@ -97,16 +111,12 @@ To use a specific config file pass the file's location: tagpack-tool --config path/to/config.yaml config - - - ### Initialize the tagstore database To initialize the database with all the taxonomies needed for ingesting the tagpacks, use: tagpack-tool tagstore init - ### Ingest taxonomies and confidence scores To insert individual taxonomies into database, use: @@ -114,6 +124,7 @@ To insert individual taxonomies into database, use: tagpack-tool taxonomy insert abuse tagpack-tool taxonomy insert entity tagpack-tool taxonomy insert confidence + tagpack-tool taxonomy insert country To insert all configured taxonomies at once, simply omit taxonomy name @@ -145,13 +156,22 @@ To ingest **new** tagpacks and **skip** over already ingested tagpacks, add the By
default, trying to insert tagpacks from a repository with **local** modifications will **fail**. To force insertion despite local modifications, add the ``--no_strict_check`` command-line parameter - tagpack-tool tagpack insert --force --add_new tests/testfiles/ + tagpack-tool tagpack insert --no_strict_check tests/testfiles/ By default, tagpacks in the TagStore provide a backlink to the original tagpack file in their remote git repository ([see here](README_tagpacks.md#versioning-with-git)). -To instead write local file paths instead, add the ``--no_git`` command-line parameter +To write local file paths instead, add the ``--no_git`` command-line parameter tagpack-tool tagpack insert --no_git --add_new tests/testfiles/ +### Ingest ActorPacks + +Insert a single ActorPack file or all ActorPacks from a given folder: + + tagpack-tool actorpack insert tests/testfiles/simple/ex_addr_actorpack.yaml + tagpack-tool actorpack insert tests/testfiles/ + +You can use the `--force`, `--add_new`, `--no_strict_check` and `--no_git` options in the same way as with the `tagpack` command.
+ ### Align ingested attribution tags with GraphSense cluster Ids The final step after inserting a tagpack is to fetch the corresponding From 600e4312ef519ae6fee1bd739b2486dc4002c731 Mon Sep 17 00:00:00 2001 From: Gibran Gomez Date: Tue, 6 Dec 2022 14:49:07 +0100 Subject: [PATCH 6/7] Added actorpack test file --- tests/testfiles/actors/ex_actorpack.yaml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 tests/testfiles/actors/ex_actorpack.yaml diff --git a/tests/testfiles/actors/ex_actorpack.yaml b/tests/testfiles/actors/ex_actorpack.yaml new file mode 100644 index 0000000..58db21d --- /dev/null +++ b/tests/testfiles/actors/ex_actorpack.yaml @@ -0,0 +1,15 @@ +title: Test ActorPack +creator: GraphSense Core Team +description: A collection of actors commonly used for demonstrating GraphSense features +lastmod: 2022-11-29 +actors: +- id: internet_archive + uri: https://archive.org + label: Internet Archive + jurisdictions: [US] + categories: [organization] +- id: binance + uri: https://binance.com + label: Binance + jurisdictions: [US, AT] + categories: [exchange] From e33a3c5b39ae3a2ee8fe7b615a7df33750cdd74e Mon Sep 17 00:00:00 2001 From: Michael F Date: Wed, 7 Dec 2022 13:23:59 +0100 Subject: [PATCH 7/7] update readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0b84621..95c346c 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ Recursively validate all TagPacks in (a) given folder(s). Actorpacks are validated against the [actorpack schema](tagpack/conf/actorpack_schema.yaml). -Values in the field jurisdictions are validated against a set of [country codes](tagpack/db/countries.csv). +Values in the field jurisdictions are validated against a set of [country codes](src/tagpack/db/countries.csv). ## View available taxonomies and concepts