diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..568d227
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,5 @@
+include tagpack/db/*.cql
+include tagpack/db/*.csv
+include tagpack/conf/tagpack_schema.yaml
+include tagpack/conf/actorpack_schema.yaml
+include tagpack/conf/confidence.csv
diff --git a/README.md b/README.md
index f349f36..95c346c 100644
--- a/README.md
+++ b/README.md
@@ -5,9 +5,10 @@
This repository provides a command line tool for managing [GraphSense TagPacks](https://github.com/graphsense/graphsense-tagpacks/wiki/GraphSense-TagPacks). It can be used for
1. [validating TagPacks against the TagPack schema](#validation)
-2. [handling taxonomies and concepts](#taxonomies)
-3. [ingesting TagPacks and related data into a TagStore](#tagstore)
-4. [calculating the quality of the tags in the TagStore](#quality)
+2. [validating ActorPacks against the ActorPack schema](#actorpack_validation)
+3. [handling taxonomies and concepts](#taxonomies)
+4. [ingesting TagPacks and related data into a TagStore](#tagstore)
+5. [calculating the quality of the tags in the TagStore](#quality)
Please note that the last feature requires (installation of) a [Postgresql](https://www.postgresql.org/) database.
@@ -19,8 +20,7 @@ Please note that the last feature requires (installation of) a [Postgresql](http
Validate a single TagPack file
- tagpack-tool tagpack validate tests/testfiles/ex_addr_tagpack.yaml
- tagpack-tool tagpack validate tests/testfiles/ex_entity_tagpack.yaml
+ tagpack-tool tagpack validate tests/testfiles/simple/ex_addr_tagpack.yaml
Recursively validate all TagPacks in (a) given folder(s).
@@ -28,7 +28,21 @@ Recursively validate all TagPacks in (a) given folder(s).
Tagpacks are validated against the [tagpack schema](tagpack/conf/tagpack_schema.yaml).
-Confidence settings are validated against a set of acceptable [confidence](tagpack/conf/confidence.csv) values.
+Confidence settings are validated against a set of acceptable [confidence](tagpack/db/confidence.csv) values.
+
+## Validate an ActorPack
+
+Validate a single ActorPack file
+
+ tagpack-tool actorpack validate tests/testfiles/actors/ex_actorpack.yaml
+
+Recursively validate all ActorPacks in (a) given folder(s).
+
+ tagpack-tool actorpack validate tests/testfiles/actors/
+
+Actorpacks are validated against the [actorpack schema](tagpack/conf/actorpack_schema.yaml).
+
+Values in the field jurisdictions are validated against a set of [country codes](tagpack/db/countries.csv).
## View available taxonomies and concepts
@@ -36,7 +50,7 @@ List configured taxonomy keys and URIs
tagpack-tool taxonomy list
-Fetch and show concepts of a specific remote taxonomy (referenced by key)
+Fetch and show concepts of a specific remote/local taxonomy (referenced by key: abuse, entity, confidence, country)
tagpack-tool taxonomy show entity
@@ -97,16 +111,12 @@ To use a specific config file pass the file's location:
tagpack-tool --config path/to/config.yaml config
-
-
-
### Initialize the tagstore database
To initialize the database with all the taxonomies needed for ingesting the tagpacks, use:
tagpack-tool tagstore init
-
### Ingest taxonomies and confidence scores
To insert individual taxonomies into database, use:
@@ -114,6 +124,7 @@ To insert individual taxonomies into database, use:
tagpack-tool taxonomy insert abuse
tagpack-tool taxonomy insert entity
tagpack-tool taxonomy insert confidence
+ tagpack-tool taxonomy insert country
To insert all configured taxonomies at once, simply omit taxonomy name
@@ -145,13 +156,22 @@ To ingest **new** tagpacks and **skip** over already ingested tagpacks, add the
By default, trying to insert tagpacks from a repository with **local** modifications will **fail**.
To force insertion despite local modifications, add the ``--no_strict_check`` command-line parameter
- tagpack-tool tagpack insert --force --add_new tests/testfiles/
+ tagpack-tool tagpack insert --no_strict_check tests/testfiles/
By default, tagpacks in the TagStore provide a backlink to the original tagpack file in their remote git repository ([see here](README_tagpacks.md#versioning-with-git)).
-To instead write local file paths instead, add the ``--no_git`` command-line parameter
+To write local file paths instead, add the ``--no_git`` command-line parameter
tagpack-tool tagpack insert --no_git --add_new tests/testfiles/
+### Ingest ActorPacks
+
+Insert a single ActorPack file or all ActorPacks from a given folder:
+
+    tagpack-tool actorpack insert tests/testfiles/actors/ex_actorpack.yaml
+ tagpack-tool actorpack insert tests/testfiles/
+
+You can use the `--force`, `--add_new`, `--no_strict_check` and `--no_git` options in the same way as with the `tagpack` command.
+
### Align ingested attribution tags with GraphSense cluster Ids
The final step after inserting a tagpack is to fetch the corresponding
diff --git a/src/tagpack/__init__.py b/src/tagpack/__init__.py
index f83f75a..304cdca 100644
--- a/src/tagpack/__init__.py
+++ b/src/tagpack/__init__.py
@@ -2,6 +2,8 @@
import sys
+import yaml
+
if sys.version_info[:2] >= (3, 8):
# TODO: Import directly (no need for conditional) when `python_requires = >= 3.8`
from importlib.metadata import PackageNotFoundError, version # pragma: no cover
@@ -48,3 +50,15 @@ def __str__(self):
if self.nested_exception:
msg = msg + "\nError Details: " + str(self.nested_exception)
return msg
+
+
+# https://gist.github.com/pypt/94d747fe5180851196eb
+class UniqueKeyLoader(yaml.FullLoader):
+ def construct_mapping(self, node, deep=False):
+ mapping = set()
+ for key_node, value_node in node.value:
+ key = self.construct_object(key_node, deep=deep)
+ if key in mapping:
+ raise ValidationError(f"Duplicate {key!r} key found in YAML.")
+ mapping.add(key)
+ return super().construct_mapping(node, deep)
diff --git a/src/tagpack/actorpack.py b/src/tagpack/actorpack.py
new file mode 100644
index 0000000..2ae22e7
--- /dev/null
+++ b/src/tagpack/actorpack.py
@@ -0,0 +1,207 @@
+"""ActorPack - A wrapper for ActorPack files"""
+import json
+import os
+import sys
+
+import yaml
+from yamlinclude import YamlIncludeConstructor
+
+from tagpack import TagPackFileError, UniqueKeyLoader, ValidationError
+from tagpack.cmd_utils import print_info
+
+
+class ActorPack(object):
+ """Represents an ActorPack"""
+
+ def __init__(self, uri, contents, schema, taxonomies):
+ self.uri = uri
+ self.contents = contents
+ self.schema = schema
+ self.taxonomies = taxonomies
+ self._unique_actors = []
+ self._duplicates = []
+
+ def load_from_file(uri, pathname, schema, taxonomies, header_dir=None):
+ YamlIncludeConstructor.add_to_loader_class(
+ loader_class=yaml.FullLoader, base_dir=header_dir
+ )
+
+ if not os.path.isfile(pathname):
+ sys.exit("This program requires {} to be a file".format(pathname))
+ contents = yaml.load(open(pathname, "r"), UniqueKeyLoader)
+
+ if "header" in contents.keys():
+ for k, v in contents["header"].items():
+ contents[k] = v
+ contents.pop("header")
+ return ActorPack(uri, contents, schema, taxonomies)
+
+ @property
+ def all_header_fields(self):
+ """Returns all ActorPack header fields, including generic actor fields"""
+ try:
+ return {k: v for k, v in self.contents.items()}
+ except AttributeError:
+ raise TagPackFileError("Cannot extract ActorPack fields")
+
+ @property
+ def header_fields(self):
+ """Returns only ActorPack header fields that are defined as such"""
+ try:
+ return {
+ k: v for k, v in self.contents.items() if k in self.schema.header_fields
+ }
+ except AttributeError:
+ raise TagPackFileError("Cannot extract ActorPack fields")
+
+ @property
+ def actor_fields(self):
+ """Returns actor fields defined in the ActorPack header"""
+ try:
+ return {
+ k: v
+ for k, v in self.contents.items()
+ if k != "actors" and k in self.schema.actor_fields
+ }
+ except AttributeError:
+ raise TagPackFileError("Cannot extract ActorPack fields")
+
+ @property
+ def actors(self):
+ """Returns all actors defined in an ActorPack's body"""
+ try:
+ return [
+ Actor.from_contents(actor, self) for actor in self.contents["actors"]
+ ]
+ except AttributeError:
+ raise TagPackFileError("Cannot extract actors from ActorPack")
+
+ def get_unique_actors(self):
+ if self._unique_actors:
+ return self._unique_actors
+
+ seen = set()
+ duplicates = []
+
+ for actor in self.actors:
+ # check if duplicate entry
+ t = tuple(str(actor.all_fields.get(k)).lower() for k in ["id", "label"])
+ if t in seen:
+ duplicates.append(t)
+ else:
+ seen.add(t)
+ self._unique_actors.append(actor)
+
+ self._duplicates = duplicates
+ return self._unique_actors
+
+ def validate(self):
+ """Validates an ActorPack against its schema and used taxonomies"""
+
+ # check if mandatory header fields are used by an ActorPack
+ for schema_field in self.schema.mandatory_header_fields:
+ if schema_field not in self.header_fields:
+ msg = f"Mandatory header field {schema_field} missing"
+ raise ValidationError(msg)
+
+ # check header fields' types, taxonomy and mandatory use
+ for field, value in self.all_header_fields.items():
+ # check a field is defined
+ if field not in self.schema.all_fields:
+ raise ValidationError(f"Field {field} not allowed in header")
+ # check for None values
+ if value is None:
+ msg = f"Value of header field {field} must not be empty (None)"
+ raise ValidationError(msg)
+
+ self.schema.check_type(field, value)
+ self.schema.check_taxonomies(field, value, self.taxonomies)
+
+ if len(self.actors) < 1:
+ raise ValidationError("No actors found.")
+
+ # iterate over all actors, check types, taxonomy and mandatory use
+ e2 = "Mandatory actor field {} missing in {}"
+ e3 = "Field {} not allowed in {}"
+ e4 = "Value of body field {} must not be empty (None) in {}"
+ for actor in self.get_unique_actors():
+ # check if mandatory actor fields are defined
+ if not isinstance(actor, Actor):
+ raise ValidationError(f"Unknown actor type {type(actor)}")
+
+ for schema_field in self.schema.mandatory_actor_fields:
+ if (
+ schema_field not in actor.explicit_fields
+ and schema_field not in self.actor_fields
+ ):
+ raise ValidationError(e2.format(schema_field, actor))
+
+ for field, value in actor.explicit_fields.items():
+ # check whether field is defined as body field
+ if field not in self.schema.actor_fields:
+ raise ValidationError(e3.format(field, actor))
+
+ # check for None values
+ if value is None:
+ raise ValidationError(e4.format(field, actor))
+
+ # check types and taxomomy use
+ try:
+ self.schema.check_type(field, value)
+ self.schema.check_taxonomies(field, value, self.taxonomies)
+ except ValidationError as e:
+ raise ValidationError(f"{e} in {actor}")
+
+ if self._duplicates:
+ msg = (
+ f"{len(self._duplicates)} duplicate(s) found, starting "
+ f"with {self._duplicates[0]}\n"
+ )
+ print_info(msg)
+ return True
+
+ def to_json(self):
+ """Returns a JSON representation of an ActorPack's header"""
+ actorpack = {}
+ for k, v in self.header_fields.items():
+ if k != "actors":
+ actorpack[k] = v
+ return json.dumps(actorpack, indent=4, sort_keys=True, default=str)
+
+ def __str__(self):
+ """Returns a string serialization of the entire ActorPack"""
+ return str(self.contents)
+
+
+class Actor(object):
+ """An actor"""
+
+ def __init__(self, contents, actorpack):
+ self.contents = contents
+ self.actorpack = actorpack
+
+ @staticmethod
+ def from_contents(contents, actorpack):
+ return Actor(contents, actorpack)
+
+ @property
+ def explicit_fields(self):
+ """Return only explicitly defined actor fields"""
+ return {k: v for k, v in self.contents.items()}
+
+ @property
+ def all_fields(self):
+ """Return all actor fields (explicit and generic)"""
+ return {
+ **self.actorpack.actor_fields,
+ **self.explicit_fields,
+ }
+
+ def to_json(self):
+ """Returns a JSON serialization of all actor fields"""
+ actor = self.all_fields
+ return json.dumps(actor, indent=4, sort_keys=True, default=str)
+
+ def __str__(self):
+ """Returns a string serialization of an Actor"""
+ return str(self.all_fields)
diff --git a/src/tagpack/actorpack_schema.py b/src/tagpack/actorpack_schema.py
new file mode 100644
index 0000000..1de1cd1
--- /dev/null
+++ b/src/tagpack/actorpack_schema.py
@@ -0,0 +1,96 @@
+"""ActorPackSchema - A wrapper for the ActorPack schema"""
+import datetime
+import importlib.resources as pkg_resources
+
+import pandas as pd
+import yaml
+
+from tagpack import ValidationError
+
+from . import conf, db
+
+ACTORPACK_SCHEMA_FILE = "actorpack_schema.yaml"
+COUNTRIES_FILE = "countries.csv"
+
+
+class ActorPackSchema(object):
+ """Defines the structure of an ActorPack and supports validation"""
+
+ def __init__(self):
+ schema = pkg_resources.read_text(conf, ACTORPACK_SCHEMA_FILE)
+ self.schema = yaml.safe_load(schema)
+ countries = pkg_resources.open_text(db, COUNTRIES_FILE)
+ self.countries = pd.read_csv(countries, index_col="id")
+ self.definition = ACTORPACK_SCHEMA_FILE
+
+ @property
+ def header_fields(self):
+ return {k: v for k, v in self.schema["header"].items()}
+
+ @property
+ def mandatory_header_fields(self):
+ return {k: v for k, v in self.schema["header"].items() if v["mandatory"]}
+
+ @property
+ def actor_fields(self):
+ return {k: v for k, v in self.schema["actor"].items()}
+
+ @property
+ def mandatory_actor_fields(self):
+ return {k: v for k, v in self.actor_fields.items() if v["mandatory"]}
+
+ @property
+ def all_fields(self):
+ """Returns all header and body fields"""
+ return {**self.header_fields, **self.actor_fields}
+
+ def field_type(self, field):
+ return self.all_fields[field]["type"]
+
+ def field_taxonomy(self, field):
+ try:
+ return self.all_fields[field].get("taxonomy")
+ except KeyError:
+ return None
+
+ def check_type(self, field, value):
+ """Checks whether a field's type matches the definition"""
+ schema_type = self.field_type(field)
+ if schema_type == "text":
+ if not isinstance(value, str):
+ raise ValidationError("Field {} must be of type text".format(field))
+ if len(value.strip()) == 0:
+ raise ValidationError("Empty value in text field {}".format(field))
+ elif schema_type == "datetime":
+ if not isinstance(value, datetime.date):
+ raise ValidationError(f"Field {field} must be of type datetime")
+ elif schema_type == "boolean":
+ if not isinstance(value, bool):
+ raise ValidationError(f"Field {field} must be of type boolean")
+ elif schema_type == "list":
+ if not isinstance(value, list):
+ raise ValidationError(f"Field {field} must be of type list")
+ else:
+ raise ValidationError("Unsupported schema type {}".format(schema_type))
+ return True
+
+ def check_taxonomies(self, field, value, taxonomies):
+ """Checks whether a field uses values from given taxonomies"""
+ if not self.field_taxonomy(field):
+ # No taxonomy was requested
+ return True
+ elif not taxonomies:
+ raise ValidationError("No taxonomies loaded")
+
+ expected_taxonomy_id = self.field_taxonomy(field)
+ expected_taxonomy = taxonomies.get(expected_taxonomy_id)
+
+ if expected_taxonomy is None:
+ raise ValidationError(f"Unknown taxonomy {expected_taxonomy_id}")
+
+ for v in value if isinstance(value, list) else [value]:
+ if v not in expected_taxonomy.concept_ids:
+ msg = f"Undefined concept {v} for {field} field"
+ raise ValidationError(msg)
+
+ return True
diff --git a/src/tagpack/cli.py b/src/tagpack/cli.py
index b72797b..c7274dc 100644
--- a/src/tagpack/cli.py
+++ b/src/tagpack/cli.py
@@ -13,6 +13,8 @@
from tabulate import tabulate
from tagpack import get_version
+from tagpack.actorpack import ActorPack
+from tagpack.actorpack_schema import ActorPackSchema
from tagpack.cmd_utils import (
print_fail,
print_info,
@@ -43,6 +45,7 @@
"entity": f"{TAXONOMY_URL}/DW-VA-Taxonomy/assets/data/entities.csv",
"abuse": f"{TAXONOMY_URL}/DW-VA-Taxonomy/assets/data/abuses.csv",
"confidence": "src/tagpack/db/confidence.csv",
+ "country": "src/tagpack/db/countries.csv",
}
}
@@ -50,12 +53,18 @@
_DEFAULT_SCHEMA = "tagstore"
+def _solve_remote(taxonomy):
+ # Actually we use local files for the confidence and country taxonomies, but
+ # this may change in the future
+ return not (taxonomy == "confidence" or taxonomy == "country")
+
+
def _load_taxonomies(config):
if "taxonomies" not in config:
return None
taxonomies = {}
for key in config["taxonomies"]:
- remote = not (key == "confidence")
+ remote = _solve_remote(key)
taxonomy = _load_taxonomy(config, key, remote=remote)
taxonomies[key] = taxonomy
return taxonomies
@@ -96,7 +105,7 @@ def show_taxonomy_concepts(args, remote=False):
return
print_line("Showing concepts of taxonomy {}".format(args.taxonomy))
- remote = not (args.taxonomy == "confidence")
+ remote = _solve_remote(args.taxonomy)
uri = config["taxonomies"][args.taxonomy]
print(f"{'Remote' if remote else 'Local'} URI: {uri}\n")
taxonomy = _load_taxonomy(config, args.taxonomy, remote=remote)
@@ -137,7 +146,7 @@ def insert_taxonomy(args, remote=False):
print(f"Taxonomy: {t}")
try:
# TODO this should change when having local taxonomies
- remote = not (t == "confidence")
+ remote = _solve_remote(t)
taxonomy = _load_taxonomy(config, t, remote=remote)
tagstore.insert_taxonomy(taxonomy)
@@ -153,6 +162,48 @@ def insert_taxonomy(args, remote=False):
print_line("Aborted insert", "fail")
+def low_quality_addresses(args):
+ print_line("Addresses with low quality")
+ tagstore = TagStore(args.url, args.schema)
+
+ try:
+ la = tagstore.low_quality_address_labels(args.threshold, args.currency)
+ if la:
+ c = args.currency if args.currency else "all"
+ print(f"List of {c} addresses and labels ({len(la)}):")
+ intersections = []
+ for (currency, address), labels in la.items():
+ print(f"\t{currency}\t{address}\t{labels}")
+
+ if not args.cluster:
+ continue
+
+ # Produce clusters of addresses based on tag intersections
+ seen = set()
+ for i, (e, n) in enumerate(intersections):
+ seen = e.intersection(labels)
+ if seen:
+ e.update(labels)
+ n += 1
+ intersections[i] = (e, n)
+ break
+ if not seen:
+ intersections.append((set(labels), 1))
+
+ if args.cluster:
+ print("\nSets of tags appearing in several addresses:")
+ s_int = sorted(intersections, key=lambda x: x[1], reverse=True)
+ for (k, v) in s_int:
+ if v > 1:
+ print(f"\t{v}: {', '.join(k)}")
+ else:
+ print("\tNone")
+
+ except Exception as e:
+ print_fail(e)
+ print_line("Operation failed", "fail")
+
+
def show_quality_measures(args):
print_line("Show quality measures")
tagstore = TagStore(args.url, args.schema)
@@ -466,6 +517,128 @@ def show_tagstore_composition(args):
print(tabulate(df, headers=headers, tablefmt="psql"))
+def validate_actorpack(args):
+ config = _load_config(args.config)
+
+ t0 = time.time()
+ print_line("ActorPack validation starts")
+ print(f"Path: {args.path}")
+
+ taxonomies = _load_taxonomies(config)
+ taxonomy_keys = taxonomies.keys()
+ print(f"Loaded taxonomies: {taxonomy_keys}")
+
+ schema = ActorPackSchema()
+ print(f"Loaded schema: {schema.definition}")
+
+ actorpack_files = collect_tagpack_files(args.path)
+ n_actorpacks = len([f for fs in actorpack_files.values() for f in fs])
+ print_info(f"Collected {n_actorpacks} ActorPack files\n")
+
+ no_passed = 0
+ try:
+ for headerfile_dir, files in actorpack_files.items():
+ for actorpack_file in files:
+ actorpack = ActorPack.load_from_file(
+ "", actorpack_file, schema, taxonomies, headerfile_dir
+ )
+
+ print(f"{actorpack_file}: ", end="")
+
+ actorpack.validate()
+ print_success("PASSED")
+
+ no_passed += 1
+ except (ValidationError, TagPackFileError) as e:
+ print_fail("FAILED", e)
+
+ status = "fail" if no_passed < n_actorpacks else "success"
+
+ duration = round(time.time() - t0, 2)
+ msg = f"{no_passed}/{n_actorpacks} ActorPacks passed in {duration}s"
+ print_line(msg, status)
+
+
+def insert_actorpacks(args):
+ t0 = time.time()
+ print_line("ActorPack insert starts")
+ print(f"Path: {args.path}")
+
+ if args.no_git:
+ base_url = args.path
+ print_line("No repository detection done.")
+ else:
+ base_url = get_repository(args.path)
+ print_line(f"Detected repository root in {base_url}")
+
+ tagstore = TagStore(args.url, args.schema)
+
+ schema = ActorPackSchema()
+ print_info(f"Loaded ActorPack schema definition: {schema.definition}")
+
+ config = _load_config(args.config)
+ taxonomies = _load_taxonomies(config)
+ taxonomy_keys = taxonomies.keys()
+ print(f"Loaded taxonomies: {taxonomy_keys}")
+
+ actorpack_files = collect_tagpack_files(args.path)
+
+ # resolve backlinks to remote repository and relative paths
+ # For the URI we use the same logic for ActorPacks as for TagPacks
+ scheck, nogit = not args.no_strict_check, args.no_git
+ prepared_packs = [
+ (m, h, n[0], n[1])
+ for m, h, n in [
+ (a, h, get_uri_for_tagpack(base_url, a, scheck, nogit))
+ for h, fs in actorpack_files.items()
+ for a in fs
+ ]
+ ]
+
+ prefix = config.get("prefix", "")
+ if args.add_new: # don't re-insert existing tagpacks
+ print_info("Checking which ActorPacks are new to the tagstore:")
+ prepared_packs = [
+ (t, h, u, r)
+ for (t, h, u, r) in prepared_packs
+ if not tagstore.actorpack_exists(prefix, r)
+ ]
+
+ n_ppacks = len(prepared_packs)
+ print_info(f"Collected {n_ppacks} ActorPack files\n")
+
+ no_passed = 0
+ no_actors = 0
+ public, force = args.public, args.force
+
+ for i, pack in enumerate(sorted(prepared_packs), start=1):
+ actorpack_file, headerfile_dir, uri, relpath = pack
+
+ actorpack = ActorPack.load_from_file(
+ uri, actorpack_file, schema, taxonomies, headerfile_dir
+ )
+
+ print(f"{i} {actorpack_file}: ", end="")
+ try:
+ tagstore.insert_actorpack(actorpack, public, force, prefix, relpath)
+ print_success(f"PROCESSED {len(actorpack.actors)} Actors")
+ no_passed += 1
+ no_actors += len(actorpack.actors)
+ except Exception as e:
+ print_fail("FAILED", e)
+
+ status = "fail" if no_passed < n_ppacks else "success"
+
+ duration = round(time.time() - t0, 2)
+ msg = "Processed {}/{} ActorPacks with {} Actors in {}s."
+ print_line(msg.format(no_passed, n_ppacks, no_actors, duration), status)
+
+
+# msg = "Don't forget to run 'tagstore refresh_views' soon to keep the database"
+# msg += " consistent!"
+# print_info(msg)
+
+
def main():
if sys.version_info < (3, 7):
sys.exit("This program requires python version 3.7 or later")
@@ -573,6 +746,81 @@ def main():
)
ptp_i.set_defaults(func=insert_tagpack, url=def_url)
+ # parsers for actorpack command
+ parser_ap = subparsers.add_parser("actorpack", help="actorpack commands")
+
+ app = parser_ap.add_subparsers(title="ActorPack commands")
+
+ # TODO parser for list command
+ # app_l = app.add_parser("list", help="list ActorPacks")
+
+ # parser for validate command
+ app_v = app.add_parser("validate", help="validate ActorPacks")
+ app_v.add_argument(
+ "path",
+ nargs="?",
+ metavar="PATH",
+ default=os.getcwd(),
+ help="ActorPack file or folder root path (current folder by default)",
+ )
+ app_v.set_defaults(func=validate_actorpack)
+
+ # parser for insert command
+ app_i = app.add_parser("insert", help="insert ActorPacks")
+ app_i.add_argument(
+ "path",
+ nargs="?",
+ metavar="PATH",
+ default=os.getcwd(),
+ help="ActorPacks file or folder root path",
+ )
+ app_i.add_argument(
+ "--schema",
+ default=_DEFAULT_SCHEMA,
+ metavar="DB_SCHEMA",
+ help="PostgreSQL schema for actorpack tables",
+ )
+ app_i.add_argument(
+ "-u", "--url", help="postgresql://user:password@db_host:port/database"
+ )
+ app_i.add_argument(
+ "-b",
+ "--batch_size",
+ nargs="?",
+ type=int,
+ default=1000,
+ help="batch size for insert",
+ )
+ app_i.add_argument(
+ "--public",
+ action="store_true",
+ help="By default, actorpacks are declared private in the database.\
+ Use this switch to declare them public.",
+ )
+ app_i.add_argument(
+ "--force",
+ action="store_true",
+ help="By default, actorpack insertion stops when an already inserted \
+ actorpack exists in the database. Use this switch to force \
+ re-insertion.",
+ )
+ app_i.add_argument(
+ "--add_new",
+ action="store_true",
+ help="By default, actorpack insertion stops when an already inserted \
+ actorpack exists in the database. Use this switch to insert \
+ new actorpacks while skipping over existing ones.",
+ )
+ app_i.add_argument(
+ "--no_strict_check",
+ action="store_true",
+ help="Disables check for local modifications in git repository",
+ )
+ app_i.add_argument(
+ "--no_git", action="store_true", help="Disables check for local git repository"
+ )
+ app_i.set_defaults(func=insert_actorpacks, url=def_url)
+
# parser for taxonomy command
parser_t = subparsers.add_parser("taxonomy", help="taxonomy commands")
parser_t.set_defaults(func=list_taxonomies)
@@ -588,7 +836,7 @@ def main():
pxp_s.add_argument(
"taxonomy",
metavar="TAXONOMY_KEY",
- choices=["abuse", "entity", "confidence"],
+ choices=["abuse", "entity", "confidence", "country"],
help="the selected taxonomy",
)
pxp_s.add_argument("-v", "--verbose", action="store_true", help="verbose concepts")
@@ -600,7 +848,7 @@ def main():
"taxonomy",
metavar="TAXONOMY_KEY",
nargs="?",
- choices=["abuse", "entity", "confidence"],
+ choices=["abuse", "entity", "confidence", "country"],
default=None,
help="the selected taxonomy",
)
@@ -731,6 +979,36 @@ def main():
)
pqp_i.set_defaults(func=calc_quality_measures, url=def_url)
+ # parser for quality measures list
+ pqp_l = pqp.add_parser("list", help="list low quality addresses")
+ pqp_l.add_argument(
+ "--currency",
+ default="",
+ choices=["BCH", "BTC", "ETH", "LTC", "ZEC"],
+ help="Show low quality addresses of a specific crypto-currency",
+ )
+ pqp_l.add_argument(
+ "--threshold",
+ default=0.25,
+ help="List addresses having a quality lower than this threshold",
+ )
+ pqp_l.add_argument(
+ "-c",
+ "--cluster",
+ action="store_true",
+ help="Cluster addresses having intersections of similar tags",
+ )
+ pqp_l.add_argument(
+ "--schema",
+ default=_DEFAULT_SCHEMA,
+ metavar="DB_SCHEMA",
+ help="PostgreSQL schema for quality measures tables",
+ )
+ pqp_l.add_argument(
+ "-u", "--url", help="postgresql://user:password@db_host:port/database"
+ )
+ pqp_l.set_defaults(func=low_quality_addresses, url=def_url)
+
# parser for quality measures show
pqp_s = pqp.add_parser("show", help="show average quality measures")
pqp_s.add_argument(
diff --git a/src/tagpack/conf/actorpack_schema.yaml b/src/tagpack/conf/actorpack_schema.yaml
new file mode 100644
index 0000000..7197453
--- /dev/null
+++ b/src/tagpack/conf/actorpack_schema.yaml
@@ -0,0 +1,37 @@
+header:
+ title:
+ type: text
+ mandatory: true
+ creator:
+ type: text
+ mandatory: true
+ description:
+ type: text
+ mandatory: false
+ is_public:
+ type: boolean
+ mandatory: false
+ actors:
+ type: list
+ mandatory: true
+actor:
+ id:
+ type: text
+ mandatory: true
+ uri:
+ type: text
+ mandatory: true
+ label:
+ type: text
+ mandatory: true
+ lastmod:
+ type: datetime
+ mandatory: true
+ categories:
+ type: list
+ mandatory: true
+ taxonomy: entity
+ jurisdictions:
+ type: list
+ mandatory: false
+ taxonomy: country
diff --git a/src/tagpack/db/countries.csv b/src/tagpack/db/countries.csv
new file mode 100644
index 0000000..add2e7a
--- /dev/null
+++ b/src/tagpack/db/countries.csv
@@ -0,0 +1,250 @@
+label,id,description
+Afghanistan,AF,ISO-3166_AF
+Åland Islands,AX,ISO-3166_AX
+Albania,AL,ISO-3166_AL
+Algeria,DZ,ISO-3166_DZ
+American Samoa,AS,ISO-3166_AS
+Andorra,AD,ISO-3166_AD
+Angola,AO,ISO-3166_AO
+Anguilla,AI,ISO-3166_AI
+Antarctica,AQ,ISO-3166_AQ
+Antigua and Barbuda,AG,ISO-3166_AG
+Argentina,AR,ISO-3166_AR
+Armenia,AM,ISO-3166_AM
+Aruba,AW,ISO-3166_AW
+Australia,AU,ISO-3166_AU
+Austria,AT,ISO-3166_AT
+Azerbaijan,AZ,ISO-3166_AZ
+Bahamas,BS,ISO-3166_BS
+Bahrain,BH,ISO-3166_BH
+Bangladesh,BD,ISO-3166_BD
+Barbados,BB,ISO-3166_BB
+Belarus,BY,ISO-3166_BY
+Belgium,BE,ISO-3166_BE
+Belize,BZ,ISO-3166_BZ
+Benin,BJ,ISO-3166_BJ
+Bermuda,BM,ISO-3166_BM
+Bhutan,BT,ISO-3166_BT
+"Bolivia, Plurinational State of",BO,ISO-3166_BO
+"Bonaire, Sint Eustatius and Saba",BQ,ISO-3166_BQ
+Bosnia and Herzegovina,BA,ISO-3166_BA
+Botswana,BW,ISO-3166_BW
+Bouvet Island,BV,ISO-3166_BV
+Brazil,BR,ISO-3166_BR
+British Indian Ocean Territory,IO,ISO-3166_IO
+Brunei Darussalam,BN,ISO-3166_BN
+Bulgaria,BG,ISO-3166_BG
+Burkina Faso,BF,ISO-3166_BF
+Burundi,BI,ISO-3166_BI
+Cambodia,KH,ISO-3166_KH
+Cameroon,CM,ISO-3166_CM
+Canada,CA,ISO-3166_CA
+Cape Verde,CV,ISO-3166_CV
+Cayman Islands,KY,ISO-3166_KY
+Central African Republic,CF,ISO-3166_CF
+Chad,TD,ISO-3166_TD
+Chile,CL,ISO-3166_CL
+China,CN,ISO-3166_CN
+Christmas Island,CX,ISO-3166_CX
+Cocos (Keeling) Islands,CC,ISO-3166_CC
+Colombia,CO,ISO-3166_CO
+Comoros,KM,ISO-3166_KM
+Congo,CG,ISO-3166_CG
+"Congo, the Democratic Republic of the",CD,ISO-3166_CD
+Cook Islands,CK,ISO-3166_CK
+Costa Rica,CR,ISO-3166_CR
+Côte d'Ivoire,CI,ISO-3166_CI
+Croatia,HR,ISO-3166_HR
+Cuba,CU,ISO-3166_CU
+Curaçao,CW,ISO-3166_CW
+Cyprus,CY,ISO-3166_CY
+Czech Republic,CZ,ISO-3166_CZ
+Denmark,DK,ISO-3166_DK
+Djibouti,DJ,ISO-3166_DJ
+Dominica,DM,ISO-3166_DM
+Dominican Republic,DO,ISO-3166_DO
+Ecuador,EC,ISO-3166_EC
+Egypt,EG,ISO-3166_EG
+El Salvador,SV,ISO-3166_SV
+Equatorial Guinea,GQ,ISO-3166_GQ
+Eritrea,ER,ISO-3166_ER
+Estonia,EE,ISO-3166_EE
+Ethiopia,ET,ISO-3166_ET
+Falkland Islands (Malvinas),FK,ISO-3166_FK
+Faroe Islands,FO,ISO-3166_FO
+Fiji,FJ,ISO-3166_FJ
+Finland,FI,ISO-3166_FI
+France,FR,ISO-3166_FR
+French Guiana,GF,ISO-3166_GF
+French Polynesia,PF,ISO-3166_PF
+French Southern Territories,TF,ISO-3166_TF
+Gabon,GA,ISO-3166_GA
+Gambia,GM,ISO-3166_GM
+Georgia,GE,ISO-3166_GE
+Germany,DE,ISO-3166_DE
+Ghana,GH,ISO-3166_GH
+Gibraltar,GI,ISO-3166_GI
+Greece,GR,ISO-3166_GR
+Greenland,GL,ISO-3166_GL
+Grenada,GD,ISO-3166_GD
+Guadeloupe,GP,ISO-3166_GP
+Guam,GU,ISO-3166_GU
+Guatemala,GT,ISO-3166_GT
+Guernsey,GG,ISO-3166_GG
+Guinea,GN,ISO-3166_GN
+Guinea-Bissau,GW,ISO-3166_GW
+Guyana,GY,ISO-3166_GY
+Haiti,HT,ISO-3166_HT
+Heard Island and McDonald Islands,HM,ISO-3166_HM
+Holy See (Vatican City State),VA,ISO-3166_VA
+Honduras,HN,ISO-3166_HN
+Hong Kong,HK,ISO-3166_HK
+Hungary,HU,ISO-3166_HU
+Iceland,IS,ISO-3166_IS
+India,IN,ISO-3166_IN
+Indonesia,ID,ISO-3166_ID
+"Iran, Islamic Republic of",IR,ISO-3166_IR
+Iraq,IQ,ISO-3166_IQ
+Ireland,IE,ISO-3166_IE
+Isle of Man,IM,ISO-3166_IM
+Israel,IL,ISO-3166_IL
+Italy,IT,ISO-3166_IT
+Jamaica,JM,ISO-3166_JM
+Japan,JP,ISO-3166_JP
+Jersey,JE,ISO-3166_JE
+Jordan,JO,ISO-3166_JO
+Kazakhstan,KZ,ISO-3166_KZ
+Kenya,KE,ISO-3166_KE
+Kiribati,KI,ISO-3166_KI
+"Korea, Democratic People's Republic of",KP,ISO-3166_KP
+"Korea, Republic of",KR,ISO-3166_KR
+Kuwait,KW,ISO-3166_KW
+Kyrgyzstan,KG,ISO-3166_KG
+Lao People's Democratic Republic,LA,ISO-3166_LA
+Latvia,LV,ISO-3166_LV
+Lebanon,LB,ISO-3166_LB
+Lesotho,LS,ISO-3166_LS
+Liberia,LR,ISO-3166_LR
+Libya,LY,ISO-3166_LY
+Liechtenstein,LI,ISO-3166_LI
+Lithuania,LT,ISO-3166_LT
+Luxembourg,LU,ISO-3166_LU
+Macao,MO,ISO-3166_MO
+"Macedonia, the Former Yugoslav Republic of",MK,ISO-3166_MK
+Madagascar,MG,ISO-3166_MG
+Malawi,MW,ISO-3166_MW
+Malaysia,MY,ISO-3166_MY
+Maldives,MV,ISO-3166_MV
+Mali,ML,ISO-3166_ML
+Malta,MT,ISO-3166_MT
+Marshall Islands,MH,ISO-3166_MH
+Martinique,MQ,ISO-3166_MQ
+Mauritania,MR,ISO-3166_MR
+Mauritius,MU,ISO-3166_MU
+Mayotte,YT,ISO-3166_YT
+Mexico,MX,ISO-3166_MX
+"Micronesia, Federated States of",FM,ISO-3166_FM
+"Moldova, Republic of",MD,ISO-3166_MD
+Monaco,MC,ISO-3166_MC
+Mongolia,MN,ISO-3166_MN
+Montenegro,ME,ISO-3166_ME
+Montserrat,MS,ISO-3166_MS
+Morocco,MA,ISO-3166_MA
+Mozambique,MZ,ISO-3166_MZ
+Myanmar,MM,ISO-3166_MM
+Namibia,NA,ISO-3166_NA
+Nauru,NR,ISO-3166_NR
+Nepal,NP,ISO-3166_NP
+Netherlands,NL,ISO-3166_NL
+New Caledonia,NC,ISO-3166_NC
+New Zealand,NZ,ISO-3166_NZ
+Nicaragua,NI,ISO-3166_NI
+Niger,NE,ISO-3166_NE
+Nigeria,NG,ISO-3166_NG
+Niue,NU,ISO-3166_NU
+Norfolk Island,NF,ISO-3166_NF
+Northern Mariana Islands,MP,ISO-3166_MP
+Norway,NO,ISO-3166_NO
+Oman,OM,ISO-3166_OM
+Pakistan,PK,ISO-3166_PK
+Palau,PW,ISO-3166_PW
+"Palestine, State of",PS,ISO-3166_PS
+Panama,PA,ISO-3166_PA
+Papua New Guinea,PG,ISO-3166_PG
+Paraguay,PY,ISO-3166_PY
+Peru,PE,ISO-3166_PE
+Philippines,PH,ISO-3166_PH
+Pitcairn,PN,ISO-3166_PN
+Poland,PL,ISO-3166_PL
+Portugal,PT,ISO-3166_PT
+Puerto Rico,PR,ISO-3166_PR
+Qatar,QA,ISO-3166_QA
+Réunion,RE,ISO-3166_RE
+Romania,RO,ISO-3166_RO
+Russian Federation,RU,ISO-3166_RU
+Rwanda,RW,ISO-3166_RW
+Saint Barthélemy,BL,ISO-3166_BL
+"Saint Helena, Ascension and Tristan da Cunha",SH,ISO-3166_SH
+Saint Kitts and Nevis,KN,ISO-3166_KN
+Saint Lucia,LC,ISO-3166_LC
+Saint Martin (French part),MF,ISO-3166_MF
+Saint Pierre and Miquelon,PM,ISO-3166_PM
+Saint Vincent and the Grenadines,VC,ISO-3166_VC
+Samoa,WS,ISO-3166_WS
+San Marino,SM,ISO-3166_SM
+Sao Tome and Principe,ST,ISO-3166_ST
+Saudi Arabia,SA,ISO-3166_SA
+Senegal,SN,ISO-3166_SN
+Serbia,RS,ISO-3166_RS
+Seychelles,SC,ISO-3166_SC
+Sierra Leone,SL,ISO-3166_SL
+Singapore,SG,ISO-3166_SG
+Sint Maarten (Dutch part),SX,ISO-3166_SX
+Slovakia,SK,ISO-3166_SK
+Slovenia,SI,ISO-3166_SI
+Solomon Islands,SB,ISO-3166_SB
+Somalia,SO,ISO-3166_SO
+South Africa,ZA,ISO-3166_ZA
+South Georgia and the South Sandwich Islands,GS,ISO-3166_GS
+South Sudan,SS,ISO-3166_SS
+Spain,ES,ISO-3166_ES
+Sri Lanka,LK,ISO-3166_LK
+Sudan,SD,ISO-3166_SD
+Suriname,SR,ISO-3166_SR
+Svalbard and Jan Mayen,SJ,ISO-3166_SJ
+Swaziland,SZ,ISO-3166_SZ
+Sweden,SE,ISO-3166_SE
+Switzerland,CH,ISO-3166_CH
+Syrian Arab Republic,SY,ISO-3166_SY
+"Taiwan, Province of China",TW,ISO-3166_TW
+Tajikistan,TJ,ISO-3166_TJ
+"Tanzania, United Republic of",TZ,ISO-3166_TZ
+Thailand,TH,ISO-3166_TH
+Timor-Leste,TL,ISO-3166_TL
+Togo,TG,ISO-3166_TG
+Tokelau,TK,ISO-3166_TK
+Tonga,TO,ISO-3166_TO
+Trinidad and Tobago,TT,ISO-3166_TT
+Tunisia,TN,ISO-3166_TN
+Turkey,TR,ISO-3166_TR
+Turkmenistan,TM,ISO-3166_TM
+Turks and Caicos Islands,TC,ISO-3166_TC
+Tuvalu,TV,ISO-3166_TV
+Uganda,UG,ISO-3166_UG
+Ukraine,UA,ISO-3166_UA
+United Arab Emirates,AE,ISO-3166_AE
+United Kingdom,GB,ISO-3166_GB
+United States,US,ISO-3166_US
+United States Minor Outlying Islands,UM,ISO-3166_UM
+Uruguay,UY,ISO-3166_UY
+Uzbekistan,UZ,ISO-3166_UZ
+Vanuatu,VU,ISO-3166_VU
+"Venezuela, Bolivarian Republic of",VE,ISO-3166_VE
+Viet Nam,VN,ISO-3166_VN
+"Virgin Islands, British",VG,ISO-3166_VG
+"Virgin Islands, U.S.",VI,ISO-3166_VI
+Wallis and Futuna,WF,ISO-3166_WF
+Western Sahara,EH,ISO-3166_EH
+Yemen,YE,ISO-3166_YE
+Zambia,ZM,ISO-3166_ZM
+Zimbabwe,ZW,ISO-3166_ZW
diff --git a/src/tagpack/db/tagstore_schema.sql b/src/tagpack/db/tagstore_schema.sql
index 2da8000..0831e91 100644
--- a/src/tagpack/db/tagstore_schema.sql
+++ b/src/tagpack/db/tagstore_schema.sql
@@ -80,6 +80,39 @@ CREATE INDEX tag_label_index ON tag (label);
CREATE INDEX tag_address_index ON tag (address);
CREATE INDEX tag_is_cluster_definer_index ON tag (is_cluster_definer);
+-- Actor and ActorPack tables
+
+CREATE TABLE actorpack (
+ id VARCHAR PRIMARY KEY,
+ title VARCHAR NOT NULL,
+ creator VARCHAR NOT NULL,
+ description VARCHAR NOT NULL,
+ is_public BOOLEAN DEFAULT FALSE,
+ uri VARCHAR ,
+ lastmod TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP
+);
+
+CREATE TABLE actor (
+ id VARCHAR PRIMARY KEY,
+ uri VARCHAR ,
+ label VARCHAR NOT NULL,
+ lastmod TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
+ actorpack VARCHAR REFERENCES actorpack(id) ON DELETE CASCADE,
+ CONSTRAINT unique_actor UNIQUE (id)
+);
+
+CREATE TABLE actor_categories (
+ id SERIAL PRIMARY KEY,
+ actor_id VARCHAR REFERENCES actor(id) ON DELETE CASCADE,
+ category_id VARCHAR REFERENCES concept(id) ON DELETE CASCADE
+);
+
+CREATE TABLE actor_jurisdictions (
+ id SERIAL PRIMARY KEY,
+ actor_id VARCHAR REFERENCES actor(id) ON DELETE CASCADE,
+ country_id VARCHAR REFERENCES concept(id) ON DELETE CASCADE
+);
+
-- GraphSense mapping table
CREATE TABLE address_cluster_mapping (
diff --git a/src/tagpack/tagpack.py b/src/tagpack/tagpack.py
index 61fa1e3..2462541 100644
--- a/src/tagpack/tagpack.py
+++ b/src/tagpack/tagpack.py
@@ -12,7 +12,7 @@
from git import Repo
from yamlinclude import YamlIncludeConstructor
-from tagpack import TagPackFileError, ValidationError
+from tagpack import TagPackFileError, UniqueKeyLoader, ValidationError
from tagpack.cmd_utils import print_info, print_warn
@@ -135,18 +135,6 @@ def collect_tagpack_files(path):
return tagpack_files
-# https://gist.github.com/pypt/94d747fe5180851196eb
-class UniqueKeyLoader(yaml.FullLoader):
- def construct_mapping(self, node, deep=False):
- mapping = set()
- for key_node, value_node in node.value:
- key = self.construct_object(key_node, deep=deep)
- if key in mapping:
- raise ValidationError(f"Duplicate {key!r} key found in YAML.")
- mapping.add(key)
- return super().construct_mapping(node, deep)
-
-
class TagPack(object):
"""Represents a TagPack"""
diff --git a/src/tagpack/tagstore.py b/src/tagpack/tagstore.py
index ad24175..63f4b78 100644
--- a/src/tagpack/tagstore.py
+++ b/src/tagpack/tagstore.py
@@ -20,6 +20,7 @@ def __init__(self, url, schema):
self.cursor.execute("SELECT unnest(enum_range(NULL::currency))")
self.supported_currencies = [i[0] for i in self.cursor.fetchall()]
self.existing_packs = None
+ self.existing_actorpacks = None
def insert_taxonomy(self, taxonomy):
if taxonomy.key == "confidence":
@@ -110,6 +111,115 @@ def insert_tagpack(
self.conn.commit()
+ def actorpack_exists(self, prefix, actorpack_name):
+ if not self.existing_actorpacks:
+ self.existing_actorpacks = self.get_ingested_actorpacks()
+ actorpack_id = self.create_actorpack_id(prefix, actorpack_name)
+ return actorpack_id in self.existing_actorpacks
+
+ def create_actorpack_id(self, prefix, actorpack_name):
+ return ":".join([prefix, actorpack_name]) if prefix else actorpack_name
+
+ def get_ingested_actorpacks(self) -> list:
+ self.cursor.execute("SELECT id from actorpack")
+ return [i[0] for i in self.cursor.fetchall()]
+
+ def insert_actorpack(
+ self, actorpack, is_public, force_insert, prefix, rel_path, batch=1000
+ ):
+ actorpack_id = self.create_actorpack_id(prefix, rel_path)
+ h = _get_actor_header(actorpack, actorpack_id)
+
+ if force_insert:
+ print(f"Evicting and re-inserting actorpack {actorpack_id}")
+ q = "DELETE FROM actorpack WHERE id = (%s)"
+ self.cursor.execute(q, (actorpack_id,))
+
+ q = "INSERT INTO actorpack \
+ (id, title, creator, description, is_public, uri) \
+ VALUES (%s,%s,%s,%s,%s,%s)"
+ v = (
+ h.get("id"),
+ h.get("title"),
+ h.get("creator"),
+ h.get("description"),
+ is_public,
+ actorpack.uri,
+ )
+ self.cursor.execute(q, v)
+ self.conn.commit()
+
+ actor_sql = "INSERT INTO actor (id, label, uri, lastmod, actorpack) \
+ VALUES (%s, %s, %s, %s, %s)"
+ act_cat_sql = "INSERT INTO actor_categories (actor_id, category_id) \
+ VALUES (%s, %s)"
+ act_jur_sql = "INSERT INTO actor_jurisdictions (actor_id, country_id) \
+ VALUES (%s, %s)"
+
+ actor_data = []
+ cat_data = []
+ jur_data = []
+ for actor in actorpack.get_unique_actors():
+ actor_data.append(_get_actor(actor, actorpack_id))
+ cat_data.extend(_get_actor_categories(actor))
+ jur_data.extend(_get_actor_jurisdictions(actor))
+ if len(actor_data) > batch:
+ execute_batch(self.cursor, actor_sql, actor_data)
+ execute_batch(self.cursor, act_cat_sql, cat_data)
+ execute_batch(self.cursor, act_jur_sql, jur_data)
+
+ actor_data = []
+ cat_data = []
+ jur_data = []
+
+ # insert remaining items
+ execute_batch(self.cursor, actor_sql, actor_data)
+ execute_batch(self.cursor, act_cat_sql, cat_data)
+ execute_batch(self.cursor, act_jur_sql, jur_data)
+
+ self.conn.commit()
+
+ def low_quality_address_labels(self, th=0.25, currency="") -> dict:
+ """
+        This function returns a list of addresses having a quality measure
+ equal or lower than a threshold value, along with the corresponding
+ tags for each address.
+ """
+ currency = currency.upper()
+ if currency not in ["", "BCH", "BTC", "ETH", "LTC", "ZEC"]:
+ raise ValidationError(f"Currency not supported: {currency}")
+
+ if not currency:
+ currency = "%"
+
+ msg = "Threshold must be a float number between 0 and 1"
+ try:
+ th = float(th)
+ if th < 0 or th > 1:
+ raise ValidationError(msg)
+ except ValueError:
+ raise ValidationError(msg)
+
+ q = "SELECT j.currency, j.address, array_agg(j.label) labels \
+ FROM ( \
+ SELECT q.currency, q.address, t.label \
+ FROM address_quality q, tag t \
+ WHERE q.currency::text LIKE %s \
+ AND q.address=t.address \
+ AND q.quality <= %s \
+ ) as j \
+ GROUP BY j.currency, j.address"
+
+ self.cursor.execute(
+ q,
+ (
+ currency,
+ th,
+ ),
+ )
+
+ return {(row[0], row[1]): row[2] for row in self.cursor.fetchall()}
+
def remove_duplicates(self):
self.cursor.execute(
"""
@@ -220,7 +330,7 @@ def get_ingested_tagpacks(self) -> list:
self.cursor.execute("SELECT id from tagpack")
return [i[0] for i in self.cursor.fetchall()]
- def get_quality_measures(self, currency="") -> float:
+ def get_quality_measures(self, currency="") -> dict:
"""
This function returns a dict with the quality measures (count, avg, and
stddev) for a specific currency, or for all if currency is not
@@ -228,7 +338,7 @@ def get_quality_measures(self, currency="") -> float:
"""
currency = currency.upper()
if currency not in ["", "BCH", "BTC", "ETH", "LTC", "ZEC"]:
- raise ValidationError("Currency not supported: {currency}")
+ raise ValidationError(f"Currency not supported: {currency}")
query = "SELECT COUNT(quality), AVG(quality), STDDEV(quality)"
query += " FROM address_quality"
@@ -241,7 +351,7 @@ def get_quality_measures(self, currency="") -> float:
keys = ["count", "avg", "stddev"]
return {keys[i]: v for row in self.cursor.fetchall() for i, v in enumerate(row)}
- def calculate_quality_measures(self) -> float:
+ def calculate_quality_measures(self) -> dict:
self.cursor.execute("CALL calculate_quality()")
self.cursor.execute("CALL insert_address_quality()")
self.conn.commit()
@@ -296,3 +406,39 @@ def _get_header(tagpack, tid):
"creator": tc["creator"],
"description": tc.get("description", "not provided"),
}
+
+
+def _get_actor_header(actorpack, id):
+ ac = actorpack.contents
+ return {
+ "id": id,
+ "title": ac["title"],
+ "creator": ac["creator"],
+ "description": ac.get("description", "not provided"),
+ }
+
+
+def _get_actor(actor, actorpack_id):
+ return (
+ actor.all_fields.get("id"),
+ actor.all_fields.get("label").strip(),
+ actor.all_fields.get("uri", None).strip(),
+ actor.all_fields.get("lastmod", datetime.now().isoformat()),
+ actorpack_id,
+ )
+
+
+def _get_actor_categories(actor):
+ data = []
+ actor_id = actor.all_fields.get("id")
+ for category in actor.all_fields.get("categories"):
+ data.append((actor_id, category))
+ return data
+
+
+def _get_actor_jurisdictions(actor):
+ data = []
+ actor_id = actor.all_fields.get("id")
+ for country in actor.all_fields.get("jurisdictions"):
+ data.append((actor_id, country))
+ return data
diff --git a/src/tagpack/taxonomy.py b/src/tagpack/taxonomy.py
index 5dfe98b..213e9f3 100644
--- a/src/tagpack/taxonomy.py
+++ b/src/tagpack/taxonomy.py
@@ -79,11 +79,14 @@ def load_from_remote(self):
def load_from_local(self):
with open(self.uri, "r") as f:
csv_reader = csv.DictReader(f, delimiter=",")
+ uri = self.uri
for row in csv_reader:
+ ident = row["id"]
+ label = row["label"] if "label" in row else None
level = row["level"] if "level" in row else None
- concept = Concept(
- self, row["id"], self.uri, row["label"], level, row["description"]
- )
+ desc = row["description"] if "description" in row else ""
+
+ concept = Concept(self, ident, uri, label, level, desc)
self.concepts.append(concept)
@property
diff --git a/tests/test_actorpack_schema.py b/tests/test_actorpack_schema.py
new file mode 100644
index 0000000..af3d5d8
--- /dev/null
+++ b/tests/test_actorpack_schema.py
@@ -0,0 +1,140 @@
+from datetime import date
+
+import pytest
+
+from tagpack.actorpack_schema import ActorPackSchema
+from tagpack.tagpack_schema import ValidationError
+from tagpack.taxonomy import Taxonomy
+
+field_types = {
+ "title": "text",
+ "creator": "text",
+ "description": "text",
+ "is_public": "boolean",
+ "actors": "list",
+ "id": "text",
+ "uri": "text",
+ "label": "text",
+ "lastmod": "datetime",
+ "categories": "list",
+ "jurisdictions": "list",
+}
+
+field_values = {
+ "title": "some text string",
+ "creator": "some text string",
+ "description": "some text string",
+ "is_public": True,
+ "actors": [1, 2, 3],
+ "id": "some text string",
+ "uri": "some text string",
+ "label": "some text string",
+ "lastmod": date.fromisoformat("2022-01-01"),
+ "categories": [1, 2, 3],
+ "jurisdictions": [1, 2, 3],
+}
+
+
+@pytest.fixture
+def schema(monkeypatch):
+ actorpack_schema = ActorPackSchema()
+
+ return actorpack_schema
+
+
+@pytest.fixture
+def taxonomies():
+ tax_entity = Taxonomy("entity", "http://example.com/entity")
+ tax_entity.add_concept("exchange", "Exchange", None, "Some description")
+
+ tax_country = Taxonomy("country", "http://example.com/country")
+ tax_country.add_concept("MX", "Mexico", None, None)
+
+ taxonomies = {"entity": tax_entity, "country": tax_country}
+ return taxonomies
+
+
+def test_init(schema):
+ assert isinstance(schema, ActorPackSchema)
+ assert schema.definition == "actorpack_schema.yaml"
+
+
+def test_header_fields(schema):
+ assert isinstance(schema.header_fields, dict)
+ fields = {"title", "creator", "description", "is_public", "actors"}
+ assert fields - set(schema.header_fields) == set()
+ for field in fields:
+ assert field in schema.header_fields
+ assert "type" in schema.header_fields[field]
+ assert "mandatory" in schema.header_fields[field]
+
+
+def test_mandatory_header_fields(schema):
+ assert isinstance(schema.mandatory_header_fields, dict)
+ fields = ["title", "creator", "actors"]
+ for field in fields:
+ assert field in schema.mandatory_header_fields
+ assert schema.header_fields[field]["mandatory"] is True
+
+
+def test_actor_fields(schema):
+ assert isinstance(schema.actor_fields, dict)
+ fields = {"id", "uri", "label", "lastmod", "categories", "jurisdictions"}
+ assert fields - set(schema.actor_fields) == set()
+ for field in fields:
+ assert field in schema.actor_fields
+ assert "type" in schema.actor_fields[field]
+ assert "mandatory" in schema.actor_fields[field]
+
+
+def test_mandatory_actor_fields(schema):
+ assert isinstance(schema.mandatory_actor_fields, dict)
+ fields = ["id", "uri", "label", "lastmod", "categories"]
+ for field in fields:
+ assert field in schema.mandatory_actor_fields
+ assert schema.actor_fields[field]["mandatory"] is True
+
+
+def test_field_type(schema):
+ for field, ftype in field_types.items():
+ assert schema.field_type(field) == ftype
+
+
+def test_field_taxonomy(schema):
+ assert schema.field_taxonomy("categories") == "entity"
+ assert schema.field_taxonomy("jurisdictions") == "country"
+
+
+def test_field_no_taxonomy(schema):
+ assert schema.field_taxonomy("title") is None
+
+
+def test_check_type(schema):
+ for field, value in field_values.items():
+ assert schema.check_type(field, value)
+ with (pytest.raises(ValidationError)) as e:
+ assert schema.check_type(field, 5)
+ msg = f"Field {field} must be of type {field_types[field]}"
+ assert msg in str(e.value)
+
+
+def test_check_taxonomies(schema, taxonomies):
+ schema.schema["actor"]["test"] = {"taxonomy": "nonexistent"}
+ with (pytest.raises(ValidationError)) as e:
+ assert schema.check_taxonomies("test", "invalid", None)
+ assert "No taxonomies loaded" in str(e.value)
+
+ schema.schema["actor"]["invalidtax"] = {"taxonomy": "nonexistent"}
+ with (pytest.raises(ValidationError)) as e:
+ assert schema.check_taxonomies("invalidtax", "value", taxonomies)
+ assert "Unknown taxonomy nonexistent" in str(e.value)
+
+ assert schema.check_taxonomies("categories", "exchange", taxonomies)
+ with (pytest.raises(ValidationError)) as e:
+ assert schema.check_taxonomies("categories", "test", taxonomies)
+ assert "Undefined concept test for categories field" in str(e.value)
+
+ assert schema.check_taxonomies("jurisdictions", "MX", taxonomies)
+ with (pytest.raises(ValidationError)) as e:
+ assert schema.check_taxonomies("jurisdictions", "test", taxonomies)
+ assert "Undefined concept test for jurisdictions field" in str(e.value)
diff --git a/tests/testfiles/actors/ex_actorpack.yaml b/tests/testfiles/actors/ex_actorpack.yaml
new file mode 100644
index 0000000..58db21d
--- /dev/null
+++ b/tests/testfiles/actors/ex_actorpack.yaml
@@ -0,0 +1,15 @@
+title: Test ActorPack
+creator: GraphSense Core Team
+description: A collection of actors commonly used for demonstrating GraphSense features
+lastmod: 2022-11-29
+actors:
+- id: internet_archive
+ uri: https://archive.org
+ label: Internet Archive
+ jurisdictions: [US]
+ categories: [organization]
+- id: binance
+ uri: https://binance.com
+ label: Binance
+ jurisdictions: [US, AT]
+ categories: [exchange]