From 82a07079ac9b029e073a6d37e2574b5479098517 Mon Sep 17 00:00:00 2001 From: Gibran Gomez Date: Tue, 1 Nov 2022 22:25:42 +0100 Subject: [PATCH 1/7] Re-organization of commands --- bin/tagpack-tool | 273 ++++++++++++++++++++++++++--------------------- 1 file changed, 150 insertions(+), 123 deletions(-) diff --git a/bin/tagpack-tool b/bin/tagpack-tool index 3091f65..bec5d2e 100755 --- a/bin/tagpack-tool +++ b/bin/tagpack-tool @@ -462,183 +462,210 @@ def main(): subparsers = parser.add_subparsers(title='Commands') + + # parser for config command + parser_c = subparsers.add_parser("config", + help="show TagPack Repository config") + parser_c.add_argument('-v', '--verbose', action='store_true', + help='verbose configuration') + parser_c.set_defaults(func=show_config) + + + # parsers for tagpack command + parser_tp = subparsers.add_parser("tagpack", + help="tagpack commands") + + ptp = parser_tp.add_subparsers(title="TagPack commands") + + # parser for validate command + ptp_v = ptp.add_parser("validate", help="validate TagPacks") + ptp_v.add_argument('path', nargs='?', metavar='PATH', default=os.getcwd(), + help='TagPacks file or folder root path (current folder by \ + default)') + ptp_v.add_argument("--no_address_validation", action='store_true', + help='Disables checksum validation of addresses') + ptp_v.set_defaults(func=validate_tagpack) + + # parser for insert command + ptp_i = ptp.add_parser("insert", help="insert TagPacks") + ptp_i.add_argument('path', nargs='?', metavar='PATH', default=os.getcwd(), + help='TagPacks file or folder root path') + ptp_i.add_argument('--schema', + default=_DEFAULT_SCHEMA, metavar='DB_SCHEMA', + help="PostgreSQL schema for tagpack tables") + ptp_i.add_argument('-u', '--url', + help="postgresql://user:password@db_host:port/database") + ptp_i.add_argument('-b', '--batch_size', nargs='?', type=int, default=1000, + help='batch size for insert)') + ptp_i.add_argument("--public", action='store_true', + help='By default, tagpacks are 
declared private in the database. ' + 'Use this switch to declare them public.') + ptp_i.add_argument("--force", action='store_true', + help='By default, tagpack insertion stops when an already ingested\ + tagpack exists in the database. Use this switch to force \ + re-insertion.') + ptp_i.add_argument("--add_new", action='store_true', + help='By default, tagpack insertion stops when an already ingested\ + tagpack exists in the database. Use this switch to ingest \ + new tagpacks while skipping over existing ones.') + ptp_i.add_argument("--no_strict_check", action='store_true', + help='Disables check for local modifications in git repository') + ptp_i.add_argument("--no_git", action='store_true', + help='Disables check for local git repository') + ptp_i.set_defaults(func=insert_tagpack, url=def_url) + + # parser for taxonomy command parser_t = subparsers.add_parser("taxonomy", - help="show taxonomy concepts") + help="taxonomy commands") parser_t.set_defaults(func=list_taxonomies) - parser_t_subparsers = parser_t.add_subparsers(title='Taxonomy commands') + pxp = parser_t.add_subparsers(title='Taxonomy commands') - # parser for taxonomy insert command - parser_t_i = parser_t_subparsers.add_parser( - 'insert', help='insert taxonomy into GraphSense') - parser_t_i.add_argument('taxonomy', metavar='TAXONOMY_KEY', nargs='?', - choices=['abuse', 'entity'], - help='the selected taxonomy', default=None) - parser_t_i.add_argument('--schema', - default=_DEFAULT_SCHEMA, metavar='DB_SCHEMA', - help="PostgreSQL schema for taxonomy tables") - parser_t_i.add_argument('-u', '--url', help="postgresql://user:password@db_host:port/database") - parser_t_i.set_defaults(func=insert_taxonomy, url=def_url) + # parser for taxonomy list command + pxp_l = pxp.add_parser('list', + help='list taxonomy concepts') + pxp_l.set_defaults(func=list_taxonomies) # parser for taxonomy show command - parser_t_s = parser_t_subparsers.add_parser( - 'show', help='show taxonomy concepts') - 
parser_t_s.add_argument('taxonomy', metavar='TAXONOMY_KEY', - choices=['abuse', 'entity'], - help='the selected taxonomy') - parser_t_s.add_argument('-v', '--verbose', action='store_true', - help="verbose concepts") - parser_t_s.set_defaults(func=show_taxonomy_concepts) + pxp_s = pxp.add_parser('show', + help='show taxonomy concepts') + pxp_s.add_argument('taxonomy', metavar='TAXONOMY_KEY', + choices=['abuse', 'entity'], + help='the selected taxonomy') + pxp_s.add_argument('-v', '--verbose', action='store_true', + help="verbose concepts") + pxp_s.set_defaults(func=show_taxonomy_concepts) + + # parser for taxonomy insert command + pxp_i = pxp.add_parser('insert', + help='insert taxonomy into GraphSense') + pxp_i.add_argument('taxonomy', metavar='TAXONOMY_KEY', nargs='?', + choices=['abuse', 'entity'], default=None, + help='the selected taxonomy') + pxp_i.add_argument('--schema', + default=_DEFAULT_SCHEMA, metavar='DB_SCHEMA', + help="PostgreSQL schema for taxonomy tables") + pxp_i.add_argument('-u', '--url', + help="postgresql://user:password@db_host:port/database") + pxp_i.set_defaults(func=insert_taxonomy, url=def_url) + # parser for confidence command parser_s = subparsers.add_parser("confidence", help="show confidence scores") parser_s.set_defaults(func=list_confidence_scores) - parser_s_subparsers = parser_s.add_subparsers(title='Confidence commands') + pcp = parser_s.add_subparsers(title='Confidence commands') # parser for confidence ingest command - parser_s_i = parser_s_subparsers.add_parser('ingest', + pcp_i = pcp.add_parser('ingest', help='ingest confidence scores into GraphSense') - parser_s_i.add_argument("--force", action='store_true', + pcp_i.add_argument("--force", action='store_true', help='Force re-insertion of confidence scores.') - parser_s_i.add_argument('--schema', + pcp_i.add_argument('--schema', default=_DEFAULT_SCHEMA, metavar='DB_SCHEMA', help="PostgreSQL schema for confidence tables") - parser_s_i.add_argument('-u', '--url', + 
pcp_i.add_argument('-u', '--url', help="postgresql://user:password@db_host:port/database") - parser_s_i.set_defaults(func=ingest_confidence_scores, url=def_url) + pcp_i.set_defaults(func=ingest_confidence_scores, url=def_url) # parser for confidence show command - parser_s_s = parser_s_subparsers.add_parser('show', + pcp_s = pcp.add_parser('show', help='show confidence scores') - parser_s_s.add_argument('-v', '--verbose', action='store_true', + pcp_s.add_argument('-v', '--verbose', action='store_true', help="verbose concepts") - parser_s_s.set_defaults(func=show_confidence_scores) + pcp_s.set_defaults(func=show_confidence_scores) + + + # parsers for database housekeeping + parser_db = subparsers.add_parser("tagstore", + help="database housekeeping commands") + + pdp = parser_db.add_subparsers(title="TagStore commands") + + # insert_cluster_mappings [update] + pc = pdp.add_parser("insert_cluster_mappings", + help="insert cluster mappings") + pc.add_argument('-d', '--db_nodes', nargs='+', + default=['localhost'], metavar='DB_NODE', + help='Cassandra node(s); default "localhost")') + pc.add_argument('-f', '--ks_file', + metavar='KEYSPACE_FILE', + help="JSON file with Cassandra keyspaces that contain GraphSense \ + cluster mappings") + pc.add_argument('--schema', + default=_DEFAULT_SCHEMA, metavar='DB_SCHEMA', + help="PostgreSQL schema for GraphSense cluster mapping table") + pc.add_argument('-u', '--url', + help="postgresql://user:password@db_host:port/database") + pc.add_argument('--update', action='store_true', + help='update all cluster mappings') +# pc.set_defaults(update=False) + pc.set_defaults(func=insert_cluster_mapping, url=def_url) + + # refresh_views + pd = pdp.add_parser("refresh_views", help='update views') + pd.add_argument('--schema', default=_DEFAULT_SCHEMA, metavar='DB_SCHEMA', + help="PostgreSQL schema for GraphSense cluster mapping table") + pd.add_argument('-u', '--url', + help="postgresql://user:password@db_host:port/database") + 
pd.set_defaults(func=update_db, url=def_url) + + # remove_duplicates + pr = pdp.add_parser("remove_duplicates", help='remove duplicate tags') + pr.add_argument('--schema', default=_DEFAULT_SCHEMA, metavar='DB_SCHEMA', + help="PostgreSQL schema for GraphSense cluster mapping table") + pr.add_argument('-u', '--url', + help="postgresql://user:password@db_host:port/database") + pr.set_defaults(func=remove_duplicates, url=def_url) + # parser for quality measures parser_q = subparsers.add_parser("quality", help="calculate tags quality measures") - parser_q.set_defaults(func=show_quality_measures) + parser_q.set_defaults(func=show_quality_measures, url=def_url, + schema=_DEFAULT_SCHEMA, currency='') - parser_q_subparsers = parser_q.add_subparsers(title='Quality commands') + pqp = parser_q.add_subparsers(title='Quality commands') # parser for quality measures calculation - parser_q_i = parser_q_subparsers.add_parser('calculate', + pqp_i = pqp.add_parser('calculate', help='calculate quality measures for all tags in the DB') - parser_q_i.add_argument('--schema', + pqp_i.add_argument('--schema', default=_DEFAULT_SCHEMA, metavar='DB_SCHEMA', help="PostgreSQL schema for quality measures tables") - parser_q_i.add_argument('-u', '--url', + pqp_i.add_argument('-u', '--url', help="postgresql://user:password@db_host:port/database") - parser_q_i.set_defaults(func=calc_quality_measures, url=def_url) + pqp_i.set_defaults(func=calc_quality_measures, url=def_url) # parser for quality measures show - parser_q_s = parser_q_subparsers.add_parser('show', + pqp_s = pqp.add_parser('show', help='show average quality measures') - parser_q_s.add_argument('--currency', + pqp_s.add_argument('--currency', default='', choices=['BCH', 'BTC', 'ETH', 'LTC', 'ZEC'], help="Show the avg quality measure for a specific crypto-currency") - parser_q_s.add_argument('--schema', + pqp_s.add_argument('--schema', default=_DEFAULT_SCHEMA, metavar='DB_SCHEMA', help="PostgreSQL schema for quality measures tables") - 
parser_q_s.add_argument('-u', '--url', + pqp_s.add_argument('-u', '--url', help="postgresql://user:password@db_host:port/database") - parser_q_s.set_defaults(func=show_quality_measures, url=def_url) - - # parser for config command - parser_c = subparsers.add_parser("config", - help="show TagPack Repository config") - parser_c.add_argument('-v', '--verbose', action='store_true', - help='verbose configuration') - parser_c.set_defaults(func=show_config) + pqp_s.set_defaults(func=show_quality_measures, url=def_url) - # parser for insert command - parser_i = subparsers.add_parser("insert", - help="insert TagPacks into GraphSense") - parser_i.add_argument('path', nargs='?', metavar='PATH', - default=os.getcwd(), - help='TagPacks file or folder root path') - parser_i.add_argument('--schema', - default=_DEFAULT_SCHEMA, metavar='DB_SCHEMA', - help="PostgreSQL schema for tagpack tables") - parser_i.add_argument('-u', '--url', help="postgresql://user:password@db_host:port/database") - parser_i.add_argument('-b', '--batch_size', nargs='?', type=int, - default=1000, - help='batch size for insert)') - parser_i.add_argument("--public", action='store_true', - help='By default, tagpacks are declared private in the database. ' - 'Use this switch to declare them public.') - parser_i.add_argument("--force", action='store_true', - help='By default, tagpack insertion stops when an already ingested tagpack' - 'exists in the database. Use this switch to force re-insertion.') - parser_i.add_argument("--add_new", action='store_true', - help='By default, tagpack insertion stops when an already ingested tagpack' - 'exists in the database. 
Use this switch to ingest new tagpacks ' - 'while skipping over existing ones.') - parser_i.add_argument("--no_strict_check", action='store_true', - help='Disables check for local modifications in git repository') - parser_i.add_argument("--no_git", action='store_true', - help='Disables check for local git repository') - parser_i.set_defaults(func=insert_tagpack, url=def_url) - - # parser for validate command - parser_v = subparsers.add_parser("validate", help="validate TagPacks") - parser_v.add_argument('path', nargs='?', metavar='PATH', - default=os.getcwd(), - help='TagPacks file or folder root path (current folder\ - by default)') - parser_v.add_argument("--no_address_validation", action='store_true', - help='Disables checksum validation of addresses') - parser_v.set_defaults(func=validate_tagpack) - - # parser for cluster command - parser_cl = subparsers.add_parser("cluster", - help="insert cluster mappings") - parser_cl.add_argument('-d', '--db_nodes', nargs='+', - default=['localhost'], metavar='DB_NODE', - help='Cassandra node(s); default "localhost")') - parser_cl.add_argument('-f', '--ks_file', - metavar='KEYSPACE_FILE', - help="JSON file with Cassandra keyspaces that contain GraphSense cluster mappings") - parser_cl.add_argument('--schema', - default=_DEFAULT_SCHEMA, metavar='DB_SCHEMA', - help="PostgreSQL schema for GraphSense cluster mapping table") - parser_cl.add_argument('-u', '--url', help="postgresql://user:password@db_host:port/database") - parser_cl.add_argument('--update', help='update all cluster mappings', action='store_true') - parser_cl.set_defaults(update=False) - parser_cl.set_defaults(func=insert_cluster_mapping, url=def_url) - - # parsers for database housekeeping - parser_db = subparsers.add_parser("db", - help="database housekeeping commands") - - pdp = parser_db.add_subparsers(title="DB commands") - - pd = pdp.add_parser("refresh_views", help='update views') - pd.add_argument('--schema', - default=_DEFAULT_SCHEMA, 
metavar='DB_SCHEMA', - help="PostgreSQL schema for GraphSense cluster mapping table") - pd.add_argument('-u', '--url', help="postgresql://user:password@db_host:port/database") - pd.set_defaults(func=update_db, url=def_url) - - pr = pdp.add_parser("remove_duplicates", help='remove duplicate tags') - pr.add_argument('--schema', - default=_DEFAULT_SCHEMA, metavar='DB_SCHEMA', - help="PostgreSQL schema for GraphSense cluster mapping table") - pr.add_argument('-u', '--url', help="postgresql://user:password@db_host:port/database") - pr.set_defaults(func=remove_duplicates, url=def_url) if len(sys.argv) == 1: parser.print_help(sys.stderr) sys.exit(1) args = parser.parse_args() + if hasattr(args, 'url') and not args.url: print_warn(url_msg) parser.error(f"No postgresql URL connection was provided. Exiting.") + if not hasattr(args, 'func'): + parser.error(f"No action was requested. Exiting.") + args.func(args) From 7f9ca013647da9bda53769090a40f9dff9e6d664 Mon Sep 17 00:00:00 2001 From: Gibran Gomez Date: Sun, 13 Nov 2022 16:41:07 +0100 Subject: [PATCH 2/7] Integration of confidence and taxonomies --- bin/tagpack-tool | 172 +++++++++++------------------------ tagpack/tagstore.py | 24 +++-- tagpack/taxonomy.py | 25 +++-- tests/test_tagpack.py | 4 +- tests/test_tagpack_schema.py | 4 +- 5 files changed, 92 insertions(+), 137 deletions(-) diff --git a/bin/tagpack-tool b/bin/tagpack-tool index bec5d2e..1da1f06 100755 --- a/bin/tagpack-tool +++ b/bin/tagpack-tool @@ -29,10 +29,8 @@ TAXONOMY_URL = 'https://graphsense.github.io' DEFAULT_CONFIG = { 'taxonomies': { 'entity': f'{TAXONOMY_URL}/DW-VA-Taxonomy/assets/data/entities.csv', - 'abuse': f'{TAXONOMY_URL}/DW-VA-Taxonomy/assets/data/abuses.csv' - }, - 'confidence': { - 'scores': 'tagpack/db/confidence.csv' + 'abuse': f'{TAXONOMY_URL}/DW-VA-Taxonomy/assets/data/abuses.csv', + 'confidence': 'tagpack/db/confidence.csv' } } @@ -40,34 +38,29 @@ DEFAULT_CONFIG = { _DEFAULT_SCHEMA = "tagstore" -def _remote_load_taxonomies(config): +def 
_load_taxonomies(config): if 'taxonomies' not in config: return None taxonomies = {} for key in config['taxonomies']: - taxonomy = _remote_load_taxonomy(config, key) + remote = not (key == 'confidence') + taxonomy = _load_taxonomy(config, key, remote=remote) taxonomies[key] = taxonomy return taxonomies -def _remote_load_taxonomy(config, key): +def _load_taxonomy(config, key, remote=False): if 'taxonomies' not in config: return None uri = config['taxonomies'][key] taxonomy = Taxonomy(key, uri) - taxonomy.load_from_remote() + if remote: + taxonomy.load_from_remote() + else: + taxonomy.load_from_local() return taxonomy -def _local_load_confidence_scores(config): - if 'confidence' not in config: - return None - path = config['confidence']['scores'] - confidence = Confidence(path) - confidence.load_from_local() - return confidence - - def list_taxonomies(args=None): config = _load_config(args.config) @@ -83,19 +76,25 @@ def list_taxonomies(args=None): print_line(f"{count} configured taxonomies", 'success') -def show_taxonomy_concepts(args): +def show_taxonomy_concepts(args, remote=False): config = _load_config(args.config) if 'taxonomies' not in config: print_line("No taxonomies configured", 'fail') return + print_line("Showing concepts of taxonomy {}".format(args.taxonomy)) - print("Remote URI: ", config['taxonomies'][args.taxonomy], "\n") - taxonomy = _remote_load_taxonomy(config, args.taxonomy) + remote = not (args.taxonomy == 'confidence') + uri = config['taxonomies'][args.taxonomy] + print(f"{'Remote' if remote else 'Local'} URI: {uri}\n") + taxonomy = _load_taxonomy(config, args.taxonomy, remote=remote) if args.verbose: - headers = ['Id', 'Label', 'Uri', 'Description'] - table = [[c.id, c.label, c.uri, c.description] + headers = ['Id', 'Label', 'Level', 'Uri', 'Description'] + table = [[c.id, c.label, c.level, c.uri, c.description] for c in taxonomy.concepts] + elif args.taxonomy == 'confidence': + headers = ['Level', 'Label'] + table = [[c.level, c.label] for c 
in taxonomy.concepts] else: headers = ['Id', 'Label'] table = [[c.id, c.label] for c in taxonomy.concepts] @@ -104,7 +103,7 @@ def show_taxonomy_concepts(args): print_line(f"{len(taxonomy.concepts)} taxonomy concepts", 'success') -def insert_taxonomy(args): +def insert_taxonomy(args, remote=False): config = _load_config(args.config) if 'taxonomies' not in config: @@ -124,7 +123,9 @@ def insert_taxonomy(args): for t in tax_keys: print(f"Taxonomy: {t}") try: - taxonomy = _remote_load_taxonomy(config, t) + # TODO this should change when having local taxonomies + remote = not (t == 'confidence') + taxonomy = _load_taxonomy(config, t, remote=remote) tagstore.insert_taxonomy(taxonomy) print(f"{taxonomy.key} | {taxonomy.uri}:", end=' ') @@ -139,71 +140,6 @@ def insert_taxonomy(args): print_line("Aborted insert", 'fail') -def list_confidence_scores(args): - config = _load_config(args.config) - - print_line("Show configured confidence scores") - print_line(f"Configuration: {args.config}", 'info') - count = 0 - if 'confidence' not in config: - print_line("No confidence scores configured", 'fail') - else: - for key, value in config['confidence'].items(): - print_line(value) - count += 1 - print_line(f"{count} configured confidence scores", 'success') - - -def show_confidence_scores(args): - config = _load_config(args.config) - - if 'confidence' not in config: - print_line("No confidence scores configured", 'fail') - return - - print_line("Showing confidence scores") - print("Local path: ", config['confidence']['scores'], "\n") - confidence = _local_load_confidence_scores(config) - if args.verbose: - headers = ['id', 'label', 'description', 'level'] - table = [[c.id, c.label, c.description, c.level] - for c in confidence.scores] - else: - headers = ['Level', 'Label'] - table = [[c.level, c.label] for c in confidence.scores] - - print(tabulate(table, headers=headers)) - print_line(f"{len(confidence.scores)} confidence scores", 'success') - - -def ingest_confidence_scores(args): 
- config = _load_config(args.config) - - if 'confidence' not in config: - print_line("No confidence scores configured", 'fail') - return - - t0 = time.time() - print_line("Confidence scores ingest starts") - - tagstore = TagStore(args.url, args.schema) - - try: - confidence = _local_load_confidence_scores(config) - tagstore.insert_confidence_scores(confidence, args.force) - - print(f"{confidence.path}:", end=' ') - print_success("INSERTED") - - duration = round(time.time() - t0, 2) - print_line( - f"Inserted {len(confidence.scores)} scores in {duration}s", - 'success') - except Exception as e: - print_fail(e) - print_line("Aborted ingestion", 'fail') - - def show_quality_measures(args): print_line("Show quality measures") tagstore = TagStore(args.url, args.schema) @@ -272,7 +208,7 @@ def validate_tagpack(args): print_line("TagPack validation starts") print(f"Path: {args.path}") - taxonomies = _remote_load_taxonomies(config) + taxonomies = _load_taxonomies(config) taxonomy_keys = [key for key in taxonomies.keys()] print(f"Loaded taxonomies: {taxonomy_keys}") @@ -328,7 +264,7 @@ def insert_tagpack(args): print_info(f"Loaded TagPack schema definition: {schema.definition}") config = _load_config(args.config) - taxonomies = _remote_load_taxonomies(config) + taxonomies = _load_taxonomies(config) taxonomy_keys = [key for key in taxonomies.keys()] print(f"Loaded taxonomies: {taxonomy_keys}") @@ -531,7 +467,7 @@ def main(): pxp_s = pxp.add_parser('show', help='show taxonomy concepts') pxp_s.add_argument('taxonomy', metavar='TAXONOMY_KEY', - choices=['abuse', 'entity'], + choices=['abuse', 'entity', 'confidence'], help='the selected taxonomy') pxp_s.add_argument('-v', '--verbose', action='store_true', help="verbose concepts") @@ -541,7 +477,7 @@ def main(): pxp_i = pxp.add_parser('insert', help='insert taxonomy into GraphSense') pxp_i.add_argument('taxonomy', metavar='TAXONOMY_KEY', nargs='?', - choices=['abuse', 'entity'], default=None, + choices=['abuse', 'entity', 
'confidence'], default=None, help='the selected taxonomy') pxp_i.add_argument('--schema', default=_DEFAULT_SCHEMA, metavar='DB_SCHEMA', @@ -551,31 +487,31 @@ def main(): pxp_i.set_defaults(func=insert_taxonomy, url=def_url) - # parser for confidence command - parser_s = subparsers.add_parser("confidence", - help="show confidence scores") - parser_s.set_defaults(func=list_confidence_scores) - - pcp = parser_s.add_subparsers(title='Confidence commands') - - # parser for confidence ingest command - pcp_i = pcp.add_parser('ingest', - help='ingest confidence scores into GraphSense') - pcp_i.add_argument("--force", action='store_true', - help='Force re-insertion of confidence scores.') - pcp_i.add_argument('--schema', - default=_DEFAULT_SCHEMA, metavar='DB_SCHEMA', - help="PostgreSQL schema for confidence tables") - pcp_i.add_argument('-u', '--url', - help="postgresql://user:password@db_host:port/database") - pcp_i.set_defaults(func=ingest_confidence_scores, url=def_url) - - # parser for confidence show command - pcp_s = pcp.add_parser('show', - help='show confidence scores') - pcp_s.add_argument('-v', '--verbose', action='store_true', - help="verbose concepts") - pcp_s.set_defaults(func=show_confidence_scores) +# # parser for confidence command +# parser_s = subparsers.add_parser("confidence", +# help="show confidence scores") +# parser_s.set_defaults(func=list_confidence_scores) +# +# pcp = parser_s.add_subparsers(title='Confidence commands') +# +# # parser for confidence insert command +# pcp_i = pcp.add_parser('insert', +# help='insert confidence scores into GraphSense') +# pcp_i.add_argument("--force", action='store_true', +# help='Force re-insertion of confidence scores.') +# pcp_i.add_argument('--schema', +# default=_DEFAULT_SCHEMA, metavar='DB_SCHEMA', +# help="PostgreSQL schema for confidence tables") +# pcp_i.add_argument('-u', '--url', +# help="postgresql://user:password@db_host:port/database") +# pcp_i.set_defaults(func=ingest_confidence_scores, url=def_url) 
+# +# # parser for confidence show command +# pcp_s = pcp.add_parser('show', +# help='show confidence scores') +# pcp_s.add_argument('-v', '--verbose', action='store_true', +# help="verbose concepts") +# pcp_s.set_defaults(func=show_confidence_scores) # parsers for database housekeeping diff --git a/tagpack/tagstore.py b/tagpack/tagstore.py index 771bb1c..e412564 100644 --- a/tagpack/tagstore.py +++ b/tagpack/tagstore.py @@ -22,23 +22,29 @@ def __init__(self, url, schema): self.existing_packs = None def insert_taxonomy(self, taxonomy): - self.cursor.execute("""INSERT INTO taxonomy (id, source, description) VALUES (%s, %s, %s)""", (taxonomy.key, taxonomy.uri, f"Imported at {datetime.now().isoformat()}")) + if taxonomy.key == 'confidence': + self.insert_confidence_scores(taxonomy) + return + + statement = "INSERT INTO taxonomy (id, source, description) " + statement += "VALUES (%s, %s, %s)" + desc = f"Imported at {datetime.now().isoformat()}" + v = (taxonomy.key, taxonomy.uri, desc) + self.cursor.execute(statement, v) for c in taxonomy.concepts: - self.cursor.execute("""INSERT INTO concept (id, label, taxonomy, source, description) VALUES (%s, %s, %s, %s, %s)""", (c.id, c.label, c.taxonomy.key, c.uri, c.description)) + statement = "INSERT INTO concept (id, label, taxonomy, source, " + statement += "description) VALUES (%s, %s, %s, %s, %s)" + v = (c.id, c.label, c.taxonomy.key, c.uri, c.description) + self.cursor.execute(statement, v) self.conn.commit() - def insert_confidence_scores(self, confidence, force): + def insert_confidence_scores(self, confidence): statement = "INSERT INTO confidence (id, label, description, level)" statement += " VALUES (%s, %s, %s, %s)" - # TODO What to do with foreign key restrictions? 
-# if force: -# print(f"evicting and re-inserting all confidence scores") -# self.cursor.execute("DELETE FROM confidence") - - for c in confidence.scores: + for c in confidence.concepts: values = (c.id, c.label, c.description, c.level) self.cursor.execute(statement, values) diff --git a/tagpack/taxonomy.py b/tagpack/taxonomy.py index 6d4ccf1..54021c0 100644 --- a/tagpack/taxonomy.py +++ b/tagpack/taxonomy.py @@ -17,21 +17,23 @@ class Concept(object): """ - def __init__(self, taxonomy, id, uri, label, description): + def __init__(self, taxonomy, id, uri, label, level, description): self.taxonomy = taxonomy self.id = id self.uri = uri self.label = label + self.level = level self.description = description def to_json(self): return json.dumps( {'taxonomy': self.taxonomy.key, 'id': self.id, 'uri': self.uri, - 'label': self.label, 'description': self.description}) + 'label': self.label, 'level': self.level, + 'description': self.description}) def __str__(self): s = [str(self.taxonomy.key), str(self.id), str(self.uri), - str(self.label), str(self.description)] + str(self.label), str(self.level), str(self.description)] return "[" + " | ".join(s) + "]" @@ -55,17 +57,28 @@ def load_from_remote(self): f = StringIO(response.text) csv_reader = csv.DictReader(f, delimiter=',') for row in csv_reader: + level = row['level'] if 'level' in row else None concept = Concept(self, row['id'], row['uri'], - row['label'], row['description']) + row['label'], level, row['description']) self.concepts.append(concept) + def load_from_local(self): + with open(self.uri, 'r') as f: + csv_reader = csv.DictReader(f, delimiter=',') + for row in csv_reader: + level = row['level'] if 'level' in row else None + concept = Concept(self, row['id'], self.uri, + row['label'], level, row['description']) + self.concepts.append(concept) + @property def concept_ids(self): return [concept.id for concept in self.concepts] - def add_concept(self, concept_id, label, description): + def add_concept(self, concept_id, 
label, level, description): concept_uri = self.uri + '/' + concept_id - concept = Concept(self, concept_id, concept_uri, label, description) + concept = Concept(self, concept_id, concept_uri, label, level, + description) self.concepts.append(concept) def to_json(self): diff --git a/tests/test_tagpack.py b/tests/test_tagpack.py index 4462b6a..fa90f0f 100644 --- a/tests/test_tagpack.py +++ b/tests/test_tagpack.py @@ -19,10 +19,10 @@ def schema(monkeypatch): @pytest.fixture def taxonomies(): tax_entity = Taxonomy('entity', 'http://example.com/entity') - tax_entity.add_concept('exchange', 'Exchange', 'Some description') + tax_entity.add_concept('exchange', 'Exchange', None, 'Some description') tax_abuse = Taxonomy('abuse', 'http://example.com/abuse') - tax_abuse.add_concept('bad_coding', 'Bad coding', 'Really bad') + tax_abuse.add_concept('bad_coding', 'Bad coding', None, 'Really bad') taxonomies = {'entity': tax_entity, 'abuse': tax_abuse} return taxonomies diff --git a/tests/test_tagpack_schema.py b/tests/test_tagpack_schema.py index 373ac2a..fb919e9 100644 --- a/tests/test_tagpack_schema.py +++ b/tests/test_tagpack_schema.py @@ -15,10 +15,10 @@ def schema(monkeypatch): @pytest.fixture def taxonomies(): tax_entity = Taxonomy('entity', 'http://example.com/entity') - tax_entity.add_concept('exchange', 'Exchange', 'Some description') + tax_entity.add_concept('exchange', 'Exchange', None, 'Some description') tax_abuse = Taxonomy('abuse', 'http://example.com/abuse') - tax_abuse.add_concept('bad_coding', 'Bad coding', 'Really bad') + tax_abuse.add_concept('bad_coding', 'Bad coding', None, 'Really bad') taxonomies = {'entity': tax_entity, 'abuse': tax_abuse} return taxonomies From a059fd1dde7d0ba85353546eefff491c6c207e98 Mon Sep 17 00:00:00 2001 From: Gibran Gomez Date: Mon, 14 Nov 2022 13:23:07 +0100 Subject: [PATCH 3/7] Changes to bin/tagpack-tool suggested by flake8 --- bin/tagpack-tool | 229 ++++++++++++++++++++++++----------------------- 1 file changed, 116 
insertions(+), 113 deletions(-) diff --git a/bin/tagpack-tool b/bin/tagpack-tool index 1da1f06..f0197c2 100755 --- a/bin/tagpack-tool +++ b/bin/tagpack-tool @@ -20,7 +20,6 @@ from tagpack.tagpack import TagPack, TagPackFileError, collect_tagpack_files,\ from tagpack.tagpack_schema import TagPackSchema, ValidationError from tagpack.tagstore import TagStore from tagpack.taxonomy import Taxonomy -from tagpack.confidence import Confidence CONFIG_FILE = "config.yaml" @@ -168,7 +167,7 @@ def calc_quality_measures(args): try: qm = tagstore.calculate_quality_measures() - print(f"Global quality measures:") + print("Global quality measures:") if qm is not None: print(f"\tCOUNT: {qm['count']}") print(f"\tAVG: {qm['avg']}") @@ -273,14 +272,16 @@ def insert_tagpack(args): # resolve backlinks to remote repository and relative paths scheck, nogit = not args.no_strict_check, args.no_git prepared_packs = [(m, h, n[0], n[1]) for m, h, n in - [(a, h, get_uri_for_tagpack(base_url, a, scheck, nogit))\ - for h, fs in tagpack_files.items() for a in fs]] + [(a, h, get_uri_for_tagpack(base_url, a, scheck, nogit)) + for h, fs in tagpack_files.items() for a in fs]] prefix = config.get('prefix', '') if args.add_new: # don't re-insert existing tagpacks print_info("Checking which files are new to the tagstore:") - prepared_packs = [(t, h, u, r) for (t, h, u, r) in prepared_packs \ - if not tagstore.tp_exists(prefix, r)] + prepared_packs = [ + (t, h, u, r) for (t, h, u, r) in prepared_packs + if not tagstore.tp_exists(prefix, r) + ] n_ppacks = len(prepared_packs) print_info(f"Collected {n_ppacks} TagPack files\n") @@ -311,7 +312,9 @@ def insert_tagpack(args): msg += "Only tags for supported currencies {} are inserted." print_line(msg.format(no_passed, n_ppacks, no_tags, duration, supported), status) - print_info("Don't forget to run 'db refresh_views' soon to keep the database consistent!") + msg = "Don't forget to run 'db refresh_views' soon to keep the database" + msg += " consistent!" 
+ print_info(msg) def _split_into_chunks(seq, size): @@ -320,7 +323,8 @@ def _split_into_chunks(seq, size): def insert_cluster_mapping(args, batch_size=5_000): tagstore = TagStore(args.url, args.schema) - df = pd.DataFrame(tagstore.get_addresses(args.update), columns=['address', 'currency']) + df = pd.DataFrame(tagstore.get_addresses(args.update), + columns=['address', 'currency']) ks_mapping = json.load(open(args.ks_file)) gs = GraphSense(args.db_nodes, ks_mapping) @@ -338,13 +342,17 @@ def insert_cluster_mapping(args, batch_size=5_000): tagstore.insert_cluster_mappings(clusters) mappings_count += len(clusters) - print_success(f"INSERTED/UPDATED {mappings_count} {currency} cluster mappings") + msg = f"INSERTED/UPDATED {mappings_count} {currency} cluster" + msg += " mappings" + print_success(msg) processed_currencies.append(currency) except Exception as e: print_fail("FAILED", e) duration = round(time.time() - t0, 2) - print_line(f"Inserted {'missing' if not args.update else 'all'} cluster mappings for {processed_currencies} in {duration}s", "success") + msg = f"Inserted {'missing' if not args.update else 'all'} cluster " + msg += f"mappings for {processed_currencies} in {duration}s" + print_line(msg, "success") tagstore.finish_mappings_update(gs.ks_map.keys()) @@ -357,7 +365,8 @@ def update_db(args): def remove_duplicates(args): tagstore = TagStore(args.url, args.schema) rows_deleted = tagstore.remove_duplicates() - print_info(f'{rows_deleted} duplicate tags have been deleted from the database.') + msg = f"{rows_deleted} duplicate tags have been deleted from the database." 
+ print_info(msg) def show_version(): @@ -374,10 +383,11 @@ def read_url_from_env(): url = f"postgresql://{ev['POSTGRES_USER']}:{ev['POSTGRES_PASSWORD']}" url += f"@{ev['POSTGRES_HOST']}:5432/{ev['POSTGRES_DB']}" msg = '' - except KeyError as e: - miss = set([f"POSTGRES_{a}" for a in ['USER','PASSWORD','HOST','DB']]) + except KeyError: + fields = ['USER', 'PASSWORD', 'HOST', 'DB'] + miss = set([f"POSTGRES_{a}" for a in fields]) miss -= set(ev.keys()) - msg = f"Unable to build postgresql URL from environmnet variables: " + msg = "Unable to build postgresql URL from environment variables: " msg += ', '.join(miss) + ' not found.' url = None return url, msg @@ -398,197 +408,190 @@ def main(): subparsers = parser.add_subparsers(title='Commands') - # parser for config command - parser_c = subparsers.add_parser("config", - help="show TagPack Repository config") - parser_c.add_argument('-v', '--verbose', action='store_true', + parser_c = subparsers.add_parser( + "config", help="show TagPack Repository config") + parser_c.add_argument( + '-v', '--verbose', action='store_true', help='verbose configuration') parser_c.set_defaults(func=show_config) - # parsers for tagpack command - parser_tp = subparsers.add_parser("tagpack", - help="tagpack commands") + parser_tp = subparsers.add_parser( + "tagpack", help="tagpack commands") ptp = parser_tp.add_subparsers(title="TagPack commands") # parser for validate command ptp_v = ptp.add_parser("validate", help="validate TagPacks") - ptp_v.add_argument('path', nargs='?', metavar='PATH', default=os.getcwd(), - help='TagPacks file or folder root path (current folder by \ + ptp_v.add_argument( + 'path', nargs='?', metavar='PATH', default=os.getcwd(), + help='TagPack file or folder root path (current folder by \ default)') - ptp_v.add_argument("--no_address_validation", action='store_true', + ptp_v.add_argument( + "--no_address_validation", action='store_true', help='Disables checksum validation of addresses')
ptp_v.set_defaults(func=validate_tagpack) # parser for insert command ptp_i = ptp.add_parser("insert", help="insert TagPacks") - ptp_i.add_argument('path', nargs='?', metavar='PATH', default=os.getcwd(), + ptp_i.add_argument( + 'path', nargs='?', metavar='PATH', default=os.getcwd(), help='TagPacks file or folder root path') - ptp_i.add_argument('--schema', - default=_DEFAULT_SCHEMA, metavar='DB_SCHEMA', + ptp_i.add_argument( + '--schema', default=_DEFAULT_SCHEMA, metavar='DB_SCHEMA', help="PostgreSQL schema for tagpack tables") - ptp_i.add_argument('-u', '--url', + ptp_i.add_argument( + '-u', '--url', help="postgresql://user:password@db_host:port/database") - ptp_i.add_argument('-b', '--batch_size', nargs='?', type=int, default=1000, + ptp_i.add_argument( + '-b', '--batch_size', nargs='?', type=int, default=1000, help='batch size for insert)') - ptp_i.add_argument("--public", action='store_true', - help='By default, tagpacks are declared private in the database. ' - 'Use this switch to declare them public.') - ptp_i.add_argument("--force", action='store_true', + ptp_i.add_argument( + "--public", action='store_true', + help='By default, tagpacks are declared private in the database.\ + Use this switch to declare them public.') + ptp_i.add_argument( + "--force", action='store_true', help='By default, tagpack insertion stops when an already ingested\ tagpack exists in the database. Use this switch to force \ re-insertion.') - ptp_i.add_argument("--add_new", action='store_true', + ptp_i.add_argument( + "--add_new", action='store_true', help='By default, tagpack insertion stops when an already ingested\ tagpack exists in the database. 
Use this switch to ingest \ new tagpacks while skipping over existing ones.') - ptp_i.add_argument("--no_strict_check", action='store_true', + ptp_i.add_argument( + "--no_strict_check", action='store_true', help='Disables check for local modifications in git repository') - ptp_i.add_argument("--no_git", action='store_true', + ptp_i.add_argument( + "--no_git", action='store_true', help='Disables check for local git repository') ptp_i.set_defaults(func=insert_tagpack, url=def_url) - # parser for taxonomy command - parser_t = subparsers.add_parser("taxonomy", - help="taxonomy commands") + parser_t = subparsers.add_parser( + "taxonomy", help="taxonomy commands") parser_t.set_defaults(func=list_taxonomies) pxp = parser_t.add_subparsers(title='Taxonomy commands') # parser for taxonomy list command - pxp_l = pxp.add_parser('list', - help='list taxonomy concepts') + pxp_l = pxp.add_parser( + 'list', help='list taxonomy concepts') pxp_l.set_defaults(func=list_taxonomies) # parser for taxonomy show command - pxp_s = pxp.add_parser('show', - help='show taxonomy concepts') - pxp_s.add_argument('taxonomy', metavar='TAXONOMY_KEY', + pxp_s = pxp.add_parser( + 'show', help='show taxonomy concepts') + pxp_s.add_argument( + 'taxonomy', metavar='TAXONOMY_KEY', choices=['abuse', 'entity', 'confidence'], help='the selected taxonomy') - pxp_s.add_argument('-v', '--verbose', action='store_true', + pxp_s.add_argument( + '-v', '--verbose', action='store_true', help="verbose concepts") pxp_s.set_defaults(func=show_taxonomy_concepts) # parser for taxonomy insert command - pxp_i = pxp.add_parser('insert', - help='insert taxonomy into GraphSense') - pxp_i.add_argument('taxonomy', metavar='TAXONOMY_KEY', nargs='?', + pxp_i = pxp.add_parser( + 'insert', help='insert taxonomy into GraphSense') + pxp_i.add_argument( + 'taxonomy', metavar='TAXONOMY_KEY', nargs='?', choices=['abuse', 'entity', 'confidence'], default=None, help='the selected taxonomy') - pxp_i.add_argument('--schema', - 
default=_DEFAULT_SCHEMA, metavar='DB_SCHEMA', + pxp_i.add_argument( + '--schema', default=_DEFAULT_SCHEMA, metavar='DB_SCHEMA', help="PostgreSQL schema for taxonomy tables") - pxp_i.add_argument('-u', '--url', + pxp_i.add_argument( + '-u', '--url', help="postgresql://user:password@db_host:port/database") pxp_i.set_defaults(func=insert_taxonomy, url=def_url) - -# # parser for confidence command -# parser_s = subparsers.add_parser("confidence", -# help="show confidence scores") -# parser_s.set_defaults(func=list_confidence_scores) -# -# pcp = parser_s.add_subparsers(title='Confidence commands') -# -# # parser for confidence insert command -# pcp_i = pcp.add_parser('insert', -# help='insert confidence scores into GraphSense') -# pcp_i.add_argument("--force", action='store_true', -# help='Force re-insertion of confidence scores.') -# pcp_i.add_argument('--schema', -# default=_DEFAULT_SCHEMA, metavar='DB_SCHEMA', -# help="PostgreSQL schema for confidence tables") -# pcp_i.add_argument('-u', '--url', -# help="postgresql://user:password@db_host:port/database") -# pcp_i.set_defaults(func=ingest_confidence_scores, url=def_url) -# -# # parser for confidence show command -# pcp_s = pcp.add_parser('show', -# help='show confidence scores') -# pcp_s.add_argument('-v', '--verbose', action='store_true', -# help="verbose concepts") -# pcp_s.set_defaults(func=show_confidence_scores) - - # parsers for database housekeeping - parser_db = subparsers.add_parser("tagstore", - help="database housekeeping commands") + parser_db = subparsers.add_parser( + "tagstore", help="database housekeeping commands") pdp = parser_db.add_subparsers(title="TagStore commands") # insert_cluster_mappings [update] - pc = pdp.add_parser("insert_cluster_mappings", - help="insert cluster mappings") - pc.add_argument('-d', '--db_nodes', nargs='+', + pc = pdp.add_parser( + "insert_cluster_mappings", help="insert cluster mappings") + pc.add_argument( + '-d', '--db_nodes', nargs='+', default=['localhost'], 
metavar='DB_NODE', help='Cassandra node(s); default "localhost")') - pc.add_argument('-f', '--ks_file', + pc.add_argument( + '-f', '--ks_file', metavar='KEYSPACE_FILE', help="JSON file with Cassandra keyspaces that contain GraphSense \ cluster mappings") - pc.add_argument('--schema', - default=_DEFAULT_SCHEMA, metavar='DB_SCHEMA', + pc.add_argument( + '--schema', default=_DEFAULT_SCHEMA, metavar='DB_SCHEMA', help="PostgreSQL schema for GraphSense cluster mapping table") - pc.add_argument('-u', '--url', + pc.add_argument( + '-u', '--url', help="postgresql://user:password@db_host:port/database") - pc.add_argument('--update', action='store_true', - help='update all cluster mappings') -# pc.set_defaults(update=False) + pc.add_argument( + '--update', action='store_true', + help='update all cluster mappings') pc.set_defaults(func=insert_cluster_mapping, url=def_url) # refresh_views pd = pdp.add_parser("refresh_views", help='update views') - pd.add_argument('--schema', default=_DEFAULT_SCHEMA, metavar='DB_SCHEMA', + pd.add_argument( + '--schema', default=_DEFAULT_SCHEMA, metavar='DB_SCHEMA', help="PostgreSQL schema for GraphSense cluster mapping table") - pd.add_argument('-u', '--url', + pd.add_argument( + '-u', '--url', help="postgresql://user:password@db_host:port/database") pd.set_defaults(func=update_db, url=def_url) # remove_duplicates pr = pdp.add_parser("remove_duplicates", help='remove duplicate tags') - pr.add_argument('--schema', default=_DEFAULT_SCHEMA, metavar='DB_SCHEMA', + pr.add_argument( + '--schema', default=_DEFAULT_SCHEMA, metavar='DB_SCHEMA', help="PostgreSQL schema for GraphSense cluster mapping table") - pr.add_argument('-u', '--url', + pr.add_argument( + '-u', '--url', help="postgresql://user:password@db_host:port/database") pr.set_defaults(func=remove_duplicates, url=def_url) - # parser for quality measures - parser_q = subparsers.add_parser("quality", - help="calculate tags quality measures") - parser_q.set_defaults(func=show_quality_measures, 
url=def_url, + parser_q = subparsers.add_parser( + "quality", help="calculate tags quality measures") + parser_q.set_defaults( + func=show_quality_measures, url=def_url, schema=_DEFAULT_SCHEMA, currency='') pqp = parser_q.add_subparsers(title='Quality commands') # parser for quality measures calculation - pqp_i = pqp.add_parser('calculate', + pqp_i = pqp.add_parser( + 'calculate', help='calculate quality measures for all tags in the DB') - pqp_i.add_argument('--schema', - default=_DEFAULT_SCHEMA, metavar='DB_SCHEMA', + pqp_i.add_argument( + '--schema', default=_DEFAULT_SCHEMA, metavar='DB_SCHEMA', help="PostgreSQL schema for quality measures tables") - pqp_i.add_argument('-u', '--url', + pqp_i.add_argument( + '-u', '--url', help="postgresql://user:password@db_host:port/database") pqp_i.set_defaults(func=calc_quality_measures, url=def_url) # parser for quality measures show - pqp_s = pqp.add_parser('show', - help='show average quality measures') - pqp_s.add_argument('--currency', - default='', choices=['BCH', 'BTC', 'ETH', 'LTC', 'ZEC'], + pqp_s = pqp.add_parser('show', help='show average quality measures') + pqp_s.add_argument( + '--currency', default='', + choices=['BCH', 'BTC', 'ETH', 'LTC', 'ZEC'], help="Show the avg quality measure for a specific crypto-currency") - pqp_s.add_argument('--schema', - default=_DEFAULT_SCHEMA, metavar='DB_SCHEMA', + pqp_s.add_argument( + '--schema', default=_DEFAULT_SCHEMA, metavar='DB_SCHEMA', help="PostgreSQL schema for quality measures tables") - pqp_s.add_argument('-u', '--url', + pqp_s.add_argument( + '-u', '--url', help="postgresql://user:password@db_host:port/database") pqp_s.set_defaults(func=show_quality_measures, url=def_url) - if len(sys.argv) == 1: parser.print_help(sys.stderr) sys.exit(1) @@ -597,10 +600,10 @@ def main(): if hasattr(args, 'url') and not args.url: print_warn(url_msg) - parser.error(f"No postgresql URL connection was provided. Exiting.") + parser.error("No postgresql URL connection was provided. 
Exiting.") if not hasattr(args, 'func'): - parser.error(f"No action was requested. Exiting.") + parser.error("No action was requested. Exiting.") args.func(args) From 1d23f9ac99193c1a8374ef2ce6da7e2999de555a Mon Sep 17 00:00:00 2001 From: Gibran Gomez Date: Mon, 14 Nov 2022 14:26:14 +0100 Subject: [PATCH 4/7] Changes to tagpack/tagpack.py suggested by flake8 --- tagpack/tagpack.py | 52 ++++++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 20 deletions(-) diff --git a/tagpack/tagpack.py b/tagpack/tagpack.py index 4e99430..ad1cf99 100644 --- a/tagpack/tagpack.py +++ b/tagpack/tagpack.py @@ -63,11 +63,14 @@ def get_uri_for_tagpack(repo_path, tagpack_file, strict_check, no_git): repo = Repo(repo_path) if strict_check and repo.is_dirty(): - print_info(f"Local modifications in {repo.common_dir} detected, please push first.") + msg = f"Local modifications in {repo.common_dir} detected, please " + msg += "push first." + print_info(msg) sys.exit(0) if len(repo.remotes) > 1: - raise ValidationError("Multiple remotes present, cannot decide on backlink. ") + msg = "Multiple remotes present, cannot decide on backlink." + raise ValidationError(msg) rel_path = str(pathlib.Path(tagpack_file).relative_to(repo_path)) @@ -91,7 +94,7 @@ def collect_tagpack_files(path): files = set(glob.glob(path + '/**/*.yaml', recursive=True)) elif os.path.isfile(path): # validate single file files = set([path]) - else: # TODO Error! Should we validate the path within __main__? + else: # TODO Error! Should we validate the path within __main__? 
print_warn(f"Not a valid path: {path}") return {} @@ -107,8 +110,11 @@ def collect_tagpack_files(path): for f in hfiles: header = os.path.dirname(f) # Select files in the same path than header, subdirs only - match_files = set([mfile for mfile in files if (header in mfile \ - and len(mfile.split(os.sep)) > len(f.split(os.sep)))]) + match_files = set( + [mfile for mfile in files if ( + header in mfile + and len(mfile.split(os.sep)) > len(f.split(os.sep)) + )]) tagpack_files[header] = match_files files -= match_files @@ -120,7 +126,7 @@ def collect_tagpack_files(path): for t, fs in tagpack_files.items(): if not fs: msj = f"\tThe header file in {os.path.realpath(t)} won't be " - msj += f"included in any tagpack" + msj += "included in any tagpack" print_warn(msj) tagpack_files = {k: v for k, v in tagpack_files.items() if v} @@ -151,11 +157,13 @@ def __init__(self, uri, contents, schema, taxonomies): self._unique_tags = [] self._duplicates = [] - verifiable_currencies = [a.ticker \ + verifiable_currencies = [ + a.ticker for a in coinaddrvalidator.currency.Currencies.instances.values()] def load_from_file(uri, pathname, schema, taxonomies, header_dir=None): - YamlIncludeConstructor.add_to_loader_class(loader_class=yaml.FullLoader, base_dir=header_dir) + YamlIncludeConstructor.add_to_loader_class( + loader_class=yaml.FullLoader, base_dir=header_dir) if not os.path.isfile(pathname): sys.exit("This program requires {} to be a file" @@ -212,8 +220,11 @@ def get_unique_tags(self): for tag in self.tags: # check if duplicate entry - t = tuple([str(tag.all_fields.get(k)).lower() if k in tag.all_fields.keys() else '' - for k in ['address', 'currency', 'label', 'source']]) + t = tuple( + [str(tag.all_fields.get(k)).lower() + if k in tag.all_fields.keys() else '' + for k in ['address', 'currency', 'label', 'source']] + ) if t in seen: duplicates.append(t) else: @@ -251,6 +262,9 @@ def validate(self): raise ValidationError("no tags found.") # iterate over all tags, check types, 
taxonomy and mandatory use + e2 = "Mandatory tag field {} missing in {}" + e3 = "Field {} not allowed in {}" + e4 = "Value of body field {} must not be empty (None) in {}" for tag in self.get_unique_tags(): # check if mandatory tag fields are defined if not isinstance(tag, Tag): @@ -259,17 +273,16 @@ def validate(self): for schema_field in self.schema.mandatory_tag_fields: if schema_field not in tag.explicit_fields and \ schema_field not in self.tag_fields: - raise ValidationError(f"Mandatory tag field {schema_field} missing in {tag} ") + raise ValidationError(e2.format(schema_field, tag)) for field, value in tag.explicit_fields.items(): # check whether field is defined as body field if field not in self.schema.tag_fields: - raise ValidationError(f"Field {field} not allowed in {tag} ") + raise ValidationError(e3.format(field, tag)) # check for None values if value is None: - raise ValidationError( - f"Value of body field {field} must not be empty (None) in {tag}") + raise ValidationError(e4.format(field, tag)) # check types and taxomomy use try: @@ -279,10 +292,11 @@ def validate(self): raise ValidationError(f'{e} in {tag}') if self._duplicates: - print_info(f"{len(self._duplicates)} duplicate(s) found, starting with {self._duplicates[0]}\n") + msg = f"{len(self._duplicates)} duplicate(s) found, starting " + msg += f"with {self._duplicates[0]}\n" + print_info(msg) return True - def verify_addresses(self): """ Verify valid blockchain addresses using coinaddrvalidator library. 
In @@ -292,9 +306,8 @@ def verify_addresses(self): """ unsupported = defaultdict(set) - + msg = "\tPossible invalid {} address: {}" for tag in self.get_unique_tags(): - currency = tag.all_fields.get('currency', '').lower() cupper = currency.upper() address = tag.all_fields.get('address') @@ -303,7 +316,7 @@ def verify_addresses(self): elif currency in self.verifiable_currencies: v = coinaddrvalidator.validate(currency, address) if not v.valid: - print_warn(f"\tPossible invalid {cupper} address: {address}") + print_warn(msg.format(cupper, address)) else: unsupported[cupper].add(address) @@ -312,7 +325,6 @@ def verify_addresses(self): for a in sorted(addrs): print_warn(f"\t\t{a}") - def to_json(self): """Returns a JSON representation of a TagPack's header""" tagpack = {} From d1be88ceb679becc064566d46e322a14820b4aa5 Mon Sep 17 00:00:00 2001 From: Gibran Gomez Date: Mon, 14 Nov 2022 18:36:57 +0100 Subject: [PATCH 5/7] Changes to tagpack/tagstore.py suggested by flake8 --- tagpack/tagstore.py | 62 ++++++++++++++++++++++++++++++--------------- 1 file changed, 42 insertions(+), 20 deletions(-) diff --git a/tagpack/tagstore.py b/tagpack/tagstore.py index e412564..56a23ab 100644 --- a/tagpack/tagstore.py +++ b/tagpack/tagstore.py @@ -1,5 +1,4 @@ # -*- coding: utf-8 -*- -import os from datetime import datetime import numpy as np @@ -58,18 +57,31 @@ def tp_exists(self, prefix, rel_path): def create_id(self, prefix, rel_path): return ":".join([prefix, rel_path]) if prefix else rel_path - def insert_tagpack(self, tagpack, is_public, force_insert, prefix, rel_path, batch=1000): + def insert_tagpack(self, tagpack, is_public, force_insert, prefix, + rel_path, batch=1000): + tagpack_id = self.create_id(prefix, rel_path) h = _get_header(tagpack, tagpack_id) if force_insert: print(f"evicting and re-inserting tagpack {tagpack_id}") - self.cursor.execute("DELETE FROM tagpack WHERE id = (%s)", (tagpack_id,)) - self.cursor.execute("INSERT INTO tagpack (id, title, description, creator, 
uri, is_public) VALUES (%s,%s,%s,%s,%s,%s)", (h.get('id'), h.get('title'), h.get('description'), h.get('creator'), tagpack.uri, is_public)) + q = "DELETE FROM tagpack WHERE id = (%s)" + self.cursor.execute(q, (tagpack_id,)) + + q = "INSERT INTO tagpack \ + (id, title, description, creator, uri, is_public) \ + VALUES (%s,%s,%s,%s,%s,%s)" + v = (h.get('id'), h.get('title'), h.get('description'), + h.get('creator'), tagpack.uri, is_public) + self.cursor.execute(q, v) self.conn.commit() - addr_sql = "INSERT INTO address (currency, address) VALUES (%s, %s) ON CONFLICT DO NOTHING" - tag_sql = "INSERT INTO tag (label, source, category, abuse, address, currency, is_cluster_definer, confidence, lastmod, context, tagpack ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)" + addr_sql = "INSERT INTO address (currency, address) VALUES (%s, %s) \ + ON CONFLICT DO NOTHING" + tag_sql = "INSERT INTO tag (label, source, category, abuse, address, \ + currency, is_cluster_definer, confidence, lastmod, \ + context, tagpack ) VALUES \ + (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)" tag_data = [] address_data = [] @@ -130,17 +142,23 @@ def get_addresses(self, update_existing): if update_existing: self.cursor.execute("SELECT address, currency FROM address") else: - self.cursor.execute("SELECT address, currency FROM address WHERE NOT is_mapped") + q = "SELECT address, currency FROM address WHERE NOT is_mapped" + self.cursor.execute(q) for record in self.cursor: yield record def insert_cluster_mappings(self, clusters): if not clusters.empty: - q = "INSERT INTO address_cluster_mapping (address, currency, gs_cluster_id , gs_cluster_def_addr , gs_cluster_no_addr )" \ - "VALUES (%s, %s, %s, %s, %s) ON CONFLICT (currency, address) DO UPDATE SET " \ - "gs_cluster_id = EXCLUDED.gs_cluster_id , gs_cluster_def_addr = EXCLUDED.gs_cluster_def_addr , gs_cluster_no_addr = EXCLUDED.gs_cluster_no_addr " + q = "INSERT INTO address_cluster_mapping (address, currency, \ + gs_cluster_id , 
gs_cluster_def_addr , gs_cluster_no_addr) \ + VALUES (%s, %s, %s, %s, %s) ON CONFLICT (currency, address) \ + DO UPDATE SET gs_cluster_id = EXCLUDED.gs_cluster_id, \ + gs_cluster_def_addr = EXCLUDED.gs_cluster_def_addr, \ + gs_cluster_no_addr = EXCLUDED.gs_cluster_no_addr" - data = clusters[['address', 'currency', 'cluster_id', 'cluster_defining_address', 'no_addresses']].to_records(index=False) + cols = ['address', 'currency', 'cluster_id', + 'cluster_defining_address', 'no_addresses'] + data = clusters[cols].to_records(index=False) execute_batch(self.cursor, q, data) self.conn.commit() @@ -149,7 +167,9 @@ def _supports_currency(self, tag): return tag.all_fields.get('currency') in self.supported_currencies def finish_mappings_update(self, keys): - self.cursor.execute('UPDATE address SET is_mapped=true WHERE NOT is_mapped AND currency IN %s', (tuple(keys),)) + q = 'UPDATE address SET is_mapped=true WHERE NOT is_mapped \ + AND currency IN %s' + self.cursor.execute(q, (tuple(keys),)) self.conn.commit() def get_ingested_tagpacks(self) -> list: @@ -175,8 +195,10 @@ def get_quality_measures(self, currency='') -> float: self.cursor.execute(query) keys = ['count', 'avg', 'stddev'] - return {keys[i]:v for row in self.cursor.fetchall() \ - for i,v in enumerate(row)} + return { + keys[i]: v for row in self.cursor.fetchall() + for i, v in enumerate(row) + } def calculate_quality_measures(self) -> float: self.cursor.execute("CALL calculate_quality()") @@ -191,9 +213,12 @@ def _get_tag(tag, tagpack_id): _, address = _get_currency_and_address(tag) - return (label, tag.all_fields.get('source'), tag.all_fields.get('category', None), - tag.all_fields.get('abuse', None), address, tag.all_fields.get('currency'), - tag.all_fields.get('is_cluster_definer'), tag.all_fields.get('confidence'), + return (label, tag.all_fields.get('source'), + tag.all_fields.get('category', None), + tag.all_fields.get('abuse', None), address, + tag.all_fields.get('currency'), + 
tag.all_fields.get('is_cluster_definer'), + tag.all_fields.get('confidence'), lastmod, tag.all_fields.get('context'), tagpack_id) @@ -209,9 +234,6 @@ def _get_header(tagpack, tid): return { 'id': tid, 'title': tc['title'], -# 'source': tc.get('source', os.path.split(tagpack.tags[0].all_fields.get('source'))[0]), 'creator': tc['creator'], 'description': tc.get('description', 'not provided'), -# 'owner': tc.get('owner', 'unknown') } - From 9fa9ef30f20b85054e34df59a0c49fc27f8a0033 Mon Sep 17 00:00:00 2001 From: Gibran Gomez Date: Thu, 17 Nov 2022 11:03:11 +0100 Subject: [PATCH 6/7] Added tagstore init command --- bin/tagpack-tool | 23 +++++++++++++++++++++++ docker/init.sh | 3 ++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/bin/tagpack-tool b/bin/tagpack-tool index 91ed0f1..bdc70a4 100755 --- a/bin/tagpack-tool +++ b/bin/tagpack-tool @@ -370,6 +370,19 @@ def insert_cluster_mapping(args, batch_size=5_000): print_line(f"Inserted {'missing' if not args.update else 'all'} cluster mappings for {processed_currencies} in {duration}s", "success") +def init_db(args): + config = _load_config(args.config) + + if 'taxonomies' not in config: + print_line("No taxonomies configured to init the db", 'fail') + return + + t0 = time.time() + print_line("Init database starts") + insert_taxonomy(args) + duration = round(time.time() - t0, 2) + print_line(f"Init database in {duration}s", 'success') + def update_db(args): tagstore = TagStore(args.url, args.schema) tagstore.refresh_db() @@ -528,6 +541,16 @@ def main(): pdp = parser_db.add_subparsers(title="TagStore commands") + # init the database + pbp = pdp.add_parser("init", help='init the database') + pbp.add_argument( + '--schema', default=_DEFAULT_SCHEMA, metavar='DB_SCHEMA', + help="PostgreSQL schema for GraphSense cluster mapping table") + pbp.add_argument( + '-u', '--url', + help="postgresql://user:password@db_host:port/database") + pbp.set_defaults(func=init_db, url=def_url, taxonomy=None) + # 
insert_cluster_mappings [update] pc = pdp.add_parser( "insert_cluster_mappings", help="insert cluster mappings") diff --git a/docker/init.sh b/docker/init.sh index 6d791e5..a8f4c45 100755 --- a/docker/init.sh +++ b/docker/init.sh @@ -18,4 +18,5 @@ ALTER MATERIALIZED VIEW tagstore.label OWNER TO "$POSTGRES_USER_TAGSTORE"; ALTER MATERIALIZED VIEW tagstore.statistics OWNER TO "$POSTGRES_USER_TAGSTORE"; EOF # insert confidence table -psql -U "$POSTGRES_USER" -d "$POSTGRES_DB" -c "\copy tagstore.confidence(id,label,description,level) from 'tmp/confidence.csv' DELIMITER ',' CSV HEADER;" +# Now these values can be ingested by the tool +#psql -U "$POSTGRES_USER" -d "$POSTGRES_DB" -c "\copy tagstore.confidence(id,label,description,level) from 'tmp/confidence.csv' DELIMITER ',' CSV HEADER;" From 9c949dd381cba08d93f2b344e46d45036b59527c Mon Sep 17 00:00:00 2001 From: Gibran Gomez Date: Thu, 17 Nov 2022 17:11:17 +0100 Subject: [PATCH 7/7] Updated README.md file --- README.md | 66 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 34 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index d195dc7..ba09f94 100644 --- a/README.md +++ b/README.md @@ -19,12 +19,12 @@ Please note that the last feature requires (installation of) a [Postgresql](http Validate a single TagPack file - tagpack-tool validate tests/testfiles/ex_addr_tagpack.yaml - tagpack-tool validate tests/testfiles/ex_entity_tagpack.yaml + tagpack-tool tagpack validate tests/testfiles/ex_addr_tagpack.yaml + tagpack-tool tagpack validate tests/testfiles/ex_entity_tagpack.yaml Recursively validate all TagPacks in (a) given folder(s). - tagpack-tool validate tests/testfiles/ + tagpack-tool tagpack validate tests/testfiles/ Tagpacks are validated against the [tagpack schema](tagpack/conf/tagpack_schema.yaml).
@@ -34,7 +34,7 @@ Confidence settings are validated against a set of acceptable [confidence](tagpa List configured taxonomy keys and URIs - tagpack-tool taxonomy + tagpack-tool taxonomy list Fetch and show concepts of a specific remote taxonomy (referenced by key) @@ -52,28 +52,18 @@ Fetch and show concepts of a specific remote taxonomy (referenced by key) Setup and start a PostgreSQL instance. First, copy `docker/env.template` to `.env` and fill the fields `POSTGRES_PASSWORD` and `POSTGRES_PASSWORD_TAGSTORE`. -Start an PostgreSQL instance using Docker Compose: +Start a PostgreSQL instance using Docker Compose: docker-compose up -d This will automatically create the database schema as defined -in `scripts/tagstore_schema.sql`. +in `tagpack/db/tagstore_schema.sql`. #### Option 2: Use an existing PostgreSQL database Create the schema and tables in a PostgreSQL instance of your choice - psql -h $DBHOST -p $DBPORT -d $DB -U $DBUSER --password -f tagpack/db/tagstore_schema.sql - -### Ingest confidence scores - - psql \ - -h $DBHOST \ - -p $DBPORT \ - -d $DB \ - -U $DBUSER \ - --password \ - -c "\copy tagstore.confidence(id,label,description,level) from 'tagpack/db/confidence.csv' delimiter ',' csv header;" + psql -h $POSTGRES_HOST -d $POSTGRES_DB -U $POSTGRES_USER --password -f tagpack/db/tagstore_schema.sql ### Export .env variables @@ -91,14 +81,26 @@ Or just export each variable using: Then call tagpack-tool. -### Ingest taxonomies +### Initialize the database + +To initialize the database with all the taxonomies needed for ingesting the tagpacks, use: + + tagpack-tool tagstore init + +This will generate a default config.yaml file for the taxonomies. +To create the default configuration file from scratch (when config.yaml does not exist) use: + + tagpack-tool config --verbose + +### Ingest taxonomies and confidence scores -Insert concepts from a remote taxonomy into database, e.g. 
abuse: +To insert individual taxonomies into database, use: tagpack-tool taxonomy insert abuse tagpack-tool taxonomy insert entity + tagpack-tool taxonomy insert confidence -resp. to insert all configured taxonomies at once, simply omit taxonomy name +To insert all configured taxonomies at once, simply omit taxonomy name tagpack-tool taxonomy insert @@ -106,34 +108,34 @@ resp. to insert all configured taxonomies at once, simply omit taxonomy name Insert a single TagPack file or all TagPacks from a given folder - tagpack-tool insert tests/testfiles/simple/ex_addr_tagpack.yaml - tagpack-tool insert tests/testfiles/simple/multiple_tags_for_address.yaml - tagpack-tool insert tests/testfiles/ + tagpack-tool tagpack insert tests/testfiles/simple/ex_addr_tagpack.yaml + tagpack-tool tagpack insert tests/testfiles/simple/multiple_tags_for_address.yaml + tagpack-tool tagpack insert tests/testfiles/ By default, TagPacks are declared as non-public in the database. For public TagPacks, add the `--public` flag to your arguments: - tagpack-tool insert --public tests/testfiles/ + tagpack-tool tagpack insert --public tests/testfiles/ If you try to insert tagpacks that already exist in the database, the ingestion process will be stopped. To force **re-insertion** (if tagpack file contents have been modified), add the `--force` flag to your arguments: - tagpack-tool insert --force tests/testfiles/ + tagpack-tool tagpack insert --force tests/testfiles/ To ingest **new** tagpacks and **skip** over already ingested tagpacks, add the `--add_new` flag to your arguments: - tagpack-tool insert --add_new tests/testfiles/ + tagpack-tool tagpack insert --add_new tests/testfiles/ By default, trying to insert tagpacks from a repository with **local** modifications will **fail**. 
To force insertion despite local modifications, add the ``--no_strict_check`` command-line parameter - tagpack-tool insert --force --add_new tests/testfiles/ + tagpack-tool tagpack insert --force --add_new tests/testfiles/ By default, tagpacks in the TagStore provide a backlink to the original tagpack file in their remote git repository ([see here](README_tagpacks.md#versioning-with-git)). To instead write local file paths instead, add the ``--no_git`` command-line parameter - tagpack-tool insert --no_git --add_new tests/testfiles/ + tagpack-tool tagpack insert --no_git --add_new tests/testfiles/ ### Align ingested attribution tags with GraphSense cluster Ids @@ -146,24 +148,24 @@ suit your Graphsense setup. Then fetch the cluster mappings from your Graphsense instance and insert them into the tagstore database: - tagpack-tool cluster -d $CASSANDRA_HOST -f ks_map.json + tagpack-tool tagstore insert_cluster_mappings -d $CASSANDRA_HOST -f ks_map.json To update ALL cluster-mappings in your tagstore, add the `--update` flag: - tagpack-tool cluster --update -d $CASSANDRA_HOST -f ks_map.json + tagpack-tool tagstore insert_cluster_mappings --update -d $CASSANDRA_HOST -f ks_map.json ### Remove duplicate tags Different tagpacks may contain identical tags - the same label and source for a particular address. To remove such redundant information, run - tagpack db remove_duplicates + tagpack-tool tagstore remove_duplicates ### IMPORTANT: Keeping data consistency after tagpack insertion After all required tagpacks have been ingested, run - tagpack-tool db refresh_views + tagpack-tool tagstore refresh_views to update all materialized views. Depending on the amount of tags contained in the tagstore, this may take a while.