diff --git a/CHANGELOG.md b/CHANGELOG.md
index eafb333..8d3d541 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,9 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
+## [..] unreleased
+### Changed
+- removed unnecessary columns in cluster mapping table [#45](https://github.com/graphsense/graphsense-tagpack-tool/issues/45)
+
 ## [1.1.0] 2022-10-11
 ### Added
 - Support for connection pooling
diff --git a/tagpack/db/tagstore_schema.sql b/tagpack/db/tagstore_schema.sql
index ffb87fd..027ffbb 100644
--- a/tagpack/db/tagstore_schema.sql
+++ b/tagpack/db/tagstore_schema.sql
@@ -88,8 +88,6 @@ CREATE TABLE address_cluster_mapping (
     gs_cluster_id INTEGER NOT NULL,
     gs_cluster_def_addr VARCHAR NOT NULL,
     gs_cluster_no_addr INTEGER DEFAULT NULL,
-    gs_cluster_in_degr INTEGER DEFAULT NULL,
-    gs_cluster_out_degr INTEGER DEFAULT NULL,
     PRIMARY KEY(currency, address),
     FOREIGN KEY (currency, address) REFERENCES address (currency, address) ON DELETE CASCADE
 );
diff --git a/tagpack/graphsense.py b/tagpack/graphsense.py
index d389241..ac58c25 100644
--- a/tagpack/graphsense.py
+++ b/tagpack/graphsense.py
@@ -197,21 +197,12 @@ def get_address_clusters(self, df: DataFrame, currency: str) -> DataFrame:
 
         if currency == "ETH":
             df_address_ids["cluster_id"] = df_address_ids["address_id"]
             df_address_ids["no_addresses"] = 1
-            degrees = self.get_address_statistics(df_address_ids, currency)
 
             result = df_address_ids.merge(addresses, on="address")
-            if len(degrees):
-                result = result.merge(degrees, on="address_id", how="left")
-            else:
-                # no external txs
-                result["in_degree"] = 0
-                result["out_degree"] = 0
             result.drop("address", axis="columns", inplace=True)
             result.rename(columns={"checksum_address": "address"}, inplace=True)
             result["cluster_defining_address"] = result["address"]
-            # no txs have been recorded
-            result.fillna(value={"in_degree": 0, "out_degree": 0}, inplace=True)
 
             return result
 
@@ -233,23 +224,3 @@ def get_address_clusters(self, df: DataFrame, currency: str) -> DataFrame:
 
         return result
 
-    def get_address_statistics(self, df, currency):
-        """Get statistics for address ids."""
-        self._check_passed_params(df, currency, 'address_id')
-
-        keyspace = self.ks_map[currency]['transformed']
-        ks_config = self._query_keyspace_config(keyspace)
-        self.session.set_keyspace(keyspace)
-
-        df_temp = df[['address_id']].copy()
-        df_temp = df_temp.drop_duplicates()
-        df_temp['address_id_group'] = np.floor(
-            df_temp['address_id'] / ks_config['bucket_size']).astype(int)
-
-        query = "SELECT address_id, in_degree, out_degree " + \
-                "FROM address WHERE address_id_group=? and address_id=?"
-        statement = self.session.prepare(query)
-        parameters = df_temp[
-            ['address_id_group', 'address_id']].to_records(index=False)
-
-        return self._execute_query(statement, parameters)
diff --git a/tagpack/tagstore.py b/tagpack/tagstore.py
index 8c12802..66dd59c 100644
--- a/tagpack/tagstore.py
+++ b/tagpack/tagstore.py
@@ -110,12 +110,11 @@ def get_addresses(self, update_existing):
 
     def insert_cluster_mappings(self, clusters):
         if not clusters.empty:
-            q = "INSERT INTO address_cluster_mapping (address, currency, gs_cluster_id , gs_cluster_def_addr , gs_cluster_no_addr , gs_cluster_in_degr , gs_cluster_out_degr)" \
-                "VALUES (%s, %s, %s, %s, %s, %s, %s) ON CONFLICT (currency, address) DO UPDATE SET " \
-                "gs_cluster_id = EXCLUDED.gs_cluster_id , gs_cluster_def_addr = EXCLUDED.gs_cluster_def_addr , gs_cluster_no_addr = EXCLUDED.gs_cluster_no_addr , " \
-                "gs_cluster_in_degr = EXCLUDED.gs_cluster_in_degr , gs_cluster_out_degr = EXCLUDED.gs_cluster_out_degr"
+            q = "INSERT INTO address_cluster_mapping (address, currency, gs_cluster_id , gs_cluster_def_addr , gs_cluster_no_addr )" \
+                "VALUES (%s, %s, %s, %s, %s) ON CONFLICT (currency, address) DO UPDATE SET " \
+                "gs_cluster_id = EXCLUDED.gs_cluster_id , gs_cluster_def_addr = EXCLUDED.gs_cluster_def_addr , gs_cluster_no_addr = EXCLUDED.gs_cluster_no_addr "
 
-            data = clusters[['address', 'currency', 'cluster_id', 'cluster_defining_address', 'no_addresses', 'in_degree', 'out_degree']].to_records(index=False)
+            data = clusters[['address', 'currency', 'cluster_id', 'cluster_defining_address', 'no_addresses']].to_records(index=False)
 
             execute_batch(self.cursor, q, data)
             self.conn.commit()
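
For reference, the upsert that the revised `insert_cluster_mappings` issues can be exercised on its own. The following is a minimal, hypothetical sketch assuming psycopg2 and pandas are installed; the DSN and the sample row are invented for illustration, while the statement, table, and column names mirror the patch above.

```python
# Minimal sketch of the batched upsert performed by insert_cluster_mappings().
# The DSN and the sample row are hypothetical; the query, table, and column
# names follow the diff above.
import pandas as pd
import psycopg2
from psycopg2.extras import execute_batch

q = (
    "INSERT INTO address_cluster_mapping "
    "(address, currency, gs_cluster_id, gs_cluster_def_addr, gs_cluster_no_addr) "
    "VALUES (%s, %s, %s, %s, %s) "
    "ON CONFLICT (currency, address) DO UPDATE SET "
    "gs_cluster_id = EXCLUDED.gs_cluster_id, "
    "gs_cluster_def_addr = EXCLUDED.gs_cluster_def_addr, "
    "gs_cluster_no_addr = EXCLUDED.gs_cluster_no_addr"
)

# One invented cluster-mapping row, shaped like the `clusters` DataFrame.
clusters = pd.DataFrame([{
    "address": "1ExampleAddr",
    "currency": "BTC",
    "cluster_id": 42,
    "cluster_defining_address": "1ExampleAddr",
    "no_addresses": 3,
}])

# .tolist() converts the numpy record array into plain Python tuples, which
# psycopg2 adapts without extra type registration.
data = clusters[
    ["address", "currency", "cluster_id",
     "cluster_defining_address", "no_addresses"]
].to_records(index=False).tolist()

conn = psycopg2.connect("dbname=tagstore")  # illustrative DSN
with conn, conn.cursor() as cursor:
    # Rewrites already-mapped (currency, address) pairs in place instead of
    # failing on the primary-key conflict.
    execute_batch(cursor, q, data)
```

Running the sketch twice is idempotent: the second pass hits the `(currency, address)` primary key and merely updates the `gs_cluster_*` columns via `EXCLUDED`.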