Skip to content

Commit

Permalink
remove unnecessary columns #45
Browse files Browse the repository at this point in the history
  • Loading branch information
mdragaschnig committed Oct 13, 2022
1 parent e9952b0 commit 9673a01
Show file tree
Hide file tree
Showing 4 changed files with 8 additions and 36 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

## [..] unreleased
### Changed
- removed unnecessary columns in cluster mapping table [#45](https://github.com/graphsense/graphsense-tagpack-tool/issues/45)

## [1.1.0] 2022-10-11
### Added
- Support for connection pooling
Expand Down
2 changes: 0 additions & 2 deletions tagpack/db/tagstore_schema.sql
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,6 @@ CREATE TABLE address_cluster_mapping (
gs_cluster_id INTEGER NOT NULL,
gs_cluster_def_addr VARCHAR NOT NULL,
gs_cluster_no_addr INTEGER DEFAULT NULL,
gs_cluster_in_degr INTEGER DEFAULT NULL,
gs_cluster_out_degr INTEGER DEFAULT NULL,
PRIMARY KEY(currency, address),
FOREIGN KEY (currency, address) REFERENCES address (currency, address) ON DELETE CASCADE
);
Expand Down
29 changes: 0 additions & 29 deletions tagpack/graphsense.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,21 +197,12 @@ def get_address_clusters(self, df: DataFrame, currency: str) -> DataFrame:
if currency == "ETH":
df_address_ids["cluster_id"] = df_address_ids["address_id"]
df_address_ids["no_addresses"] = 1
degrees = self.get_address_statistics(df_address_ids, currency)

result = df_address_ids.merge(addresses, on="address")
if len(degrees):
result = result.merge(degrees, on="address_id", how="left")
else:
# no external txs
result["in_degree"] = 0
result["out_degree"] = 0

result.drop("address", axis="columns", inplace=True)
result.rename(columns={"checksum_address": "address"}, inplace=True)
result["cluster_defining_address"] = result["address"]
# no txs have been recorded
result.fillna(value={"in_degree": 0, "out_degree": 0}, inplace=True)

return result

Expand All @@ -233,23 +224,3 @@ def get_address_clusters(self, df: DataFrame, currency: str) -> DataFrame:

return result

def get_address_statistics(self, df, currency):
    """Look up in/out degree statistics for the address ids in ``df``.

    The distinct values of ``df['address_id']`` are paired with their
    ``address_id_group`` (``floor(address_id / bucket_size)``, where the
    bucket size comes from the transformed keyspace's configuration) and
    handed, together with a prepared SELECT on the ``address`` table, to
    ``self._execute_query``.

    Side effect: the session is switched to the currency's transformed
    keyspace.

    :param df: frame holding an ``address_id`` column; it is not modified.
    :param currency: key into ``self.ks_map``.
    :return: the result of ``self._execute_query`` (presumably one row per
        address id with ``in_degree`` / ``out_degree`` -- confirm against
        ``_execute_query``).
    """
    self._check_passed_params(df, currency, 'address_id')

    transformed_keyspace = self.ks_map[currency]['transformed']
    keyspace_config = self._query_keyspace_config(transformed_keyspace)
    self.session.set_keyspace(transformed_keyspace)

    # Work on a private, de-duplicated copy so the caller's frame is untouched.
    unique_ids = df[['address_id']].copy().drop_duplicates()
    bucket_size = keyspace_config['bucket_size']
    unique_ids['address_id_group'] = np.floor(
        unique_ids['address_id'] / bucket_size
    ).astype(int)

    statement = self.session.prepare(
        "SELECT address_id, in_degree, out_degree "
        "FROM address WHERE address_id_group=? and address_id=?"
    )
    # Column order must match the placeholder order in the query.
    bind_columns = ['address_id_group', 'address_id']
    parameters = unique_ids[bind_columns].to_records(index=False)

    return self._execute_query(statement, parameters)
9 changes: 4 additions & 5 deletions tagpack/tagstore.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,12 +110,11 @@ def get_addresses(self, update_existing):

def insert_cluster_mappings(self, clusters):
    """Upsert address-to-cluster mappings into ``address_cluster_mapping``.

    One row per record in ``clusters`` is inserted; on a
    ``(currency, address)`` conflict the existing row's cluster id,
    cluster-defining address and address count are overwritten. The batch
    is committed on ``self.conn``. An empty frame is a no-op (nothing is
    executed or committed).

    Only the five columns that the table still carries are written; the
    former in/out degree columns are neither read from ``clusters`` nor
    bound in the statement.

    :param clusters: DataFrame with the columns ``address``, ``currency``,
        ``cluster_id``, ``cluster_defining_address`` and ``no_addresses``.
    :return: None
    """
    if clusters.empty:
        return

    query = (
        "INSERT INTO address_cluster_mapping "
        "(address, currency, gs_cluster_id , gs_cluster_def_addr , "
        "gs_cluster_no_addr )"
        "VALUES (%s, %s, %s, %s, %s) "
        "ON CONFLICT (currency, address) DO UPDATE SET "
        "gs_cluster_id = EXCLUDED.gs_cluster_id , "
        "gs_cluster_def_addr = EXCLUDED.gs_cluster_def_addr , "
        "gs_cluster_no_addr = EXCLUDED.gs_cluster_no_addr "
    )

    # Column order must match the placeholder order in the INSERT above.
    bind_columns = [
        'address',
        'currency',
        'cluster_id',
        'cluster_defining_address',
        'no_addresses',
    ]
    data = clusters[bind_columns].to_records(index=False)

    execute_batch(self.cursor, query, data)
    self.conn.commit()
Expand Down

0 comments on commit 9673a01

Please sign in to comment.