Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/filter large files #86

Merged
merged 2 commits on
May 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/tagpack/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ def low_quality_addresses(args):
if args.cluster:
print("\nSets of tags appearing in several addresses:")
s_int = sorted(intersections, key=lambda x: x[1], reverse=True)
for (k, v) in s_int:
for k, v in s_int:
if v > 1:
print(f"\t{v}: {', '.join(k)}")
else:
Expand Down
3 changes: 1 addition & 2 deletions src/tagpack/graphsense.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def _execute_query(self, statement, parameters):

i = 0
all_results = []
for (success, result) in results:
for success, result in results:
if not success:
print("failed" + result)
else:
Expand All @@ -59,7 +59,6 @@ def contains_keyspace_mapping(self, currency: str) -> bool:
return currency in self.ks_map

def _check_passed_params(self, df: DataFrame, currency: str, req_column: str):

if df.empty:
raise Exception(f"Received empty dataframe for currency {currency}")
if req_column not in df.columns:
Expand Down
16 changes: 14 additions & 2 deletions src/tagpack/tagpack.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def get_uri_for_tagpack(repo_path, tagpack_file, strict_check, no_git):
return res, rel_path, default_prefix


def collect_tagpack_files(path, search_actorpacks=False):
def collect_tagpack_files(path, search_actorpacks=False, max_mb=200):
"""
Collect Tagpack YAML files from the given path. This function returns a
dict made of sets. Each key of the dict is the corresponding header path of
Expand Down Expand Up @@ -144,6 +144,19 @@ def collect_tagpack_files(path, search_actorpacks=False):

tagpack_files = {k: v for k, v in tagpack_files.items() if v}

# exclude files that are too large
max_bytes = max_mb * 1048576
for _, files in tagpack_files.items():
for f in files.copy():
if os.stat(f).st_size > max_bytes:
print_warn(
f"{f} is too large and will be not be processed: "
f"{(os.stat(f).st_size / 1048576):.2f} mb, current "
f"max file size is {max_mb} mb. "
"Please split the file to be processed."
)
files.remove(f)

return tagpack_files


Expand Down Expand Up @@ -397,7 +410,6 @@ def get_user_choice_cached(hl, hl_context_str, cache):
if hl in cache:
return cache[hl]
else:

candidates = find_actor_candidates(hl)
if len(candidates) == 0:
choice = None
Expand Down
3 changes: 0 additions & 3 deletions src/tagpack/tagstore.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,6 @@ def insert_taxonomy(self, taxonomy):
"(%(label)s,%(taxonomy)s,%(source)s,%(description)s);"
)
for c in taxonomy.concepts:

v = {
"id": c.id,
"label": c.label,
Expand All @@ -88,7 +87,6 @@ def insert_taxonomy(self, taxonomy):

@auto_commit
def insert_confidence_scores(self, confidence):

statement = (
"INSERT INTO confidence (id, label, description, level)"
" VALUES (%(id)s,%(label)s,%(description)s,%(level)s)"
Expand Down Expand Up @@ -120,7 +118,6 @@ def create_id(self, prefix, rel_path):
def insert_tagpack(
self, tagpack, is_public, force_insert, prefix, rel_path, batch=1000
):

tagpack_id = self.create_id(prefix, rel_path)
h = _get_header(tagpack, tagpack_id)

Expand Down
1 change: 0 additions & 1 deletion src/tagpack/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,6 @@ def get_github_repo_url(github_url):


def open_localfile_with_pkgresource_fallback(path):

if os.path.isfile(path):
return open(path, "r")
else:
Expand Down
10 changes: 5 additions & 5 deletions tests/test_actorpack_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,29 +115,29 @@ def test_field_no_taxonomy(schema):
def test_check_type(schema):
for field, value in field_values.items():
assert schema.check_type(field, value)
with (pytest.raises(ValidationError)) as e:
with pytest.raises(ValidationError) as e:
assert schema.check_type(field, 5)
msg = f"Field {field} must be of type {field_types[field]}"
assert msg in str(e.value)


def test_check_taxonomies(schema, taxonomies):
schema.schema["actor"]["test"] = {"taxonomy": "nonexistent"}
with (pytest.raises(ValidationError)) as e:
with pytest.raises(ValidationError) as e:
assert schema.check_taxonomies("test", "invalid", None)
assert "No taxonomies loaded" in str(e.value)

schema.schema["actor"]["invalidtax"] = {"taxonomy": "nonexistent"}
with (pytest.raises(ValidationError)) as e:
with pytest.raises(ValidationError) as e:
assert schema.check_taxonomies("invalidtax", "value", taxonomies)
assert "Unknown taxonomy in" in str(e.value)

assert schema.check_taxonomies("categories", "exchange", taxonomies)
with (pytest.raises(ValidationError)) as e:
with pytest.raises(ValidationError) as e:
assert schema.check_taxonomies("categories", "test", taxonomies)
assert "Undefined concept test for categories field" in str(e.value)

assert schema.check_taxonomies("jurisdictions", "MX", taxonomies)
with (pytest.raises(ValidationError)) as e:
with pytest.raises(ValidationError) as e:
assert schema.check_taxonomies("jurisdictions", "test", taxonomies)
assert "Undefined concept test for jurisdictions field" in str(e.value)
12 changes: 6 additions & 6 deletions tests/test_tagpack_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,33 +110,33 @@ def test_field_no_taxonomy(schema):

def test_check_type(schema):
assert schema.check_type("title", "some test string")
with (pytest.raises(ValidationError)) as e:
with pytest.raises(ValidationError) as e:
assert schema.check_type("title", 5)
assert "Field title must be of type text" in str(e.value)

assert schema.check_type("lastmod", date.fromisoformat("2021-04-21"))
with (pytest.raises(ValidationError)) as e:
with pytest.raises(ValidationError) as e:
assert schema.check_type("lastmod", 5)
assert "Field lastmod must be of type datetime" in str(e.value)

assert schema.check_type("address", "string")
with (pytest.raises(ValidationError)) as e:
with pytest.raises(ValidationError) as e:
assert schema.check_type("address", 0x2342)
assert "Field address must be of type text" in str(e.value)

assert schema.check_type("tags", [{"a": 1}, {"b": 2}])
with (pytest.raises(ValidationError)) as e:
with pytest.raises(ValidationError) as e:
assert schema.check_type("tags", "56abc")
assert "Field tags must be of type list" in str(e.value)


def test_check_taxonomies(schema, taxonomies):
assert schema.check_taxonomies("category", "exchange", taxonomies)
with (pytest.raises(ValidationError)) as e:
with pytest.raises(ValidationError) as e:
assert schema.check_taxonomies("category", "test", taxonomies)
assert "Undefined concept test in field category" in str(e.value)

schema.schema["tag"]["dummy"] = {"taxonomy": "test"}
with (pytest.raises(ValidationError)) as e:
with pytest.raises(ValidationError) as e:
assert schema.check_taxonomies("dummy", "test", taxonomies)
assert "Unknown taxonomy test" in str(e.value)