From 3168f1cb23fece4bc66b57a125460bb9ee3abce0 Mon Sep 17 00:00:00 2001
From: Gesina Phillips
Date: Fri, 2 Feb 2024 15:38:07 -0500
Subject: [PATCH] re-ran linting/formatting after rebase

---
 src/cleanup_whitespace.py | 90 +++++++-------
 src/factor_field.py | 57 +++++----
 src/generate_docs.py | 30 +++--
 src/generate_field_enum_csv.py | 4 +-
 src/generate_field_values_csv.py | 70 +++++------
 src/generate_field_yaml.py | 25 ++--
 src/generate_grid.py | 31 +++--
 src/generate_schema.py | 34 ++---
 src/ingest_validation_tools/check_factory.py | 76 ++++++------
 src/ingest_validation_tools/cli_utils.py | 5 +-
 .../directory_validator.py | 19 +--
 src/ingest_validation_tools/docs_utils.py | 23 ++--
 src/ingest_validation_tools/enums.py | 1 -
 src/ingest_validation_tools/error_report.py | 4 +-
 .../plugin_validator.py | 27 ++--
 src/ingest_validation_tools/schema_loader.py | 22 ++--
 .../table_validator.py | 30 ++---
 src/ingest_validation_tools/upload.py | 117 +++++------------
 .../validation_utils.py | 41 ++----
 .../yaml_include_loader.py | 28 ++---
 src/validate_tsv.py | 12 +-
 src/validate_upload.py | 22 ++--
 22 files changed, 306 insertions(+), 462 deletions(-)

diff --git a/src/cleanup_whitespace.py b/src/cleanup_whitespace.py
index b637d9d89..a2a4b19b2 100755
--- a/src/cleanup_whitespace.py
+++ b/src/cleanup_whitespace.py
@@ -1,34 +1,30 @@
 #!/usr/bin/env python3
+import argparse
 import csv
 import sys
-import argparse
 from pathlib import Path
 
 
 def main():
     parser = argparse.ArgumentParser(
-        description='''
+        description="""
 Use the "--tsv_in"/"--tsv_out" options
 to strip invisible characters from TSVs.
-'''
+"""
     )
     mutex = parser.add_mutually_exclusive_group(required=True)
     mutex.add_argument(
-        '--tsv_in',
-        type=Path,
-        metavar='INPUT',
-        help='TSV to strip padding whitespace from')
+        "--tsv_in", type=Path, metavar="INPUT", help="TSV to strip padding whitespace from"
+    )
     mutex.add_argument(
-        '--encoding_test',
+        "--encoding_test",
         type=str,
-        metavar='ENCODING',
-        help='Generate test TSV using this encoding')
+        metavar="ENCODING",
+        help="Generate test TSV using this encoding",
+    )
     parser.add_argument(
-        '--tsv_out',
-        type=Path,
-        metavar='OUTPUT',
-        help='Destination for clean TSV',
-        required=True)
+        "--tsv_out", type=Path, metavar="OUTPUT", help="Destination for clean TSV", required=True
+    )
 
     args = parser.parse_args()
     if args.encoding_test:
@@ -40,50 +36,52 @@ def main():
 
 def print_encoding_test(encoding, output_path):
     space_chars = [
-        '\u000b',  # vertical tab
-        '\u0020',  # normal space
+        "\u000b",  # vertical tab
+        "\u0020",  # normal space
     ]
-    if encoding != 'ascii':
+    if encoding != "ascii":
         space_chars += [
-            '\u00a0',  # non-breaking space
+            "\u00a0",  # non-breaking space
         ]
-    if encoding not in ['ascii', 'latin-1']:
+    if encoding not in ["ascii", "latin-1"]:
         space_chars += [
-            '\u2003',  # em space
-            '\u3000',  # idiographic space
+            "\u2003",  # em space
+            "\u3000",  # idiographic space
         ]
-    padding = ''.join(space_chars)
+    padding = "".join(space_chars)
 
-    with output_path.open(mode='w', encoding=encoding) as f:
+    with output_path.open(mode="w", encoding=encoding) as f:
         # Header:
         print(
-            'quoted', 'empty', 'padded',
-            '',  # Empty column header: should be cleaned up!
-            sep='\t', file=f
+            "quoted",
+            "empty",
+            "padded",
+            "",  # Empty column header: should be cleaned up!
+            sep="\t",
+            file=f,
         )
         # Body:
         print(
             f'"{padding}123{padding}"',
-            '',
-            f'{padding}123{padding}',
-            '', '',  # Two empty cells: should be cleaned up!
- sep='\t', file=f - ) - print( - '', '', '', '', # More empty cells: should be cleaned up! - sep='\t', file=f + "", + f"{padding}123{padding}", + "", + "", # Two empty cells: should be cleaned up! + sep="\t", + file=f, ) + print("", "", "", "", sep="\t", file=f) # More empty cells: should be cleaned up! # Trailing \n means there's a trailing empty line in the TSV to clean up. return 0 def print_clean_tsv(input_path, output_path): - dialect = 'excel-tab' - writer = csv.writer(output_path.open(mode='w', newline=''), dialect=dialect) + dialect = "excel-tab" + writer = csv.writer(output_path.open(mode="w", newline=""), dialect=dialect) - for encoding in ['utf-8', 'latin-1']: - warn(f'Trying to read {input_path} as {encoding}...') + for encoding in ["utf-8", "latin-1"]: + warn(f"Trying to read {input_path} as {encoding}...") try: # Read the file completely to determine if there are encoding problems, # rather than reading and writing line-by-line. @@ -91,10 +89,10 @@ def print_clean_tsv(input_path, output_path): clean_rows = clean(rows) for row in clean_rows: writer.writerow(row) - warn('Read succeeded') + warn("Read succeeded") return 0 except UnicodeDecodeError as e: - warn(f'Read failed: {e}') + warn(f"Read failed: {e}") continue return 1 @@ -108,7 +106,7 @@ def csv_to_rows(tsv_path, encoding=None, dialect=None): def clean(rows): - ''' + """ >>> clean([ ... [' x', 'y ', ''], ... ['', ' Hi! ', '', ''], @@ -117,7 +115,7 @@ def clean(rows): ... ]) [['x', 'y'], ['', 'Hi!']] - ''' + """ clean_rows = [] max_i = None for row in rows: @@ -126,16 +124,16 @@ def clean(rows): continue if max_i is None: max_i = last_non_empty_index(stripped_row) - clean_rows.append(stripped_row[:max_i + 1]) + clean_rows.append(stripped_row[: max_i + 1]) return clean_rows def last_non_empty_index(values): - ''' + """ >>> last_non_empty_index(['', '', '0', '', '']) 2 - ''' + """ return max(i for i, val in enumerate(values) if len(val)) diff --git a/src/factor_field.py b/src/factor_field.py index 83009be71..d3c5854f3 100755 --- a/src/factor_field.py +++ b/src/factor_field.py @@ -1,32 +1,33 @@ #!/usr/bin/env python3 -import sys import argparse -from pathlib import Path import fileinput +import sys from collections import defaultdict +from pathlib import Path def main(): - parser = argparse.ArgumentParser(description=''' + parser = argparse.ArgumentParser( + description=""" Factor out all variants of a given field. 
- ''') - parser.add_argument( - '--field', - metavar='NAME', - required=True) + """ + ) + parser.add_argument("--field", metavar="NAME", required=True) parser.add_argument( - '--input_dir', + "--input_dir", type=Path, - metavar='IN', - help='Directory to scan for instances of the field', - default='src/ingest_validation_tools/table-schemas/assays') + metavar="IN", + help="Directory to scan for instances of the field", + default="src/ingest_validation_tools/table-schemas/assays", + ) parser.add_argument( - '--output_dir', + "--output_dir", type=Path, - metavar='OUT', - help='Directory to write field extracts', - default='src/ingest_validation_tools/table-schemas/includes/fields') + metavar="OUT", + help="Directory to write field extracts", + default="src/ingest_validation_tools/table-schemas/includes/fields", + ) args = parser.parse_args() factor_field(args.field, args.input_dir, args.output_dir) @@ -46,18 +47,22 @@ def pull(field_name, input_dir): lines=lines, get_file_name=lambda: str(fileinput.filename()), field_name=field_name, - definitions=definitions + definitions=definitions, ) return definitions def push(field_name, definitions, output_dir): - options = [ - f"# {'; '.join(sorted(files))}\n{definition}" - for definition, files in definitions.items() - ] if len(definitions) > 1 else definitions.keys() + options = ( + [ + f"# {'; '.join(sorted(files))}\n{definition}" + for definition, files in definitions.items() + ] + if len(definitions) > 1 + else definitions.keys() + ) if options: - (output_dir / f'{field_name}.yaml').write_text('\n'.join(options)) + (output_dir / f"{field_name}.yaml").write_text("\n".join(options)) else: print(f"Check spelling of field name: '{field_name}'") sys.exit(1) @@ -93,18 +98,18 @@ def replace(lines, get_file_name, field_name, definitions): definition = None for line in lines: # This assumes the YAML has been cleaned up! 
- if f'name: {field_name}' in line: + if f"name: {field_name}" in line: inside = True - print(f'# include: ../includes/fields/{field_name}.yaml') + print(f"# include: ../includes/fields/{field_name}.yaml") definition = line continue - elif inside and line[0] not in ['-', '#']: + elif inside and line[0] not in ["-", "#"]: definition += line continue elif inside: definitions[definition].add(get_file_name()) inside = False - print(line, end='') + print(line, end="") if __name__ == "__main__": diff --git a/src/generate_docs.py b/src/generate_docs.py index 2f201fd33..86e1f9c57 100755 --- a/src/generate_docs.py +++ b/src/generate_docs.py @@ -2,29 +2,29 @@ import argparse import os -from pathlib import Path import sys -from yaml import dump as dump_yaml +from pathlib import Path from tableschema_to_template.create_xlsx import create_xlsx +from yaml import dump as dump_yaml +from ingest_validation_tools.cli_utils import dir_path +from ingest_validation_tools.docs_utils import ( + generate_readme_md, + generate_template_tsv, + get_tsv_name, + get_xlsx_name, +) from ingest_validation_tools.schema_loader import ( - dict_table_schema_versions, - get_table_schema, dict_directory_schema_versions, + dict_table_schema_versions, + enum_maps_to_lists, get_directory_schema, + get_fields_wo_headers, get_is_assay, - enum_maps_to_lists, get_pipeline_infos, - get_fields_wo_headers, -) -from ingest_validation_tools.docs_utils import ( - get_tsv_name, - get_xlsx_name, - generate_template_tsv, - generate_readme_md, + get_table_schema, ) -from ingest_validation_tools.cli_utils import dir_path def main(): @@ -158,9 +158,7 @@ def main(): ) max_schema["fields"] = get_fields_wo_headers(max_schema) if max_schema["fields"][0]["name"] != "is_cedar": - with open( - deprecated_path / get_tsv_name(args.type, is_assay=is_assay), "w" - ) as f: + with open(deprecated_path / get_tsv_name(args.type, is_assay=is_assay), "w") as f: f.write(generate_template_tsv(max_schema)) create_xlsx( max_schema, diff --git a/src/generate_field_enum_csv.py b/src/generate_field_enum_csv.py index 9ccdfa7f3..5df382d74 100755 --- a/src/generate_field_enum_csv.py +++ b/src/generate_field_enum_csv.py @@ -1,11 +1,11 @@ #!/usr/bin/env python3 +import argparse import sys from csv import DictWriter -import argparse from ingest_validation_tools.schema_loader import ( - list_table_schema_versions, get_table_schema, + list_table_schema_versions, ) diff --git a/src/generate_field_values_csv.py b/src/generate_field_values_csv.py index bc9cb89c9..c27f2fefd 100755 --- a/src/generate_field_values_csv.py +++ b/src/generate_field_values_csv.py @@ -1,8 +1,8 @@ #!/usr/bin/env python3 -import sys -from csv import DictWriter import argparse import re +import sys +from csv import DictWriter import requests @@ -10,67 +10,61 @@ def main(): parser = argparse.ArgumentParser() - default_url = 'https://search.api.hubmapconsortium.org/portal/search' - parser.add_argument( - '--url', - default=default_url, - help=f'ES endpoint. Default: {default_url}') + default_url = "https://search.api.hubmapconsortium.org/portal/search" + parser.add_argument("--url", default=default_url, help=f"ES endpoint. Default: {default_url}") default_size = 20 parser.add_argument( - '--size', + "--size", type=int, default=default_size, - help=f'Number of records to pull. Default: {default_size}') + help=f"Number of records to pull. Default: {default_size}", + ) - default_type = 'Dataset' + default_type = "Dataset" parser.add_argument( - '--type', - default=default_type, - help=f'Entity type to query. 
Default: {default_type}') + "--type", default=default_type, help=f"Entity type to query. Default: {default_type}" + ) args = parser.parse_args() query = { - 'post_filter': {'term': {'entity_type.keyword': args.type}}, - 'size': args.size, - '_source': ['metadata.metadata' if args.type == 'Dataset' else 'metadata'] + "post_filter": {"term": {"entity_type.keyword": args.type}}, + "size": args.size, + "_source": ["metadata.metadata" if args.type == "Dataset" else "metadata"], } response = requests.post(args.url, json=query) - hits = response.json()['hits']['hits'] + hits = response.json()["hits"]["hits"] writer = DictWriter( - sys.stdout, - fieldnames=[ - 'uuid', - 'assay_type', - 'field', - 'value'], - extrasaction='ignore') + sys.stdout, fieldnames=["uuid", "assay_type", "field", "value"], extrasaction="ignore" + ) writer.writeheader() for hit in hits: - uuid = hit['_id'] + uuid = hit["_id"] - if 'metadata' not in hit['_source']: + if "metadata" not in hit["_source"]: continue - meta = hit['_source']['metadata'] + meta = hit["_source"]["metadata"] - if 'metadata' in meta: - meta = meta['metadata'] + if "metadata" in meta: + meta = meta["metadata"] for field, value in meta.items(): - if not re.search(r'[A-Za-z]', value): + if not re.search(r"[A-Za-z]", value): continue - writer.writerow({ - 'uuid': uuid, - 'assay_type': meta['assay_type'] if 'assay_type' in meta else 'Sample', - 'field': field, - 'value': value - }) + writer.writerow( + { + "uuid": uuid, + "assay_type": meta["assay_type"] if "assay_type" in meta else "Sample", + "field": field, + "value": value, + } + ) - assert len(hits) < args.size, f'Result truncated at {args.size}' + assert len(hits) < args.size, f"Result truncated at {args.size}" return 0 -if __name__ == '__main__': +if __name__ == "__main__": sys.exit(main()) # pragma: no cover diff --git a/src/generate_field_yaml.py b/src/generate_field_yaml.py index 8c874f9bd..2dce1642c 100755 --- a/src/generate_field_yaml.py +++ b/src/generate_field_yaml.py @@ -1,12 +1,13 @@ #!/usr/bin/env python3 +import argparse import sys + from yaml import dump as dump_yaml -import argparse from ingest_validation_tools.schema_loader import ( - list_table_schema_versions, - get_table_schema, get_is_assay, + get_table_schema, + list_table_schema_versions, ) @@ -47,9 +48,7 @@ def __init__(self): self.default_value = None def add(self, field, schema_name=None, schema=None): - name, attr_value = self._get_name_value( - field, schema_name=schema_name, schema=schema - ) + name, attr_value = self._get_name_value(field, schema_name=schema_name, schema=schema) if self._skip_field(name, attr_value): return if name in self.mapping and self.mapping[name] != attr_value: @@ -66,9 +65,7 @@ def _skip_field(self, name, attr_value): return False def _handle_collision(self, name, attr_value): - raise Exception( - f'{name} is inconsistent: "{self.mapping[name]}" != "{attr_value}"' - ) + raise Exception(f'{name} is inconsistent: "{self.mapping[name]}" != "{attr_value}"') def dump_yaml(self): return dump_yaml(self.mapping) @@ -170,14 +167,8 @@ class AssayMapper(AbstractSetValuedMapper): """ def _get_name_value(self, field, schema_name=None, schema=None): - assay_type_fields = [ - field for field in schema["fields"] if field["name"] == "assay_type" - ] - value = ( - assay_type_fields[0]["constraints"]["enum"] - if len(assay_type_fields) - else [] - ) + assay_type_fields = [field for field in schema["fields"] if field["name"] == "assay_type"] + value = assay_type_fields[0]["constraints"]["enum"] if 
len(assay_type_fields) else [] return field["name"], set(value) diff --git a/src/generate_grid.py b/src/generate_grid.py index 7966330ac..4b6de8dd6 100755 --- a/src/generate_grid.py +++ b/src/generate_grid.py @@ -1,25 +1,22 @@ #!/usr/bin/env python3 import argparse -from pathlib import Path import sys from datetime import datetime +from pathlib import Path -from yaml import safe_load import xlsxwriter +from yaml import safe_load def main(): parser = argparse.ArgumentParser() - parser.add_argument( - 'target', - type=Path, - help='Path for Excel file') + parser.add_argument("target", type=Path, help="Path for Excel file") args = parser.parse_args() - docs_path = Path(__file__).parent.parent / 'docs' - field_schemas = safe_load((docs_path / 'field-schemas.yaml').read_text()) - field_descriptions = safe_load((docs_path / 'field-descriptions.yaml').read_text()) + docs_path = Path(__file__).parent.parent / "docs" + field_schemas = safe_load((docs_path / "field-schemas.yaml").read_text()) + field_descriptions = safe_load((docs_path / "field-descriptions.yaml").read_text()) all_schemas = set() for schemas in field_schemas.values(): @@ -28,18 +25,20 @@ def main(): schema_cols = sorted(all_schemas) workbook = xlsxwriter.Workbook(args.target) - worksheet = workbook.add_worksheet('schemas + fields') - workbook.set_properties({ - # So regenerated Excel files will be binary identical: - 'created': datetime(2000, 1, 1) - }) + worksheet = workbook.add_worksheet("schemas + fields") + workbook.set_properties( + { + # So regenerated Excel files will be binary identical: + "created": datetime(2000, 1, 1) + } + ) # Set column widths: worksheet.set_column(0, 0, 40) worksheet.set_column(1, len(schema_cols), 2) # Format and write headers: - header_format = workbook.add_format({'rotation': 60}) + header_format = workbook.add_format({"rotation": 60}) worksheet.freeze_panes(1, 1) for col, schema in enumerate_from_1(schema_cols): worksheet.write(0, col, schema, header_format) @@ -50,7 +49,7 @@ def main(): worksheet.write_comment(row, 0, field_descriptions[field]) for col, schema in enumerate_from_1(schema_cols): if schema in field_schemas[field]: - worksheet.write(row, col, '✓') + worksheet.write(row, col, "✓") workbook.close() diff --git a/src/generate_schema.py b/src/generate_schema.py index 895d5f705..4adf2b458 100755 --- a/src/generate_schema.py +++ b/src/generate_schema.py @@ -1,8 +1,8 @@ #!/usr/bin/env python3 +import argparse import csv import sys -import argparse from yaml import dump as dump_yaml @@ -10,36 +10,26 @@ def main(): parser = argparse.ArgumentParser() parser.add_argument( - '--fields', - type=argparse.FileType('r'), - help='Two-column TSV: Field name and description') + "--fields", type=argparse.FileType("r"), help="Two-column TSV: Field name and description" + ) args = parser.parse_args() field_list = [] - for row in csv.reader(args.fields, dialect='excel-tab'): + for row in csv.reader(args.fields, dialect="excel-tab"): if len(row) == 2: - field_list.append({ - 'name': row[0], - 'description': row[1] - }) + field_list.append({"name": row[0], "description": row[1]}) field_list[0] = { # Rebuild dict, so 'heading' is first. 
- 'heading': 'Level 2', - **field_list[0] + "heading": "Level 2", + **field_list[0], } level_1_overrides = [ - { - 'name': name, - 'constraints': { - 'enum': ['TODO'] - } - } - for name in ['assay_category', 'assay_type', 'analyte_class'] + {"name": name, "constraints": {"enum": ["TODO"]}} + for name in ["assay_category", "assay_type", "analyte_class"] ] - print(dump_yaml({ - 'doc_url': 'TODO', - 'fields': level_1_overrides + field_list - }, sort_keys=False)) + print( + dump_yaml({"doc_url": "TODO", "fields": level_1_overrides + field_list}, sort_keys=False) + ) return 0 diff --git a/src/ingest_validation_tools/check_factory.py b/src/ingest_validation_tools/check_factory.py index 515de3ab3..fed99ace6 100644 --- a/src/ingest_validation_tools/check_factory.py +++ b/src/ingest_validation_tools/check_factory.py @@ -1,15 +1,14 @@ +import json import re -from string import Template from pathlib import Path +from string import Template from sys import stderr -import json -from typing import List, Callable, Dict, Any, Iterator +from typing import Any, Callable, Dict, Iterator, List import frictionless import requests - -cache_path = Path(__file__).parent / 'url-status-cache.json' +cache_path = Path(__file__).parent / "url-status-cache.json" ErrorIterator = Iterator[frictionless.errors.CellError] Row = Dict[str, Any] @@ -22,64 +21,65 @@ def make_checks(schema) -> List[Check]: factory.make_url_check(), factory.make_sequence_limit_check(), factory.make_units_check(), - factory.make_forbid_na_check() + factory.make_forbid_na_check(), ] -class _CheckFactory(): +class _CheckFactory: def __init__(self, schema): self.schema = schema self._prev_value_run_length = {} def _get_constrained_fields(self, constraint: str) -> Dict[str, List]: - c_c = 'custom_constraints' + c_c = "custom_constraints" return { - f['name']: f[c_c][constraint] for f in self.schema['fields'] + f["name"]: f[c_c][constraint] + for f in self.schema["fields"] if c_c in f and constraint in f[c_c] } def _check_url_status_cache(self, url: str) -> str: if not cache_path.exists(): - cache_path.write_text('{}') + cache_path.write_text("{}") url_status_cache = json.loads(cache_path.read_text()) if url not in url_status_cache: - print(f'Fetching un-cached url: {url}', file=stderr) + print(f"Fetching un-cached url: {url}", file=stderr) try: response = requests.get(url) url_status_cache[url] = response.status_code except Exception as e: url_status_cache[url] = str(e) - cache_path.write_text(json.dumps( - url_status_cache, - sort_keys=True, - indent=2 - )) + cache_path.write_text(json.dumps(url_status_cache, sort_keys=True, indent=2)) return url_status_cache[url] - def make_url_check(self, template=Template( - 'URL returned $status: "$url"')) -> Check: - url_constrained_fields = self._get_constrained_fields('url') + def make_url_check(self, template=Template('URL returned $status: "$url"')) -> Check: + url_constrained_fields = self._get_constrained_fields("url") def url_check(row): for k, v in row.items(): if v is None: continue if k in url_constrained_fields: - prefix = url_constrained_fields[k]['prefix'] - url = f'{prefix}{v}' + prefix = url_constrained_fields[k]["prefix"] + url = f"{prefix}{v}" status = self._check_url_status_cache(url) if status != 200: note = template.substitute(status=status, url=url) yield frictionless.errors.CellError.from_row(row, note=note, field_name=k) + return url_check - def make_sequence_limit_check(self, template=Template( - 'there is a run of $run_length sequential items: Limit is $limit. 
' - 'If correct, reorder rows.')) -> Check: - sequence_limit_fields = self._get_constrained_fields('sequence_limit') + def make_sequence_limit_check( + self, + template=Template( + "there is a run of $run_length sequential items: Limit is $limit. " + "If correct, reorder rows." + ), + ) -> Check: + sequence_limit_fields = self._get_constrained_fields("sequence_limit") def sequence_limit_check(row): - prefix_number_re = r'(?P.*?)(?P\d+)$' + prefix_number_re = r"(?P.*?)(?P\d+)$" for k, v in row.items(): # If the schema declares the field as datetime, # "v" will be a python object, and regexes will error. @@ -101,8 +101,8 @@ def sequence_limit_check(row): prev_value, run_length = self._prev_value_run_length[k] prev_match = re.search(prefix_number_re, prev_value) if ( - match.group('prefix') != prev_match.group('prefix') or - int(match.group('number')) != int(prev_match.group('number')) + 1 + match.group("prefix") != prev_match.group("prefix") + or int(match.group("number")) != int(prev_match.group("number")) + 1 ): self._prev_value_run_length[k] = (v, 1) continue @@ -111,16 +111,17 @@ def sequence_limit_check(row): self._prev_value_run_length[k] = (v, run_length) limit = sequence_limit_fields[k] - assert limit > 1, 'The lowest allowed limit is 2' + assert limit > 1, "The lowest allowed limit is 2" if run_length >= limit: note = template.substitute(run_length=run_length, limit=limit) yield frictionless.errors.CellError.from_row(row, note=note, field_name=k) return sequence_limit_check - def make_units_check(self, template=Template( - 'it requires a value when $units_for is filled')) -> Check: - units_constrained_fields = self._get_constrained_fields('units_for') + def make_units_check( + self, template=Template("it requires a value when $units_for is filled") + ) -> Check: + units_constrained_fields = self._get_constrained_fields("units_for") def units_check(row): for k, v in row.items(): @@ -129,19 +130,22 @@ def units_check(row): if (row[units_for] or row[units_for] == 0) and not row[k]: note = template.substitute(units_for=units_for) yield frictionless.errors.CellError.from_row(row, note=note, field_name=k) + return units_check - def make_forbid_na_check(self, template=Template( - '"N/A" fields should just be left empty')) -> Check: - forbid_na_constrained_fields = self._get_constrained_fields('forbid_na') + def make_forbid_na_check( + self, template=Template('"N/A" fields should just be left empty') + ) -> Check: + forbid_na_constrained_fields = self._get_constrained_fields("forbid_na") def forbid_na_check(row): for k, v in row.items(): if ( k in forbid_na_constrained_fields and isinstance(v, str) - and v.upper() in ['NA', 'N/A'] + and v.upper() in ["NA", "N/A"] ): note = template.substitute() yield frictionless.errors.CellError.from_row(row, note=note, field_name=k) + return forbid_na_check diff --git a/src/ingest_validation_tools/cli_utils.py b/src/ingest_validation_tools/cli_utils.py index 8aabfef58..8da1ee932 100644 --- a/src/ingest_validation_tools/cli_utils.py +++ b/src/ingest_validation_tools/cli_utils.py @@ -13,7 +13,4 @@ def dir_path(s): raise ShowUsageException(f'"{s}" is not a directory') -exit_codes = namedtuple( - 'ExitCode', - ['VALID', 'BUG', 'ERROR', 'INVALID'] -)(0, 1, 2, 3) +exit_codes = namedtuple("ExitCode", ["VALID", "BUG", "ERROR", "INVALID"])(0, 1, 2, 3) diff --git a/src/ingest_validation_tools/directory_validator.py b/src/ingest_validation_tools/directory_validator.py index fb89de93c..3e21606c3 100644 --- a/src/ingest_validation_tools/directory_validator.py +++ 
b/src/ingest_validation_tools/directory_validator.py @@ -1,8 +1,9 @@ import os import re from fnmatch import fnmatch -from typing import List, Dict, Tuple from pathlib import Path +from typing import Dict, List, Tuple + from ingest_validation_tools.yaml_include_loader import load_yaml @@ -40,9 +41,7 @@ def validate_directory( actual_paths += [f"{prefix}/"] # Otherwise this should be a branch directory else: - actual_paths += ( - [f"{prefix}/{name}" for name in file_names] if prefix else file_names - ) + actual_paths += [f"{prefix}/{name}" for name in file_names] if prefix else file_names """TODO: message_munger adds periods at the end of these messages which is very confusing for regex! Also human readability of required_patterns @@ -55,9 +54,7 @@ def validate_directory( assert isinstance(dependency_pattern, str) # Check to see whether there's a match matching_paths = [ - actual - for actual in actual_paths - if re.fullmatch(dependency_pattern, actual) + actual for actual in actual_paths if re.fullmatch(dependency_pattern, actual) ] # If there's a match, then we have to check that the dependent items are also captured # Let's also short-circuit and get failures out of the way @@ -87,9 +84,7 @@ def validate_directory( not_allowed_errors.extend( _get_not_allowed_errors(actual_paths, allowed_patterns, dataset_ignore_globs) ) - required_missing_errors.extend( - _get_missing_required_errors(actual_paths, required_patterns) - ) + required_missing_errors.extend(_get_missing_required_errors(actual_paths, required_patterns)) errors = {} if not_allowed_errors: @@ -113,9 +108,7 @@ def _get_not_allowed_errors( return not_allowed_errors -def _get_missing_required_errors( - paths: List[str], required_patterns: List[str] -) -> List[str]: +def _get_missing_required_errors(paths: List[str], required_patterns: List[str]) -> List[str]: return [ pattern for pattern in required_patterns diff --git a/src/ingest_validation_tools/docs_utils.py b/src/ingest_validation_tools/docs_utils.py index f12647cac..b84a79e36 100644 --- a/src/ingest_validation_tools/docs_utils.py +++ b/src/ingest_validation_tools/docs_utils.py @@ -1,8 +1,8 @@ +import html import re -from string import Template from pathlib import Path -import html -from typing import Dict, Any +from string import Template +from typing import Any, Dict from urllib.parse import urlencode import requests @@ -110,9 +110,7 @@ def _get_portal_names_md(assay_types): if portal_name is None: links.append(f"{assay_type} not in Portal") continue - query = urlencode( - {"mapped_data_types[0]": portal_name, "entity_type[0]": "Dataset"} - ) + query = urlencode({"mapped_data_types[0]": portal_name, "entity_type[0]": "Dataset"}) url = f"https://portal.hubmapconsortium.org/search?{query}" links.append(f"[{portal_name}]({url})") return f'In the portal: {" / ".join(links)}' @@ -179,10 +177,7 @@ def generate_readme_md( if ( is_deprecated or is_draft - or ( - is_cedar - and max_version_table_schema.get("fields", [])[0].get("example", "") == "" - ) + or (is_cedar and max_version_table_schema.get("fields", [])[0].get("example", "") == "") ): tsv_url = "" xlsx_url = "" @@ -581,9 +576,7 @@ def _make_dir_descriptions(dir_schemas, pipeline_infos): """ - pipeline_infos_md = " and ".join( - make_pipeline_link(info) for info in pipeline_infos - ) + pipeline_infos_md = " and ".join(make_pipeline_link(info) for info in pipeline_infos) pipeline_blurb = ( f"The HIVE will process each dataset with\n{pipeline_infos_md}.\n" if pipeline_infos @@ -610,9 +603,7 @@ def 
_make_dir_descriptions(dir_schemas, pipeline_infos): f"Version {v}" f'{" (use this one)" if current_version else ""}' f"\n" - + _make_dir_description( - schema["files"], schema.get("deprecated", False) - ) + + _make_dir_description(schema["files"], schema.get("deprecated", False)) + "\n\n" ) current_version = False diff --git a/src/ingest_validation_tools/enums.py b/src/ingest_validation_tools/enums.py index 166df1f80..cdd8bc175 100644 --- a/src/ingest_validation_tools/enums.py +++ b/src/ingest_validation_tools/enums.py @@ -1,6 +1,5 @@ from typing import Dict, List - """ >>> import requests >>> local_names = shared_enums['assay_type'] diff --git a/src/ingest_validation_tools/error_report.py b/src/ingest_validation_tools/error_report.py index 5dca8c807..3e87e2340 100644 --- a/src/ingest_validation_tools/error_report.py +++ b/src/ingest_validation_tools/error_report.py @@ -1,8 +1,8 @@ -from yaml import Dumper, dump from typing import List, Union -from ingest_validation_tools.message_munger import munge, recursive_munge +from yaml import Dumper, dump +from ingest_validation_tools.message_munger import munge, recursive_munge # Force dump not to use alias syntax. # https://stackoverflow.com/questions/13518819/avoid-references-in-pyyaml diff --git a/src/ingest_validation_tools/plugin_validator.py b/src/ingest_validation_tools/plugin_validator.py index 1caef3ea2..c435d52c1 100644 --- a/src/ingest_validation_tools/plugin_validator.py +++ b/src/ingest_validation_tools/plugin_validator.py @@ -1,8 +1,9 @@ +import inspect import sys from importlib import util -import inspect -from typing import List, Union, Tuple, Iterator, Type from pathlib import Path +from typing import Iterator, List, Tuple, Type, Union + from ingest_validation_tools.schema_loader import SchemaVersion PathOrStr = Union[str, Path] @@ -43,9 +44,7 @@ class Validator(object): """float: a rough measure of cost to run. Lower is better. """ - def __init__( - self, base_paths: List[Path], assay_type: str, contains: List = [], **kwargs - ): + def __init__(self, base_paths: List[Path], assay_type: str, contains: List = [], **kwargs): """ base_paths is expected to be a list of directories. These are the root paths of the directory trees to be validated. 
@@ -59,9 +58,7 @@ def __init__( elif isinstance(base_paths, str): self.paths = [Path(base_paths)] else: - raise Exception( - f"Validator init received base_paths arg as type {type(base_paths)}" - ) + raise Exception(f"Validator init received base_paths arg as type {type(base_paths)}") self.assay_type = assay_type self.contains = contains @@ -94,9 +91,7 @@ def run_plugin_validators_iter( for column_name in ["assay_type", "dataset_type"]: if column_name in sv.rows[0]: if any(row[column_name] != sv.dataset_type for row in sv.rows): - raise ValidatorError( - f"{metadata_path} contains more than one assay type" - ) + raise ValidatorError(f"{metadata_path} contains more than one assay type") data_paths = [] if all("data_path" in row for row in sv.rows): @@ -105,9 +100,7 @@ def run_plugin_validators_iter( if not data_path.is_absolute(): data_path = (Path(metadata_path).parent / data_path).resolve() if not data_path.is_dir(): - raise ValidatorError( - f"{data_path} should be the base directory of a dataset" - ) + raise ValidatorError(f"{data_path} should be the base directory of a dataset") data_paths.append(data_path) for k, v in validation_error_iter( data_paths, sv.dataset_type, plugin_dir, sv.contains, **kwargs @@ -142,11 +135,7 @@ def validation_class_iter(plugin_dir: PathOrStr) -> Iterator[Type[Validator]]: sys.modules[mod_nm] = mod spec.loader.exec_module(mod) # type: ignore for _, obj in inspect.getmembers(mod): - if ( - inspect.isclass(obj) - and obj != Validator - and issubclass(obj, Validator) - ): + if inspect.isclass(obj) and obj != Validator and issubclass(obj, Validator): sort_me.append((obj.cost, obj.description, obj)) sort_me.sort() for _, _, cls in sort_me: diff --git a/src/ingest_validation_tools/schema_loader.py b/src/ingest_validation_tools/schema_loader.py index dd3555db3..f95754ebc 100644 --- a/src/ingest_validation_tools/schema_loader.py +++ b/src/ingest_validation_tools/schema_loader.py @@ -1,15 +1,14 @@ from __future__ import annotations -from dataclasses import dataclass, field -from pathlib import Path +import re from collections import defaultdict from copy import deepcopy -import re -from typing import List, Dict, Set, Sequence, Optional, Union +from dataclasses import dataclass, field +from pathlib import Path +from typing import Dict, List, Optional, Sequence, Set, Union -from ingest_validation_tools.yaml_include_loader import load_yaml from ingest_validation_tools.enums import shared_enums - +from ingest_validation_tools.yaml_include_loader import load_yaml _table_schemas_path = Path(__file__).parent / "table-schemas" _directory_schemas_path = Path(__file__).parent / "directory-schemas" @@ -87,9 +86,7 @@ def get_row_data(self): assay_type = self.rows[0].get("assay_type") dataset_type = self.rows[0].get("dataset_type") if assay_type is not None and dataset_type is not None: - raise PreflightError( - f"Found both assay_type and dataset_type for path {self.path}!" 
- ) + raise PreflightError(f"Found both assay_type and dataset_type for path {self.path}!") else: self.dataset_type = assay_type if assay_type else dataset_type @@ -334,9 +331,7 @@ def _validate_level_1_enum(field: dict) -> None: name = field["name"] if name in shared_enums: - optional = not field["constraints"].get( - "required", True - ) # Default: required = True + optional = not field["constraints"].get("required", True) # Default: required = True actual = set( field["constraints"].get( "enum", @@ -346,8 +341,7 @@ def _validate_level_1_enum(field: dict) -> None: ) allowed = set(shared_enums[name]) assert actual <= allowed, ( - f"Unexpected enums for {name}: {actual - allowed}\n" - f"Allowed: {sorted(allowed)}" + f"Unexpected enums for {name}: {actual - allowed}\n" f"Allowed: {sorted(allowed)}" ) diff --git a/src/ingest_validation_tools/table_validator.py b/src/ingest_validation_tools/table_validator.py index 2867bb07a..25659f0e2 100644 --- a/src/ingest_validation_tools/table_validator.py +++ b/src/ingest_validation_tools/table_validator.py @@ -1,7 +1,7 @@ import csv -from pathlib import Path -from typing import List, Optional, Dict, Union from enum import Enum +from pathlib import Path +from typing import Dict, List, Optional, Union import frictionless @@ -38,9 +38,7 @@ def get_table_errors( schema_fields_dict = {field["name"]: field for field in schema["fields"]} - return [ - _get_message(error, schema_fields_dict, report_type) for error in task["errors"] - ] + return [_get_message(error, schema_fields_dict, report_type) for error in task["errors"]] def _get_pre_flight_errors(tsv_path: Path, schema: dict) -> Optional[List[str]]: @@ -51,9 +49,7 @@ def _get_pre_flight_errors(tsv_path: Path, schema: dict) -> Optional[List[str]]: delimiter = dialect.delimiter expected_delimiter = "\t" if delimiter != expected_delimiter: - return [ - f"Delimiter is {repr(delimiter)}, rather than expected {repr(expected_delimiter)}" - ] + return [f"Delimiter is {repr(delimiter)}, rather than expected {repr(expected_delimiter)}"] # Re-reading the file is ugly, but creating a stream seems gratuitous. with tsv_path.open() as tsv_handle: @@ -75,9 +71,7 @@ def _get_pre_flight_errors(tsv_path: Path, schema: dict) -> Optional[List[str]]: for i_pair in enumerate(zip(fields, expected_fields)): i, (actual, expected) = i_pair if actual != expected: - errors.append( - f'In column {i+1}, found "{actual}", expected "{expected}"' - ) + errors.append(f'In column {i+1}, found "{actual}", expected "{expected}"') return errors return None @@ -119,27 +113,19 @@ def _get_message( if "code" in error and error["code"] == "missing-label": msg = "Bug: Should have been caught pre-flight. File an issue." return msg if return_str else get_json(msg) - if ( - "rowPosition" in error - and "fieldName" in error - and "cell" in error - and "note" in error - ): + if "rowPosition" in error and "fieldName" in error and "cell" in error and "note" in error: msg = ( f'On row {error["rowPosition"]}, column "{error["fieldName"]}", ' f'value "{error["cell"]}" fails because {error["note"]}' f'{f". 
Example: {example}" if example else example}' ) - return ( - msg - if return_str - else get_json(msg, error["rowPosition"], error["fieldName"]) - ) + return msg if return_str else get_json(msg, error["rowPosition"], error["fieldName"]) return error["message"] if __name__ == "__main__": import argparse + from yaml import safe_load parser = argparse.ArgumentParser("CLI just for testing") diff --git a/src/ingest_validation_tools/upload.py b/src/ingest_validation_tools/upload.py index 71612b8d0..a5a10b34f 100644 --- a/src/ingest_validation_tools/upload.py +++ b/src/ingest_validation_tools/upload.py @@ -1,13 +1,13 @@ from __future__ import annotations -from copy import copy -import logging +import logging import subprocess from collections import Counter, defaultdict +from copy import copy from datetime import datetime from fnmatch import fnmatch from pathlib import Path -from typing import Any, Dict, List, Optional, Union, DefaultDict +from typing import Any, DefaultDict, Dict, List, Optional, Union import requests @@ -80,9 +80,7 @@ def __init__( self.globus_token, self.directory_path, ) - for path in ( - tsv_paths if tsv_paths else directory_path.glob(f"*{TSV_SUFFIX}") - ) + for path in (tsv_paths if tsv_paths else directory_path.glob(f"*{TSV_SUFFIX}")) } self.effective_tsv_paths = { @@ -195,9 +193,7 @@ def validation_routine( @property def multi_parent(self) -> Optional[SchemaVersion]: - multi_assay_parents = [ - sv for sv in self.effective_tsv_paths.values() if sv.contains - ] + multi_assay_parents = [sv for sv in self.effective_tsv_paths.values() if sv.contains] if len(multi_assay_parents) == 0: return if len(multi_assay_parents) > 1: @@ -235,23 +231,14 @@ def _check_upload(self) -> dict: def _get_local_tsv_errors(self) -> Optional[Dict]: errors: DefaultDict[str, list] = defaultdict(list) - types_counter = Counter( - [v.schema_name for v in self.effective_tsv_paths.values()] - ) - repeated = [ - assay_type for assay_type, count in types_counter.items() if count > 1 - ] + types_counter = Counter([v.schema_name for v in self.effective_tsv_paths.values()]) + repeated = [assay_type for assay_type, count in types_counter.items() if count > 1] if repeated: raise ErrorDictException( - { - "Repeated": f"There is more than one TSV for this type: {', '.join(repeated)}" - } + {"Repeated": f"There is more than one TSV for this type: {', '.join(repeated)}"} ) for path, schema in self.effective_tsv_paths.items(): - if ( - "data_path" not in schema.rows[0] - or "contributors_path" not in schema.rows[0] - ): + if "data_path" not in schema.rows[0] or "contributors_path" not in schema.rows[0]: errors.update( { f"{path} (as {schema.table_schema})": [ @@ -288,9 +275,7 @@ def _get_directory_errors(self) -> dict: errors.update(dir_errors) return errors - def _get_multi_assay_dir_errors( - self, path: str, dataset_types: Dict - ) -> Optional[Dict]: + def _get_multi_assay_dir_errors(self, path: str, dataset_types: Dict) -> Optional[Dict]: parent = dataset_types.get("parent") # Validate against parent multi-assay type if data_path is in parent TSV if parent: @@ -342,15 +327,11 @@ def _validate( return {f"{tsv_path} (as {schema_version.table_schema})": e} if schema.get("deprecated") and not self.ignore_deprecation: - return { - "Schema version is deprecated": f"{schema_version.table_schema}" - } + return {"Schema version is deprecated": f"{schema_version.table_schema}"} local_errors = get_table_errors(tsv_path, schema, report_type) if local_errors: - local_validated[ - f"{tsv_path} (as 
{schema_version.table_schema})" - ] = local_errors + local_validated[f"{tsv_path} (as {schema_version.table_schema})"] = local_errors else: """ Passing offline=True will skip all API/URL validation; @@ -359,9 +340,7 @@ def _validate( manually (see tests-manual/README.md) """ if self.offline: - logging.info( - f"{tsv_path}: Offline validation selected, cannot reach API." - ) + logging.info(f"{tsv_path}: Offline validation selected, cannot reach API.") return errors else: url_errors = self._cedar_url_checks(tsv_path, schema_version) @@ -395,9 +374,7 @@ def _get_plugin_errors(self, **kwargs) -> dict: # if this is a multi-assay upload, check all files ONCE # using the parent metadata file as a manifest, skipping # non-parent dataset_types - if not self.multi_parent or ( - sv.dataset_type == self.multi_parent.dataset_type - ): + if not self.multi_parent or (sv.dataset_type == self.multi_parent.dataset_type): for k, v in run_plugin_validators_iter( metadata_path, sv, plugin_path, **kwargs ): @@ -420,8 +397,7 @@ def _api_validation( errors["Request Errors"] = response.json() elif response.json()["reporting"] and len(response.json()["reporting"]) > 0: errors["Validation Errors"] = [ - self._get_message(error, report_type) - for error in response.json()["reporting"] + self._get_message(error, report_type) for error in response.json()["reporting"] ] else: logging.info(f"No errors found during CEDAR validation for {tsv_path}!") @@ -467,9 +443,7 @@ def _check_multi_assay_children(self): else: for row in sv.rows: if row.get("data_path"): - self.multi_assay_data_paths[row["data_path"]][ - "components" - ].append(sv) + self.multi_assay_data_paths[row["data_path"]]["components"].append(sv) necessary.remove(sv.dataset_type.lower()) message = "" if necessary: @@ -497,9 +471,7 @@ def _check_data_paths_shared_with_parent(self): # removing from multi_data_paths to trigger error downstream if not related_svs.get("components") and not related_svs.get("parent"): continue - existing_components = { - sv.dataset_type.lower() for sv in related_svs["components"] - } + existing_components = {sv.dataset_type.lower() for sv in related_svs["components"]} # If all required components are not present, add to missing_components # to trigger error downstream diff = set(self.multi_parent.contains).difference(existing_components) @@ -546,13 +518,9 @@ def _cedar_url_checks(self, tsv_path: str, schema_version: SchemaVersion): schema_name = schema_version.schema_name if "sample" in schema_name: - constrained_fields[ - "sample_id" - ] = "https://entity.api.hubmapconsortium.org/entities/" + constrained_fields["sample_id"] = "https://entity.api.hubmapconsortium.org/entities/" elif "organ" in schema_name: - constrained_fields[ - "organ_id" - ] = "https://entity.api.hubmapconsortium.org/entities/" + constrained_fields["organ_id"] = "https://entity.api.hubmapconsortium.org/entities/" elif "contributors" in schema_name: constrained_fields["orcid_id"] = "https://pub.orcid.org/v3.0/" else: @@ -568,15 +536,11 @@ def _cedar_url_checks(self, tsv_path: str, schema_version: SchemaVersion): def _check_matching_urls(self, tsv_path: str, constrained_fields: dict): rows = read_rows(Path(tsv_path), "ascii") fields = rows[0].keys() - missing_fields = [ - k for k in constrained_fields.keys() if k not in fields - ].sort() + missing_fields = [k for k in constrained_fields.keys() if k not in fields].sort() if missing_fields: return {f"Missing fields: {sorted(missing_fields)}"} if not self.globus_token: - return { - "No token": "No token was received to 
check URL fields against Entity API." - } + return {"No token": "No token was received to check URL fields against Entity API."} url_errors = [] for i, row in enumerate(rows): check = {k: v for k, v in row.items() if k in constrained_fields} @@ -592,9 +556,7 @@ def _check_matching_urls(self, tsv_path: str, constrained_fields: dict): ) response.raise_for_status() except Exception as e: - url_errors.append( - f"Row {i+2}, field '{field}' with value '{value}': {e}" - ) + url_errors.append(f"Row {i+2}, field '{field}' with value '{value}': {e}") return url_errors def _get_message( @@ -621,12 +583,7 @@ def _get_message( example = error.get("repairSuggestion", "") return_str = report_type is ReportType.STR - if ( - "errorType" in error - and "column" in error - and "row" in error - and "value" in error - ): + if "errorType" in error and "column" in error and "row" in error and "value" in error: # This may need readability improvements msg = ( f'On row {error["row"]}, column "{error["column"]}", ' @@ -644,9 +601,7 @@ def _check_path( metadata_path: Union[str, Path], ) -> Optional[Dict]: if ref == "data": - errors = self._check_data_path( - schema_version, Path(metadata_path), path_value - ) + errors = self._check_data_path(schema_version, Path(metadata_path), path_value) else: errors = self._check_other_path(Path(metadata_path), path_value, ref) return errors @@ -687,14 +642,10 @@ def _check_data_path( dataset_ignore_globs=self.dataset_ignore_globs, ) if ref_errors: - errors[ - f"{str(metadata_path)}, column 'data_path', value '{path_value}'" - ] = ref_errors + errors[f"{str(metadata_path)}, column 'data_path', value '{path_value}'"] = ref_errors return errors - def _check_other_path( - self, metadata_path: Path, other_path_value: str, path_type: str - ): + def _check_other_path(self, metadata_path: Path, other_path_value: str, path_type: str): errors = {} other_path = self.directory_path / other_path_value try: @@ -705,9 +656,7 @@ def _check_other_path( self.directory_path, ) except Exception as e: - errors[ - f"{metadata_path}, column '{path_type}_path', value '{other_path_value}'" - ] = [e] + errors[f"{metadata_path}, column '{path_type}_path', value '{other_path_value}'"] = [e] return errors tsv_ref_errors = self.validation_routine(tsv_paths={str(other_path): schema}) # TSV located and read, errors found @@ -736,12 +685,8 @@ def __get_no_ref_errors(self) -> dict: and not any([fnmatch(path.name, glob) for glob in self.upload_ignore_globs]) } unreferenced_paths = non_metadata_paths - referenced_data_paths - unreferenced_dir_paths = [ - path for path in unreferenced_paths if Path(path).is_dir() - ] - unreferenced_file_paths = [ - path for path in unreferenced_paths if not Path(path).is_dir() - ] + unreferenced_dir_paths = [path for path in unreferenced_paths if Path(path).is_dir()] + unreferenced_file_paths = [path for path in unreferenced_paths if not Path(path).is_dir()] errors = {} if unreferenced_dir_paths: errors["Directories"] = unreferenced_dir_paths @@ -755,9 +700,7 @@ def __get_multi_ref_errors(self) -> dict: errors = {} data_references = self.__get_data_references() multi_references = [ - path - for path, value in self.multi_assay_data_paths.items() - if value.get("parent") + path for path, value in self.multi_assay_data_paths.items() if value.get("parent") ] for path, references in data_references.items(): if path not in multi_references: diff --git a/src/ingest_validation_tools/validation_utils.py b/src/ingest_validation_tools/validation_utils.py index 8fc1fab24..41c602a51 100644 --- 
a/src/ingest_validation_tools/validation_utils.py +++ b/src/ingest_validation_tools/validation_utils.py @@ -1,21 +1,21 @@ -from collections import defaultdict import json import logging +from collections import defaultdict from csv import DictReader from pathlib import Path, PurePath from typing import DefaultDict, Dict, List, Optional, Union import requests +from ingest_validation_tools.directory_validator import ( + DirectoryValidationErrors, + validate_directory, +) from ingest_validation_tools.schema_loader import ( PreflightError, SchemaVersion, get_directory_schema, ) -from ingest_validation_tools.directory_validator import ( - validate_directory, - DirectoryValidationErrors, -) from ingest_validation_tools.table_validator import ReportType from ingest_validation_tools.test_validation_utils import ( compare_mock_with_response, @@ -64,21 +64,15 @@ def get_schema_version( offline=offline, ) if not assay_type_data: - message = ( - f"Assay data not retrieved from assayclassifier endpoint for TSV {path}." - ) + message = f"Assay data not retrieved from assayclassifier endpoint for TSV {path}." if "assay_type" in rows[0]: message += f' Assay type: {rows[0].get("assay_type")}.' elif "dataset_type" in rows[0]: message += f' Dataset type: {rows[0].get("dataset_type")}.' if "channel_id" in rows[0]: - message += ( - ' Has "channel_id": Antibodies TSV found where metadata TSV expected.' - ) + message += ' Has "channel_id": Antibodies TSV found where metadata TSV expected.' elif "orcid_id" in rows[0]: - message += ( - ' Has "orcid_id": Contributors TSV found where metadata TSV expected.' - ) + message += ' Has "orcid_id": Contributors TSV found where metadata TSV expected.' else: message += f' Column headers in TSV: {", ".join(rows[0].keys())}' raise PreflightError(message) @@ -114,12 +108,8 @@ def get_other_schema_name(rows: List, path: str) -> Optional[str]: else: match = {key: field for key, value in other_types.items() if field in value} other_type.update(match) - if other_type and ( - "assay_name" in rows[0].keys() or "dataset_type" in rows[0].keys() - ): - raise PreflightError( - f"Metadata TSV contains invalid field: {list(other_type.values())}" - ) + if other_type and ("assay_name" in rows[0].keys() or "dataset_type" in rows[0].keys()): + raise PreflightError(f"Metadata TSV contains invalid field: {list(other_type.values())}") if len(other_type) == 1: return list(other_type.keys())[0] elif len(other_type) > 1: @@ -194,9 +184,7 @@ def get_data_dir_errors( if schema is None: return {"Undefined directory schema": dir_schema} - schema_warning_fields = [ - field for field in schema if field in ["deprecated", "draft"] - ] + schema_warning_fields = [field for field in schema if field in ["deprecated", "draft"]] schema_warning = ( {f"{schema_warning_fields[0].title()} directory schema": dir_schema} if schema_warning_fields @@ -204,9 +192,7 @@ def get_data_dir_errors( ) try: - validate_directory( - data_path, schema["files"], dataset_ignore_globs=dataset_ignore_globs - ) + validate_directory(data_path, schema["files"], dataset_ignore_globs=dataset_ignore_globs) except DirectoryValidationErrors as e: # If there are DirectoryValidationErrors and the schema is deprecated/draft... # schema deprecation/draft status is more important. 
@@ -259,8 +245,7 @@ def get_context_of_decode_error(e: UnicodeDecodeError) -> str: def get_other_names(): return [ - p.stem.split("-v")[0] - for p in (Path(__file__).parent / "table-schemas/others").iterdir() + p.stem.split("-v")[0] for p in (Path(__file__).parent / "table-schemas/others").iterdir() ] diff --git a/src/ingest_validation_tools/yaml_include_loader.py b/src/ingest_validation_tools/yaml_include_loader.py index dccd690ec..e2a2f1a74 100644 --- a/src/ingest_validation_tools/yaml_include_loader.py +++ b/src/ingest_validation_tools/yaml_include_loader.py @@ -24,31 +24,27 @@ def load_yaml(path: Path) -> dict: def _load_includes(path: Path, indent: int = 0) -> str: text = path.read_text() - if re.match(r'\s', text[0]): - raise Exception(f'Unexpected padding in the first column: {path}') - if re.search(r'\S.*#\s*include:', text): + if re.match(r"\s", text[0]): + raise Exception(f"Unexpected padding in the first column: {path}") + if re.search(r"\S.*#\s*include:", text): raise Exception(f'"# include:" is not alone on a line in: {path}') expanded_text = re.sub( - r'^([ \t]*)#\s*include:\s*(\S+)', + r"^([ \t]*)#\s*include:\s*(\S+)", _expand_match_generator(path.parent), text, - flags=re.MULTILINE + flags=re.MULTILINE, + ) + indent_string = " " * indent + indented_expanded_text = ( + indent_string + + re.sub(r"^", lambda match: indent_string, expanded_text, flags=re.MULTILINE).strip() ) - indent_string = ' ' * indent - indented_expanded_text = indent_string + re.sub( - r'^', - lambda match: indent_string, - expanded_text, - flags=re.MULTILINE - ).strip() return indented_expanded_text def _expand_match_generator(parent_dir: Path) -> Callable: def _expand_match(match): - expanded = _load_includes( - parent_dir / match.group(2), - indent=len(match.group(1)) - ) + expanded = _load_includes(parent_dir / match.group(2), indent=len(match.group(1))) return expanded + return _expand_match diff --git a/src/validate_tsv.py b/src/validate_tsv.py index bb0046e4e..22c3fff5c 100755 --- a/src/validate_tsv.py +++ b/src/validate_tsv.py @@ -1,18 +1,14 @@ #!/usr/bin/env python3 import argparse -from pathlib import Path -import sys import inspect +import sys +from pathlib import Path -from ingest_validation_tools.error_report import ErrorReport from ingest_validation_tools.cli_utils import ShowUsageException, exit_codes +from ingest_validation_tools.error_report import ErrorReport from ingest_validation_tools.schema_loader import PreflightError -from ingest_validation_tools.validation_utils import ( - get_tsv_errors, - get_schema_version, -) - +from ingest_validation_tools.validation_utils import get_schema_version, get_tsv_errors reminder = ( "REMINDER: Besides running validate_tsv.py, " diff --git a/src/validate_upload.py b/src/validate_upload.py index 4da6b7d2d..84307e220 100755 --- a/src/validate_upload.py +++ b/src/validate_upload.py @@ -1,22 +1,22 @@ #!/usr/bin/env python3 import argparse -import sys -from pathlib import Path import inspect +import sys from datetime import datetime +from pathlib import Path +from ingest_validation_tools.check_factory import cache_path +from ingest_validation_tools.cli_utils import ShowUsageException, dir_path, exit_codes from ingest_validation_tools.error_report import ErrorReport from ingest_validation_tools.upload import Upload -from ingest_validation_tools.cli_utils import ShowUsageException, exit_codes, dir_path -from ingest_validation_tools.check_factory import cache_path directory_schemas = sorted( { p.stem - for p in ( - Path(__file__).parent / 
"ingest_validation_tools" / "directory-schemas" - ).glob("*.yaml") + for p in (Path(__file__).parent / "ingest_validation_tools" / "directory-schemas").glob( + "*.yaml" + ) } ) @@ -105,9 +105,7 @@ def make_parser(): # Are there plugin validations? - parser.add_argument( - "--plugin_directory", action="store", help="Directory of plugin tests." - ) + parser.add_argument("--plugin_directory", action="store", help="Directory of plugin tests.") parser.add_argument( "--run_plugins", required=False, @@ -126,9 +124,7 @@ def make_parser(): error_report_methods = [ name for (name, _) in inspect.getmembers(ErrorReport) if name.startswith("as_") ] - parser.add_argument( - "--output", choices=error_report_methods, default="as_text_list" - ) + parser.add_argument("--output", choices=error_report_methods, default="as_text_list") parser.add_argument( "--add_notes",