Commit 3168f1c
re-ran linting/formatting after rebase
gesinaphillips committed Feb 2, 2024
1 parent 3f2022a commit 3168f1c
Showing 22 changed files with 306 additions and 462 deletions.
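Note: the hunks below are consistent with running black (quote normalization, magic trailing commas, collapsing short call sites) and isort (grouped, alphabetized imports) over the tree. Neither tool nor its configuration is named on this page, so treat the following as a sketch under that assumption; the line length is a guess based on the ~93-character collapsed argparse calls below, not a confirmed setting.

import black  # assumes the black package is installed

# Reproduce the characteristic rewrite seen in these hunks: single quotes
# become double quotes, and a short argument list collapses onto one line.
before = "parser.add_argument(\n    '--tsv_in',\n    type=Path,\n    metavar='INPUT')\n"
after = black.format_str(before, mode=black.Mode(line_length=99))
print(after)  # parser.add_argument("--tsv_in", type=Path, metavar="INPUT")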
90 changes: 44 additions & 46 deletions src/cleanup_whitespace.py
@@ -1,34 +1,30 @@
 #!/usr/bin/env python3
 
+import argparse
 import csv
 import sys
-import argparse
 from pathlib import Path
 
 
 def main():
     parser = argparse.ArgumentParser(
-        description='''
+        description="""
 Use the "--tsv_in"/"--tsv_out" options to strip invisible characters from TSVs.
-'''
+"""
     )
     mutex = parser.add_mutually_exclusive_group(required=True)
     mutex.add_argument(
-        '--tsv_in',
-        type=Path,
-        metavar='INPUT',
-        help='TSV to strip padding whitespace from')
+        "--tsv_in", type=Path, metavar="INPUT", help="TSV to strip padding whitespace from"
+    )
     mutex.add_argument(
-        '--encoding_test',
+        "--encoding_test",
         type=str,
-        metavar='ENCODING',
-        help='Generate test TSV using this encoding')
+        metavar="ENCODING",
+        help="Generate test TSV using this encoding",
+    )
     parser.add_argument(
-        '--tsv_out',
-        type=Path,
-        metavar='OUTPUT',
-        help='Destination for clean TSV',
-        required=True)
+        "--tsv_out", type=Path, metavar="OUTPUT", help="Destination for clean TSV", required=True
+    )
     args = parser.parse_args()
 
     if args.encoding_test:
@@ -40,61 +36,63 @@ def main():
 
 def print_encoding_test(encoding, output_path):
     space_chars = [
-        '\u000b',  # vertical tab
-        '\u0020',  # normal space
+        "\u000b",  # vertical tab
+        "\u0020",  # normal space
     ]
-    if encoding != 'ascii':
+    if encoding != "ascii":
         space_chars += [
-            '\u00a0',  # non-breaking space
+            "\u00a0",  # non-breaking space
         ]
-    if encoding not in ['ascii', 'latin-1']:
+    if encoding not in ["ascii", "latin-1"]:
         space_chars += [
-            '\u2003',  # em space
-            '\u3000',  # idiographic space
+            "\u2003",  # em space
+            "\u3000",  # idiographic space
         ]
-    padding = ''.join(space_chars)
+    padding = "".join(space_chars)
 
-    with output_path.open(mode='w', encoding=encoding) as f:
+    with output_path.open(mode="w", encoding=encoding) as f:
         # Header:
         print(
-            'quoted', 'empty', 'padded',
-            '',  # Empty column header: should be cleaned up!
-            sep='\t', file=f
+            "quoted",
+            "empty",
+            "padded",
+            "",  # Empty column header: should be cleaned up!
+            sep="\t",
+            file=f,
         )
 
         # Body:
         print(
             f'"{padding}123{padding}"',
-            '',
-            f'{padding}123{padding}',
-            '', '',  # Two empty cells: should be cleaned up!
-            sep='\t', file=f
-        )
-        print(
-            '', '', '', '',  # More empty cells: should be cleaned up!
-            sep='\t', file=f
+            "",
+            f"{padding}123{padding}",
+            "",
+            "",  # Two empty cells: should be cleaned up!
+            sep="\t",
+            file=f,
         )
+        print("", "", "", "", sep="\t", file=f)  # More empty cells: should be cleaned up!
         # Trailing \n means there's a trailing empty line in the TSV to clean up.
     return 0
 
 
 def print_clean_tsv(input_path, output_path):
-    dialect = 'excel-tab'
-    writer = csv.writer(output_path.open(mode='w', newline=''), dialect=dialect)
+    dialect = "excel-tab"
+    writer = csv.writer(output_path.open(mode="w", newline=""), dialect=dialect)
 
-    for encoding in ['utf-8', 'latin-1']:
-        warn(f'Trying to read {input_path} as {encoding}...')
+    for encoding in ["utf-8", "latin-1"]:
+        warn(f"Trying to read {input_path} as {encoding}...")
         try:
             # Read the file completely to determine if there are encoding problems,
             # rather than reading and writing line-by-line.
             rows = csv_to_rows(input_path, encoding=encoding, dialect=dialect)
             clean_rows = clean(rows)
             for row in clean_rows:
                 writer.writerow(row)
-            warn('Read succeeded')
+            warn("Read succeeded")
             return 0
         except UnicodeDecodeError as e:
-            warn(f'Read failed: {e}')
+            warn(f"Read failed: {e}")
             continue
     return 1
 
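The encoding gates in print_encoding_test above reflect what each codec can represent: ASCII has none of the exotic spaces, latin-1 adds only the non-breaking space, and the em and ideographic spaces need a Unicode encoding. A stdlib-only check of that claim:

for char, name in [("\u00a0", "non-breaking space"), ("\u2003", "em space")]:
    for encoding in ["ascii", "latin-1", "utf-8"]:
        try:
            char.encode(encoding)
            print(f"{name} is representable in {encoding}")
        except UnicodeEncodeError:
            print(f"{name} is NOT representable in {encoding}")
# non-breaking space: latin-1 and utf-8 only; em space: utf-8 only.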
@@ -108,7 +106,7 @@ def csv_to_rows(tsv_path, encoding=None, dialect=None):
 
 
 def clean(rows):
-    '''
+    """
     >>> clean([
     ...     [' x', 'y ', ''],
     ...     ['', ' Hi! ', '', ''],
@@ -117,7 +115,7 @@ def clean(rows):
     ... ])
     [['x', 'y'], ['', 'Hi!']]
-    '''
+    """
    clean_rows = []
    max_i = None
    for row in rows:
@@ -126,16 +124,16 @@ def clean(rows):
             continue
         if max_i is None:
             max_i = last_non_empty_index(stripped_row)
-        clean_rows.append(stripped_row[:max_i + 1])
+        clean_rows.append(stripped_row[: max_i + 1])
     return clean_rows
 
 
 def last_non_empty_index(values):
-    '''
+    """
     >>> last_non_empty_index(['', '', '0', '', ''])
     2
-    '''
+    """
     return max(i for i, val in enumerate(values) if len(val))
 
 
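A standalone sketch of the decode-then-write pattern print_clean_tsv uses, with a hypothetical file name: try the strict codec first and fall back to the permissive one. Because latin-1 maps all 256 byte values, the second pass can never raise UnicodeDecodeError, so the loop always returns on one of its two iterations.

from pathlib import Path


def read_with_fallback(path: Path) -> tuple[str, str]:
    for encoding in ["utf-8", "latin-1"]:
        try:
            # Decode the whole file up front, as print_clean_tsv does,
            # so encoding problems surface before any output is written.
            return path.read_text(encoding=encoding), encoding
        except UnicodeDecodeError:
            continue
    raise AssertionError("unreachable: latin-1 decodes any byte sequence")


text, used = read_with_fallback(Path("example.tsv"))  # hypothetical file
print(f"decoded with {used}")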
57 changes: 31 additions & 26 deletions src/factor_field.py
@@ -1,32 +1,33 @@
 #!/usr/bin/env python3
 
-import sys
 import argparse
-from pathlib import Path
 import fileinput
+import sys
 from collections import defaultdict
+from pathlib import Path
 
 
 def main():
-    parser = argparse.ArgumentParser(description='''
+    parser = argparse.ArgumentParser(
+        description="""
 Factor out all variants of a given field.
-''')
-    parser.add_argument(
-        '--field',
-        metavar='NAME',
-        required=True)
+"""
+    )
+    parser.add_argument("--field", metavar="NAME", required=True)
     parser.add_argument(
-        '--input_dir',
+        "--input_dir",
         type=Path,
-        metavar='IN',
-        help='Directory to scan for instances of the field',
-        default='src/ingest_validation_tools/table-schemas/assays')
+        metavar="IN",
+        help="Directory to scan for instances of the field",
+        default="src/ingest_validation_tools/table-schemas/assays",
+    )
     parser.add_argument(
-        '--output_dir',
+        "--output_dir",
         type=Path,
-        metavar='OUT',
-        help='Directory to write field extracts',
-        default='src/ingest_validation_tools/table-schemas/includes/fields')
+        metavar="OUT",
+        help="Directory to write field extracts",
+        default="src/ingest_validation_tools/table-schemas/includes/fields",
+    )
     args = parser.parse_args()
 
     factor_field(args.field, args.input_dir, args.output_dir)
@@ -46,18 +47,22 @@ def pull(field_name, input_dir):
             lines=lines,
             get_file_name=lambda: str(fileinput.filename()),
             field_name=field_name,
-            definitions=definitions
+            definitions=definitions,
         )
     return definitions
 
 
 def push(field_name, definitions, output_dir):
-    options = [
-        f"# {'; '.join(sorted(files))}\n{definition}"
-        for definition, files in definitions.items()
-    ] if len(definitions) > 1 else definitions.keys()
+    options = (
+        [
+            f"# {'; '.join(sorted(files))}\n{definition}"
+            for definition, files in definitions.items()
+        ]
+        if len(definitions) > 1
+        else definitions.keys()
+    )
     if options:
-        (output_dir / f'{field_name}.yaml').write_text('\n'.join(options))
+        (output_dir / f"{field_name}.yaml").write_text("\n".join(options))
     else:
         print(f"Check spelling of field name: '{field_name}'")
         sys.exit(1)
@@ -93,18 +98,18 @@ def replace(lines, get_file_name, field_name, definitions):
     definition = None
     for line in lines:
         # This assumes the YAML has been cleaned up!
-        if f'name: {field_name}' in line:
+        if f"name: {field_name}" in line:
             inside = True
-            print(f'# include: ../includes/fields/{field_name}.yaml')
+            print(f"# include: ../includes/fields/{field_name}.yaml")
             definition = line
             continue
-        elif inside and line[0] not in ['-', '#']:
+        elif inside and line[0] not in ["-", "#"]:
             definition += line
             continue
         elif inside:
             definitions[definition].add(get_file_name())
             inside = False
-        print(line, end='')
+        print(line, end="")
 
 
 if __name__ == "__main__":
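factor_field's pull/push pair pivots on a defaultdict(set) that maps each distinct definition body to the files it appears in; when more than one variant exists, push prefixes each with a comment listing its source files, as the options expression above shows. A toy illustration with made-up field and file names:

from collections import defaultdict

definitions = defaultdict(set)
definitions["name: thumbnail\ntype: string\n"].add("a.yaml")
definitions["name: thumbnail\ntype: string\n"].add("b.yaml")
definitions["name: thumbnail\nrequired: True\n"].add("c.yaml")

# Mirrors the len(definitions) > 1 branch of push():
options = [
    f"# {'; '.join(sorted(files))}\n{definition}"
    for definition, files in definitions.items()
]
print("\n".join(options))
# The first variant is prefixed "# a.yaml; b.yaml", the second "# c.yaml".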
30 changes: 14 additions & 16 deletions src/generate_docs.py
@@ -2,29 +2,29 @@
 
 import argparse
 import os
-from pathlib import Path
 import sys
-from yaml import dump as dump_yaml
+from pathlib import Path
 
 from tableschema_to_template.create_xlsx import create_xlsx
+from yaml import dump as dump_yaml
 
+from ingest_validation_tools.cli_utils import dir_path
+from ingest_validation_tools.docs_utils import (
+    generate_readme_md,
+    generate_template_tsv,
+    get_tsv_name,
+    get_xlsx_name,
+)
 from ingest_validation_tools.schema_loader import (
-    dict_table_schema_versions,
-    get_table_schema,
     dict_directory_schema_versions,
+    dict_table_schema_versions,
+    enum_maps_to_lists,
     get_directory_schema,
+    get_fields_wo_headers,
     get_is_assay,
-    enum_maps_to_lists,
     get_pipeline_infos,
-    get_fields_wo_headers,
-)
-from ingest_validation_tools.docs_utils import (
-    get_tsv_name,
-    get_xlsx_name,
-    generate_template_tsv,
-    generate_readme_md,
+    get_table_schema,
 )
-from ingest_validation_tools.cli_utils import dir_path
 
 
 def main():
@@ -158,9 +158,7 @@ def main():
         )
         max_schema["fields"] = get_fields_wo_headers(max_schema)
         if max_schema["fields"][0]["name"] != "is_cedar":
-            with open(
-                deprecated_path / get_tsv_name(args.type, is_assay=is_assay), "w"
-            ) as f:
+            with open(deprecated_path / get_tsv_name(args.type, is_assay=is_assay), "w") as f:
                 f.write(generate_template_tsv(max_schema))
             create_xlsx(
                 max_schema,
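The generate_docs.py hunk above is pure import reordering, matching isort's defaults: plain imports before from-imports within a section, sections split into standard library / third-party / first-party, and names inside parenthesized imports alphabetized. A sketch assuming the isort package is installed; the repo's actual isort configuration is not shown on this page, and section assignment can vary with what is installed locally:

import isort

messy = (
    "from pathlib import Path\n"
    "import sys\n"
    "from yaml import dump as dump_yaml\n"
    "import argparse\n"
)
print(isort.code(messy))
# Expected (environment permitting):
# import argparse
# import sys
# from pathlib import Path
#
# from yaml import dump as dump_yaml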
4 changes: 2 additions & 2 deletions src/generate_field_enum_csv.py
@@ -1,11 +1,11 @@
 #!/usr/bin/env python3
+import argparse
 import sys
 from csv import DictWriter
-import argparse
 
 from ingest_validation_tools.schema_loader import (
-    list_table_schema_versions,
     get_table_schema,
+    list_table_schema_versions,
 )
 
 
