-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge remote-tracking branch 'upstream/main' into define-data-transfo…
…rm-types
- Loading branch information
Showing
8 changed files
with
673 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -135,3 +135,6 @@ dmypy.json | |
|
||
# parsl ignores | ||
runinfo | ||
|
||
# test data ignores | ||
tests/data/in-carta/colas-lab/data |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
65 changes: 65 additions & 0 deletions
65
tests/data/in-carta/colas-lab/shrink_colas_lab_data_for_tests.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
"""
Shrink datasets from Colas Lab from IN Carta provided as collection of CSV's.

Note: built to be run from CytoTable poetry dev environment from project base, e.g.:
`poetry run python tests/data/in-carta/colas-lab/shrink_colas_lab_data_for_tests.py`

The script:
1. finds every CSV under SOURCE_DATA_DIR (recursively),
2. verifies all files share exactly the same detected schema,
3. writes a filtered (smaller) copy of each file into TARGET_DATA_DIR as
   `test-in-carta-<index>.csv`.

Raises:
    FileNotFoundError: if no CSV files are found under SOURCE_DATA_DIR.
    TypeError: if the detected schema differs between any of the files.
"""

import pathlib

import duckdb
from pyarrow import csv

# set a path for local and target data dir
SOURCE_DATA_DIR = "tests/data/in-carta/colas-lab/data"
TARGET_DATA_DIR = "tests/data/in-carta/colas-lab"

# Gather the source files once and sort them: directory traversal order is
# filesystem-dependent, and the output file names below are derived from each
# file's position in this list, so sorting keeps the results reproducible.
source_files = sorted(pathlib.Path(SOURCE_DATA_DIR).rglob("*.csv"))

# fail loudly instead of silently producing nothing when the source data
# directory is missing or empty
if not source_files:
    raise FileNotFoundError(f"No CSV files found under '{SOURCE_DATA_DIR}'.")

# build a collection of schema
schema_collection = []
for data_file in source_files:
    with duckdb.connect() as ddb:
        # read the csv file as a pyarrow table and extract detected schema
        schema_collection.append(
            {
                "file": data_file,
                "schema": ddb.execute(
                    f"""
                    SELECT *
                    FROM read_csv_auto('{data_file}')
                    """
                )
                .arrow()
                .schema,
            }
        )

# determine if the schema are exactly alike
# (schema equality is transitive, so comparing every schema to the first one
# is equivalent to comparing every pair of schema)
reference = schema_collection[0]
for candidate in schema_collection[1:]:
    if not candidate["schema"].equals(reference["schema"]):
        # if we detect that the schema are inequal, raise an exception
        # which names the files that disagree
        raise TypeError(
            "Inequal schema detected."
            f" '{candidate['file']}' differs from '{reference['file']}'."
        )

for idx, data_file in enumerate(source_files):
    with duckdb.connect() as ddb:
        # Read the csv file with SQL-based filters
        # as a pyarrow table then output to a new and
        # smaller csv for testing purposes.
        csv.write_csv(
            # we use duckdb to filter the original dataset in SQL
            data=ddb.execute(
                f"""
                SELECT *
                FROM read_csv_auto('{data_file}') as data_file
                /* select only the first three objects to limit the dataset */
                WHERE data_file."OBJECT ID" in (1,2,3)
                /* select rows C and D to limit the dataset */
                AND data_file."ROW" in ('C', 'D')
                """
            ).arrow(),
            # output the filtered data as a CSV to a new location
            output_file=f"{TARGET_DATA_DIR}/test-in-carta-{idx}.csv",
        )
Oops, something went wrong.