Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/main' into define-data-transfo…
Browse files Browse the repository at this point in the history
…rm-types
  • Loading branch information
d33bs committed Jan 12, 2024
2 parents 812520a + 7b0c276 commit b2aaf77
Show file tree
Hide file tree
Showing 8 changed files with 673 additions and 4 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -135,3 +135,6 @@ dmypy.json

# parsl ignores
runinfo

# test data ignores
tests/data/in-carta/colas-lab/data
17 changes: 17 additions & 0 deletions CITATION.cff
Original file line number Diff line number Diff line change
Expand Up @@ -178,3 +178,20 @@ references:
notes: >-
MapReduce techniques are used via Parsl apps and workflow configuration
to help achieve scalable data engineering for CytoTable.
- authors:
- family-names: Lamba
given-names: Aashna
- family-names: Colas
given-names: Alexandre
date-accessed: "2024-01-09"
title: Colas Lab Example IN Carta Dataset
type: data
notes: >-
The Colas Lab provided access to a single-cell dataset exported from
IN Carta Image Analysis Software (Molecular Devices) for
use within CytoTable tests for furthering development efforts.
A modified testing dataset appears within this project
under `tests/data/in-carta/colas-lab`.
See:
- https://sbpdiscovery.org/our-scientists/alexandre-colas-phd
- https://www.moleculardevices.com/products/cellular-imaging-systems/acquisition-and-analysis-software/in-carta-image-analysis-software
12 changes: 8 additions & 4 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

65 changes: 65 additions & 0 deletions tests/data/in-carta/colas-lab/shrink_colas_lab_data_for_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
"""
Shrink datasets from the Colas Lab, exported from IN Carta and provided as a collection of CSVs.
Note: built to be run from CytoTable poetry dev environment from project base, e.g.:
`poetry run python tests/data/in-carta/colas-lab/shrink_colas_lab_data_for_tests.py`
"""

import pathlib

import duckdb
from pyarrow import csv

# paths for the local source data and the target output directory
SOURCE_DATA_DIR = "tests/data/in-carta/colas-lab/data"
TARGET_DATA_DIR = "tests/data/in-carta/colas-lab"

# gather the duckdb-detected schema for every csv under the source dir
schema_collection = []
for csv_path in pathlib.Path(SOURCE_DATA_DIR).rglob("*.csv"):
    with duckdb.connect() as ddb:
        # read the csv through duckdb into a pyarrow table and
        # capture the schema duckdb inferred for it
        detected_schema = (
            ddb.execute(
                f"""
                    SELECT *
                    FROM read_csv_auto('{csv_path}')
                    """
            )
            .arrow()
            .schema
        )
    schema_collection.append({"file": csv_path, "schema": detected_schema})

# Determine whether the detected schema are exactly alike.
# Arrow schema equality is transitive, so comparing every schema
# against the first one (O(n)) is equivalent to the full pairwise
# comparison (O(n^2)) while also avoiding duplicate symmetric checks.
if schema_collection:
    reference = schema_collection[0]
    for candidate in schema_collection[1:]:
        if not reference["schema"].equals(candidate["schema"]):
            # fail fast on a mismatch so we never emit test fixtures
            # with inconsistent columns; name the files to aid debugging
            raise TypeError(
                "Unequal schema detected between "
                f"{reference['file']} and {candidate['file']}."
            )


# Shrink each source csv into a small fixture for tests.
# NOTE: pathlib's rglob yields files in arbitrary filesystem order,
# which is not guaranteed to be stable across runs or machines; sort
# the results so the test-in-carta-<idx>.csv numbering is deterministic.
for idx, data_file in enumerate(sorted(pathlib.Path(SOURCE_DATA_DIR).rglob("*.csv"))):
    with duckdb.connect() as ddb:
        # Read the csv file with SQL-based filters
        # as a pyarrow table then output to a new and
        # smaller csv for testing purposes.
        csv.write_csv(
            # we use duckdb to filter the original dataset in SQL
            data=ddb.execute(
                f"""
                SELECT *
                FROM read_csv_auto('{data_file}') as data_file
                /* select only the first three objects to limit the dataset */
                WHERE data_file."OBJECT ID" in (1,2,3)
                /* select rows C and D to limit the dataset */
                AND data_file."ROW" in ('C', 'D')
                """
            ).arrow(),
            # output the filtered data as a CSV to a new location
            output_file=f"{TARGET_DATA_DIR}/test-in-carta-{idx}.csv",
        )
Loading

0 comments on commit b2aaf77

Please sign in to comment.