Skip to content

Commit

Permalink
Add image filename columns to CellProfiler presets (#252)
Browse files Browse the repository at this point in the history
* add image filename columns to cellprofiler presets

* update column count for large data test

* make code comment flexible with regard to deprecation

Co-Authored-By: Gregory Way <[email protected]>

---------

Co-authored-by: Gregory Way <[email protected]>
  • Loading branch information
d33bs and gwaybio authored Nov 11, 2024
1 parent 3e98ecd commit 302ac54
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 45 deletions.
5 changes: 5 additions & 0 deletions cytotable/presets.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
"CONFIG_JOINS": """
SELECT
image.Metadata_ImageNumber,
COLUMNS('Image_FileName_.*'),
cytoplasm.* EXCLUDE (Metadata_ImageNumber),
cells.* EXCLUDE (Metadata_ImageNumber, Metadata_ObjectNumber),
nuclei.* EXCLUDE (Metadata_ImageNumber, Metadata_ObjectNumber)
Expand Down Expand Up @@ -92,6 +93,7 @@
per_image.Metadata_ImageNumber,
per_image.Image_Metadata_Well,
per_image.Image_Metadata_Plate,
COLUMNS('Image_FileName_.*'),
per_cytoplasm.* EXCLUDE (Metadata_ImageNumber),
per_cells.* EXCLUDE (Metadata_ImageNumber),
per_nuclei.* EXCLUDE (Metadata_ImageNumber)
Expand Down Expand Up @@ -148,6 +150,7 @@
image.Metadata_Well,
image.Image_Metadata_Site,
image.Image_Metadata_Row,
COLUMNS('Image_FileName_.*'),
cytoplasm.* EXCLUDE (Metadata_ImageNumber),
cells.* EXCLUDE (Metadata_ImageNumber),
nuclei.* EXCLUDE (Metadata_ImageNumber)
Expand Down Expand Up @@ -206,6 +209,7 @@
per_image.Metadata_ImageNumber,
per_image.Image_Metadata_Well,
per_image.Image_Metadata_Plate,
COLUMNS('Image_FileName_.*'),
per_cytoplasm.* EXCLUDE (Metadata_ImageNumber),
per_cells.* EXCLUDE (Metadata_ImageNumber),
per_nuclei.* EXCLUDE (Metadata_ImageNumber)
Expand Down Expand Up @@ -265,6 +269,7 @@
image.Metadata_ImageNumber,
image.Image_Metadata_Well,
image.Image_Metadata_Plate,
COLUMNS('Image_FileName_.*'),
cytoplasm.* EXCLUDE (Metadata_TableNumber, Metadata_ImageNumber),
cells.* EXCLUDE (Metadata_TableNumber, Metadata_ImageNumber),
nuclei.* EXCLUDE (Metadata_TableNumber, Metadata_ImageNumber)
Expand Down
38 changes: 1 addition & 37 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import pathlib
import shutil
import sqlite3
import subprocess
import tempfile
from typing import Any, Dict, Generator, List, Tuple

Expand Down Expand Up @@ -138,42 +137,6 @@ def fixture_data_dir_in_carta() -> List[str]:
return [f"{pathlib.Path(__file__).parent}/data/in-carta/colas-lab"]


# skip this fixture to avoid issues with ubuntu 22.04 and CLI usage of
# cytominer-database. Use instead fixture cytominerdatabase_sqlite_static.
# NOTE(review): pytest's documentation states that marks applied to fixtures
# have no effect, so this skip mark may not actually skip anything - confirm.
@pytest.mark.skip
@pytest.fixture(name="cytominerdatabase_sqlite", scope="function")
def fixture_cytominerdatabase_sqlite(
tmp_path: str,
data_dirs_cytominerdatabase: List[str],
) -> List[str]:
"""
Ingest cytominer-database test data directories into SQLite files.

For each directory in data_dirs_cytominerdatabase this runs the
`cytominer-database ingest` command-line tool with that directory's
config_SQLite.ini file, writing to a `<directory name>.sqlite` file
inside the same directory.

Returns a list of `sqlite:///` URLs, one per data directory, in the
same order as data_dirs_cytominerdatabase.

Note: tmp_path is accepted but not used within this function body.
"""

output_paths = []
for data_dir in data_dirs_cytominerdatabase:
# example command for reference as subprocess below
# cytominer-database ingest source_directory sqlite:///backend.sqlite -c ingest_config.ini
output_path = f"sqlite:///{data_dir}/{pathlib.Path(data_dir).name}.sqlite"

# run cytominer-database as command-line call
# note: the exit code returned by subprocess.call is not checked here,
# so a failed ingest does not raise and the URL is still returned below.
subprocess.call(
args=[
"cytominer-database",
"ingest",
data_dir,
output_path,
"-c",
f"{data_dir}/config_SQLite.ini",
]
)

# store the sqlite output file within list to be returned
output_paths.append(output_path)

return output_paths


@pytest.fixture(name="cytominerdatabase_sqlite_static", scope="function")
def fixture_cytominerdatabase_sqlite_static():
"""
Expand Down Expand Up @@ -590,6 +553,7 @@ def fixture_cellprofiler_merged_nf1data(
image.ImageNumber,
image.Image_Metadata_Well,
image.Image_Metadata_Plate,
COLUMNS('Image_FileName_.*'),
cytoplasm.*,
cells.*,
nuclei.*
Expand Down
53 changes: 48 additions & 5 deletions tests/test_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -799,6 +799,24 @@ def test_convert_cytominerdatabase_csv(
source_datatype="csv",
join=True,
drop_null=False,
# These test datasets don't include image FileName columns
# so we use a custom join SQL here to avoid errors on querying for
# columns which aren't present.
joins="""
SELECT
image.Metadata_ImageNumber,
cytoplasm.* EXCLUDE (Metadata_ImageNumber),
cells.* EXCLUDE (Metadata_ImageNumber, Metadata_ObjectNumber),
nuclei.* EXCLUDE (Metadata_ImageNumber, Metadata_ObjectNumber)
FROM
read_parquet('cytoplasm.parquet') AS cytoplasm
LEFT JOIN read_parquet('cells.parquet') AS cells USING (Metadata_ImageNumber)
LEFT JOIN read_parquet('nuclei.parquet') AS nuclei USING (Metadata_ImageNumber)
LEFT JOIN read_parquet('image.parquet') AS image USING (Metadata_ImageNumber)
WHERE
cells.Metadata_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Cells
AND nuclei.Metadata_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Nuclei
""",
),
schema=control_table.schema,
)
Expand Down Expand Up @@ -922,6 +940,14 @@ def test_convert_cellprofiler_csv(
source_datatype="csv",
preset="cellprofiler_csv",
)
# drop image filenames which won't be present in the comparison dataset
).drop(
[
"Image_FileName_DNA",
"Image_FileName_OrigOverlay",
"Image_FileName_PH3",
"Image_FileName_cellbody",
]
)

# sort all values by the same columns
Expand Down Expand Up @@ -1091,11 +1117,12 @@ def test_convert_cellprofiler_sqlite_pycytominer_merge(
chunk_size=100,
preset="cellprofiler_sqlite_pycytominer",
)
)
# drop image columns which won't be present in Pycytominer output.
).drop(["Image_FileName_GFP", "Image_FileName_DAPI", "Image_FileName_RFP"])

# find the difference in column names and display it as part of an assertion
# find the symmetric difference in column names and display it as part of an assertion
column_diff = list(
set(pycytominer_table.schema.names) - set(cytotable_table.schema.names)
set(pycytominer_table.schema.names) ^ set(cytotable_table.schema.names)
)
# if there are no differences in column names, we should pass the assertion
# (empty collections evaluate to false)
Expand Down Expand Up @@ -1217,7 +1244,7 @@ def test_cell_health_cellprofiler_to_cytominer_database_legacy(
)

# check that we have the expected shape
assert test_result.shape == (12, 1790)
assert test_result.shape == (12, 1802)
# check that the tablenumber data arrived properly
assert set(test_result["Metadata_TableNumber"].to_pylist()) == {
"88ac13033d9baf49fda78c3458bef89e",
Expand Down Expand Up @@ -1247,7 +1274,23 @@ def test_cell_health_cellprofiler_to_cytominer_database_legacy(
)["Nuclei_Correlation_Costes_AGP_DNA"].to_pylist()
)
)

# drop image filenames which won't be present in fixture output
test_result = test_result.drop(
[
"Image_FileName_CellOutlines",
"Image_FileName_IllumAGP",
"Image_FileName_IllumDNA",
"Image_FileName_IllumER",
"Image_FileName_IllumMito",
"Image_FileName_IllumRNA",
"Image_FileName_NucleiOutlines",
"Image_FileName_OrigAGP",
"Image_FileName_OrigDNA",
"Image_FileName_OrigER",
"Image_FileName_OrigMito",
"Image_FileName_OrigRNA",
]
)
# assert that a manually configured table is equal to the cytotable result
# note: we sort values by all column names ascendingly for equality comparisons
assert test_result.sort_by(
Expand Down
15 changes: 12 additions & 3 deletions tests/test_convert_threaded.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,14 @@ def test_convert_tpe_cellprofiler_csv(
source_datatype="csv",
preset="cellprofiler_csv",
)
# drop image FileName columns which won't be present in the comparison dataset
).drop(
[
"Image_FileName_DNA",
"Image_FileName_OrigOverlay",
"Image_FileName_PH3",
"Image_FileName_cellbody",
]
)

# sort all values by the same columns
Expand Down Expand Up @@ -73,7 +81,8 @@ def test_convert_s3_path_csv(
parquet_file_meta = parquet.ParquetFile(s3_result).metadata

# check the shape of the data
assert (parquet_file_meta.num_rows, parquet_file_meta.num_columns) == (109, 5794)
# note: includes filename columns
assert (parquet_file_meta.num_rows, parquet_file_meta.num_columns) == (109, 5812)


@pytest.mark.large_data_tests
Expand Down Expand Up @@ -109,10 +118,10 @@ def test_convert_s3_path_sqlite_join(
parquet_file_meta = parquet.ParquetFile(s3_result).metadata

# check the shape of the data
assert (parquet_file_meta.num_rows, parquet_file_meta.num_columns) == (74226, 5928)
assert (parquet_file_meta.num_rows, parquet_file_meta.num_columns) == (74226, 5946)

# check that dropping duplicates results in the same shape
assert pd.read_parquet(s3_result).drop_duplicates().shape == (74226, 5928)
assert pd.read_parquet(s3_result).drop_duplicates().shape == (74226, 5946)


def test_get_source_filepaths(
Expand Down

0 comments on commit 302ac54

Please sign in to comment.