Skip to content

Commit

Permalink
fix: SQLite joins should be on ImageNumber,TableNumber and not ImageN…
Browse files Browse the repository at this point in the history
…umber (#378)

* Join on ImageNumber,TableNumber

* Add missing column

* Update test fixtures

* Update test

* Fix type

`TableNumber` is `int64`

https://github.com/cytomining/cytominer-database/blob/5aa00f58e4a31bbbd2a3779c87e7a3620b0030db/cytominer_database/ingest.py#L101
  • Loading branch information
shntnu authored Mar 23, 2024
1 parent 9dd98ac commit af5a4ef
Show file tree
Hide file tree
Showing 5 changed files with 20 additions and 11 deletions.
16 changes: 12 additions & 4 deletions pycytominer/cyto_utils/cell_locations.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ class CellLocation:
Path to the output file. If None, the metadata file is not saved to disk
image_column : default = 'ImageNumber'
Name of the column in the metadata file that links to the single_cell file
Name of the column in the metadata file that links to the single_cell file, in combination with `table_column`
image_key: default = ['Metadata_Plate', 'Metadata_Well', 'Metadata_Site']
Names of the columns in the metadata file that uniquely identify each image
Expand All @@ -67,6 +67,9 @@ class CellLocation:
cell_y_loc : default = 'Nuclei_Location_Center_Y'
Name of the column in the single_cell file that contains the Y location of each cell
table_column : default = 'TableNumber'
Name of the column in the metadata file that links to the single_cell file, in combination with `image_column`
Methods
-------
add_cell_location()
Expand All @@ -82,6 +85,7 @@ def __init__(
overwrite: bool = False,
image_column: str = "ImageNumber",
object_column: str = "ObjectNumber",
table_column: str = "TableNumber",
image_key: list = ["Metadata_Plate", "Metadata_Well", "Metadata_Site"],
cell_x_loc: str = "Nuclei_Location_Center_X",
cell_y_loc: str = "Nuclei_Location_Center_Y",
Expand All @@ -92,6 +96,7 @@ def __init__(
self.overwrite = overwrite
self.image_column = image_column
self.object_column = object_column
self.table_column = table_column
self.image_key = image_key
self.cell_x_loc = cell_x_loc
self.cell_y_loc = cell_y_loc
Expand Down Expand Up @@ -235,7 +240,7 @@ def _create_nested_df(self, df: pd.DataFrame):
output_df_list = collections.defaultdict(list)

# iterate over each group of cells in the merged DataFrame
group_cols = [*self.image_key, self.image_column]
group_cols = [*self.image_key, self.image_column, self.table_column]

for group_values, cell_df in df.groupby(group_cols):
# add the image-level information to the output dictionary
Expand Down Expand Up @@ -317,6 +322,7 @@ def _check_single_cell_correctness(self, engine: sqlalchemy.engine.Engine):
column_name in nuclei_columns
for column_name in [
self.image_column,
self.table_column,
self.object_column,
self.cell_x_loc,
self.cell_y_loc,
Expand All @@ -330,6 +336,7 @@ def _check_single_cell_correctness(self, engine: sqlalchemy.engine.Engine):

if not (
self.image_column in image_columns
and self.table_column in image_columns
and all(elem in image_columns for elem in self.image_key)
):
raise ValueError(
Expand All @@ -351,14 +358,15 @@ def _get_joined_image_nuclei_tables(self):
# merge the Image and Nuclei tables in SQL

join_query = f"""
SELECT Nuclei.{self.image_column},Nuclei.{self.object_column},Nuclei.{self.cell_x_loc},Nuclei.{self.cell_y_loc},Image.{image_index_str}
SELECT Nuclei.{self.table_column},Nuclei.{self.image_column},Nuclei.{self.object_column},Nuclei.{self.cell_x_loc},Nuclei.{self.cell_y_loc},Image.{image_index_str}
FROM Nuclei
INNER JOIN Image
ON Nuclei.{self.image_column} = Image.{self.image_column};
ON Nuclei.{self.image_column} = Image.{self.image_column} and Nuclei.{self.table_column} = Image.{self.table_column};
"""

column_types = {
self.image_column: "int64",
self.table_column: "int64",
self.object_column: "int64",
self.cell_x_loc: "float",
self.cell_y_loc: "float",
Expand Down
5 changes: 3 additions & 2 deletions tests/test_cyto_utils/test_cell_locations.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,10 @@ def test_output_shape_and_required_columns(
metadata_input_dataframe = get_metadata_input_dataframe(cell_loc=cls_cell_loc)

# check the shape of the data
# cell_loc will have 3 extra columns: TableNumber, ImageNumber, CellCenters
assert cell_loc.shape == (
metadata_input_dataframe.shape[0],
metadata_input_dataframe.shape[1] + 2,
metadata_input_dataframe.shape[1] + 3,
)

assert isinstance(cell_loc["CellCenters"][0][0], dict)
Expand Down Expand Up @@ -89,7 +90,7 @@ def test_output_value_correctness(
# gather an engine from the cell_loc class
_, engine = cls_cell_loc._get_single_cell_engine()

nuclei_query = "SELECT ImageNumber, ObjectNumber, Nuclei_Location_Center_X, Nuclei_Location_Center_Y FROM Nuclei;"
nuclei_query = "SELECT TableNumber, ImageNumber, ObjectNumber, Nuclei_Location_Center_X, Nuclei_Location_Center_Y FROM Nuclei;"

nuclei_df = pd.read_sql_query(nuclei_query, engine)

Expand Down
10 changes: 5 additions & 5 deletions tests/test_data/cell_locations_example_data/shrink_BR00126114.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,16 @@ aws s3 cp s3://cellpainting-gallery/cpg0016-jump/source_4/workspace/backend/2021
aws s3 cp s3://cellpainting-gallery/cpg0016-jump/source_4/workspace/load_data_csv/2021_08_23_Batch12/BR00126114/load_data_with_illum.parquet .

# Write a SQL query to select rows of the `Image` table in the SQLite file where `ImageNumber` is 1 or 2.
# Only select the columns: `Metadata_Plate`, `Metadata_Well`, `Metadata_Site`, `ImageNumber`
# Only select the columns: `Metadata_Plate`, `Metadata_Well`, `Metadata_Site`, `TableNumber`, `ImageNumber`

sqlite3 -header -csv BR00126114.sqlite "SELECT Metadata_Plate, Metadata_Well, Metadata_Site, ImageNumber FROM Image WHERE ImageNumber = 1 OR ImageNumber = 2;" > image_query.csv
sqlite3 -header -csv BR00126114.sqlite "SELECT Metadata_Plate, Metadata_Well, Metadata_Site, TableNumber, ImageNumber FROM Image WHERE ImageNumber = 1 OR ImageNumber = 2;" > image_query.csv


# Write a SQL query to select rows of the `Nuclei` table in the SQLite file where `ImageNumber` is 1 or 2.
# Only select the columns: `ImageNumber`, `ObjectNumber`, `Nuclei_Location_Center_X`, `Nuclei_Location_Center_Y`
# Only select the columns: `TableNumber`, `ImageNumber`, `ObjectNumber`, `Nuclei_Location_Center_X`, `Nuclei_Location_Center_Y`

sqlite3 -header -csv BR00126114.sqlite "SELECT ImageNumber, ObjectNumber, Nuclei_Location_Center_X, Nuclei_Location_Center_Y FROM Nuclei WHERE ImageNumber = 1 LIMIT 10;" > nuclei_query_1.csv
sqlite3 -header -csv BR00126114.sqlite "SELECT ImageNumber, ObjectNumber, Nuclei_Location_Center_X, Nuclei_Location_Center_Y FROM Nuclei WHERE ImageNumber = 2 LIMIT 10;" > nuclei_query_2.csv
sqlite3 -header -csv BR00126114.sqlite "SELECT TableNumber, ImageNumber, ObjectNumber, Nuclei_Location_Center_X, Nuclei_Location_Center_Y FROM Nuclei WHERE ImageNumber = 1 LIMIT 10;" > nuclei_query_1.csv
sqlite3 -header -csv BR00126114.sqlite "SELECT TableNumber, ImageNumber, ObjectNumber, Nuclei_Location_Center_X, Nuclei_Location_Center_Y FROM Nuclei WHERE ImageNumber = 2 LIMIT 10;" > nuclei_query_2.csv

csvstack nuclei_query_1.csv nuclei_query_2.csv > nuclei_query.csv

Expand Down
Binary file not shown.
Binary file not shown.

0 comments on commit af5a4ef

Please sign in to comment.