From af5a4efbaae20d1b579c6a391b3f14a4721c990f Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Sat, 23 Mar 2024 09:21:27 -0400 Subject: [PATCH] fix: SQLite joins should be on ImageNumber,TableNumber and not ImageNumber (#378) * Join in ImageNumber,TableNumber * Add missing column * Update test fixtures * Update test * Fix type `TableNumber` is `int64` https://github.com/cytomining/cytominer-database/blob/5aa00f58e4a31bbbd2a3779c87e7a3620b0030db/cytominer_database/ingest.py#L101 --- pycytominer/cyto_utils/cell_locations.py | 16 ++++++++++++---- tests/test_cyto_utils/test_cell_locations.py | 5 +++-- .../shrink_BR00126114.sh | 10 +++++----- .../test_BR00126114.sqlite | Bin 12288 -> 12288 bytes ...st_BR00126114_load_data_with_illum.parquet | Bin 27012 -> 26409 bytes 5 files changed, 20 insertions(+), 11 deletions(-) diff --git a/pycytominer/cyto_utils/cell_locations.py b/pycytominer/cyto_utils/cell_locations.py index 1532e2e9..9180b1fe 100644 --- a/pycytominer/cyto_utils/cell_locations.py +++ b/pycytominer/cyto_utils/cell_locations.py @@ -53,7 +53,7 @@ class CellLocation: Path to the output file. If None, the metadata file is not saved to disk image_column : default = 'ImageNumber' - Name of the column in the metadata file that links to the single_cell file + Name of the column in the metadata file that links to the single_cell file, in combination with `table_column` image_key: default = ['Metadata_Plate', 'Metadata_Well', 'Metadata_Site'] Names of the columns in the metadata file that uniquely identify each image @@ -67,6 +67,9 @@ class CellLocation: cell_y_loc : default = 'Nuclei_Location_Center_Y' Name of the column in the single_cell file that contains the Y location of each cell + table_column : default = 'TableNumber' + Name of the column in the metadata file that links to the single_cell file, in combination with `image_column` + Methods ------- add_cell_location() @@ -82,6 +85,7 @@ def __init__( overwrite: bool = False, image_column: str = "ImageNumber", object_column: str = "ObjectNumber", + table_column: str = "TableNumber", image_key: list = ["Metadata_Plate", "Metadata_Well", "Metadata_Site"], cell_x_loc: str = "Nuclei_Location_Center_X", cell_y_loc: str = "Nuclei_Location_Center_Y", @@ -92,6 +96,7 @@ def __init__( self.overwrite = overwrite self.image_column = image_column self.object_column = object_column + self.table_column = table_column self.image_key = image_key self.cell_x_loc = cell_x_loc self.cell_y_loc = cell_y_loc @@ -235,7 +240,7 @@ def _create_nested_df(self, df: pd.DataFrame): output_df_list = collections.defaultdict(list) # iterate over each group of cells in the merged DataFrame - group_cols = [*self.image_key, self.image_column] + group_cols = [*self.image_key, self.image_column, self.table_column] for group_values, cell_df in df.groupby(group_cols): # add the image-level information to the output dictionary @@ -317,6 +322,7 @@ def _check_single_cell_correctness(self, engine: sqlalchemy.engine.Engine): column_name in nuclei_columns for column_name in [ self.image_column, + self.table_column, self.object_column, self.cell_x_loc, self.cell_y_loc, @@ -330,6 +336,7 @@ def _check_single_cell_correctness(self, engine: sqlalchemy.engine.Engine): if not ( self.image_column in image_columns + and self.table_column in image_columns and all(elem in image_columns for elem in self.image_key) ): raise ValueError( @@ -351,14 +358,15 @@ def _get_joined_image_nuclei_tables(self): # merge the Image and Nuclei tables in SQL join_query = f""" - SELECT Nuclei.{self.image_column},Nuclei.{self.object_column},Nuclei.{self.cell_x_loc},Nuclei.{self.cell_y_loc},Image.{image_index_str} + SELECT Nuclei.{self.table_column},Nuclei.{self.image_column},Nuclei.{self.object_column},Nuclei.{self.cell_x_loc},Nuclei.{self.cell_y_loc},Image.{image_index_str} FROM Nuclei INNER JOIN Image - ON Nuclei.{self.image_column} = Image.{self.image_column}; + ON Nuclei.{self.image_column} = Image.{self.image_column} and Nuclei.{self.table_column} = Image.{self.table_column}; """ column_types = { self.image_column: "int64", + self.table_column: "int64", self.object_column: "int64", self.cell_x_loc: "float", self.cell_y_loc: "float", diff --git a/tests/test_cyto_utils/test_cell_locations.py b/tests/test_cyto_utils/test_cell_locations.py index 54f8db1d..99f65681 100644 --- a/tests/test_cyto_utils/test_cell_locations.py +++ b/tests/test_cyto_utils/test_cell_locations.py @@ -48,9 +48,10 @@ def test_output_shape_and_required_columns( metadata_input_dataframe = get_metadata_input_dataframe(cell_loc=cls_cell_loc) # check the shape of the data + # cell_loc will have 3 extra columns: TableNumber, ImageNumber, CellCenters assert cell_loc.shape == ( metadata_input_dataframe.shape[0], - metadata_input_dataframe.shape[1] + 2, + metadata_input_dataframe.shape[1] + 3, ) assert isinstance(cell_loc["CellCenters"][0][0], dict) @@ -89,7 +90,7 @@ def test_output_value_correctness( # gather an engine from the cell_loc class _, engine = cls_cell_loc._get_single_cell_engine() - nuclei_query = "SELECT ImageNumber, ObjectNumber, Nuclei_Location_Center_X, Nuclei_Location_Center_Y FROM Nuclei;" + nuclei_query = "SELECT TableNumber, ImageNumber, ObjectNumber, Nuclei_Location_Center_X, Nuclei_Location_Center_Y FROM Nuclei;" nuclei_df = pd.read_sql_query(nuclei_query, engine) diff --git a/tests/test_data/cell_locations_example_data/shrink_BR00126114.sh b/tests/test_data/cell_locations_example_data/shrink_BR00126114.sh index 8a18202f..949fd790 100644 --- a/tests/test_data/cell_locations_example_data/shrink_BR00126114.sh +++ b/tests/test_data/cell_locations_example_data/shrink_BR00126114.sh @@ -16,16 +16,16 @@ aws s3 cp s3://cellpainting-gallery/cpg0016-jump/source_4/workspace/backend/2021 aws s3 cp s3://cellpainting-gallery/cpg0016-jump/source_4/workspace/load_data_csv/2021_08_23_Batch12/BR00126114/load_data_with_illum.parquet . # Write a SQL query to select rows of the `Image` table in the SQLite file where `ImageNumber` is 1 or 2. -# Only select the columns: `Metadata_Plate`, `Metadata_Well`, `Metadata_Site`, `ImageNumber` +# Only select the columns: `Metadata_Plate`, `Metadata_Well`, `Metadata_Site`, `TableNumber`, `ImageNumber` -sqlite3 -header -csv BR00126114.sqlite "SELECT Metadata_Plate, Metadata_Well, Metadata_Site, ImageNumber FROM Image WHERE ImageNumber = 1 OR ImageNumber = 2;" > image_query.csv +sqlite3 -header -csv BR00126114.sqlite "SELECT Metadata_Plate, Metadata_Well, Metadata_Site, TableNumber, ImageNumber FROM Image WHERE ImageNumber = 1 OR ImageNumber = 2;" > image_query.csv # Write a SQL query to select rows of the `Nuclei` table in the SQLite file where `ImageNumber` is 1 or 2. -# Only select the columns: `ImageNumber`, `ObjectNumber`, `Nuclei_Location_Center_X`, `Nuclei_Location_Center_Y` +# Only select the columns: `TableNumber``, `ImageNumber`, `ObjectNumber`, `Nuclei_Location_Center_X`, `Nuclei_Location_Center_Y` -sqlite3 -header -csv BR00126114.sqlite "SELECT ImageNumber, ObjectNumber, Nuclei_Location_Center_X, Nuclei_Location_Center_Y FROM Nuclei WHERE ImageNumber = 1 LIMIT 10;" > nuclei_query_1.csv -sqlite3 -header -csv BR00126114.sqlite "SELECT ImageNumber, ObjectNumber, Nuclei_Location_Center_X, Nuclei_Location_Center_Y FROM Nuclei WHERE ImageNumber = 2 LIMIT 10;" > nuclei_query_2.csv +sqlite3 -header -csv BR00126114.sqlite "SELECT TableNumber, ImageNumber, ObjectNumber, Nuclei_Location_Center_X, Nuclei_Location_Center_Y FROM Nuclei WHERE ImageNumber = 1 LIMIT 10;" > nuclei_query_1.csv +sqlite3 -header -csv BR00126114.sqlite "SELECT TableNumber, ImageNumber, ObjectNumber, Nuclei_Location_Center_X, Nuclei_Location_Center_Y FROM Nuclei WHERE ImageNumber = 2 LIMIT 10;" > nuclei_query_2.csv csvstack nuclei_query_1.csv nuclei_query_2.csv > nuclei_query.csv diff --git a/tests/test_data/cell_locations_example_data/test_BR00126114.sqlite b/tests/test_data/cell_locations_example_data/test_BR00126114.sqlite index efeabe0204b6b3b443051f0ba7849ba9ad8601ba..370710f6f1ecaf67c29f407260cb9990bf80ee74 100644 GIT binary patch delta 774 zcmZvYJ#5oJ6vypQ9GcqqoDUQ{SlI3a!S>zx?DLHURj3lJ1Z3#g7G>dM0I3oiNd!CS zX2c9z6;v^HWMW{11vZ2jn3=eXl#<%qa;Kku@BQDq@!ojvWzRHTUED_Z4975x2AUZ# zLbIdCH&APmHky6UGp9m+MOVAS{ey?2ox?{5qbE@?+}s^r3*-uHnJu{u(ds3$+3)ww zDafaA|37X^euJJp&wvvHzQf6^?Y@U+mm5~tg%Gaa!I;p35VoOi^%7;K3H41a_-6&e zGeS6~l<1B<+cV%3yoLv`1s?i=j?oKr7X{8Q=Y#Xyx$XG&Py5)O+B-@-dyd9m)^27! zD};6&FQpPJNRi-Djggm`$O&VVCnYOr&hs)CoYAf-g5@G<#ZiZLYB<%nUBkNy6IbKG zQnH-HVTVG^ej7)d1XXRyay2H&vK%ues9=T$BmnjAbQPyYm8{s{~9^70{T&!Jh!y{)^g{=_S#Zs zt6Z9E%+FP^^)!!R=t?kvE{4Cs(MPhPrP32)0B=4!4s!#>yw~`;n*6_I?5Tv1#5A=p z!5v({4zyqv^7Niw(=)nG8}uc4Aa~@P?2u2SqW)3)-#5RgtfHkMXPPc^Vis6V6vUAq z)nlLeo*P<`7r1`tM}8c7##kD_Gc2ydI{sqk$R zJrVvCy`c)eP58Os-^i8G$E&zwy;6*f@NEk_Bgoi4v9O~Ef0OuU!v8=&nNEC_SM5?s sA4V<~nHR>9#X`@kN1?^)VPFi2Qwh)TuQot<8 diff --git a/tests/test_data/cell_locations_example_data/test_BR00126114_load_data_with_illum.parquet b/tests/test_data/cell_locations_example_data/test_BR00126114_load_data_with_illum.parquet index 9324e8b4b0f8a32bdae9ceb44d5172fd16996174..5eeb0b24f2e9822ff50f5457adb2e2b5ed01e8d5 100644 GIT binary patch delta 3137 zcmbVOZ){rE6*rFmga5?TE_SFB5f5vz;|JJ~5HF1$U@+9gj|cC;dyeDqz{Z$ACc!pf zuXWs3YSc|yyOZ|AzVySisg)*Wlk=f!rFNV8VOyni)wWO5(n^y$sfyNg)1)cN+?{(L zCbZF%c1XN?&bhzeIp>~p?|tw2KWN^#rFls?)X>uN^3tnx*nfcxa7_br9BTYE9dDjo z1V_^_6vkU2cSvqR%m8Rk=;8LEzfizCO@F83jponkc$HbB1V?L}=zBkE?bgtV%a#hX zv<<_owHusmqcCQ*z+G)u$JG&IpKicTuJEi)ZwbIt9Lc33Ga?R20f13D(Cr2HBP7DS1RYjLBk~Ew5)_jktaqe56rI=ltifx{_ zNA;=0|InK61Z%im{pWDk-d11HPbjd{fWYr(4uE?f4*E54&Pk)~-Fd>V(V74KT&*U1 zubm00Mg8ZLpbDg@JN?(?{w8x<|69^y4GjZhhL%D+pNtm`KQV5b()r|4c`>n;-N@yO zrZdOpw@n$ep=7!e&!?75W5-PScy0*|`HgHAwc>jG$~vkRDz6n#v{7-1h}NeUP*MFY zQs+y1(Q}zja0IPqt@d%txYhJDBGeQXHQF6(wKV3@sbM_S_DiF*@GFmRqy4iGMf?rAGLug}5 zRedwz_B;IZ9)HpA4s0cTVu?5k#;i#KkZjIkGHTt3+0@{2tQK@bidwS-tK`L^;wAy* zf*dVpVzw^_hF|i++)OboNA0VMjV~y<1W6z*io9<&l1eXsC03$>33=CX!3TcWbR;TT zq$t0V^m&&BUwH+taU;zk#pkuarOY+8FOo$c91}}2vR#g<>T|TF;2ZwTAi)2f|_qbvBGdlc%uns#y3sfW{RCpsC z=C@%?Hi9mqhjZa}cn|fXkv907+>7&i(9IWMEUbqQmYd=Fyn*6~{N8+y&U-GX+~S4H zh|_^Z?1nEc5N;<7Eo@TT$7s8;pxSO#Z5FWw#EXOIk8oPx*^6tGvwzV5U!@G+r|MucK{7&wJxB2!qIhV~UK7JW4NF(q?Y#Z{pfaj!d;ey)X7t(j3bH246 zG}I`D;b+p-mPj;fQ7Z6^bmBm~7u#tT19PD;F*XSqy9Z&F(-* zit%d^qKi&0k;~@cb?F(nwa^7KVSV32v;4H`uGv|MN0ac8JW`d8MhcHVgsvqw4Eq^;a!isr zv68T5H9)YaO57I1I6U7pYC!Rot%@Lq zX&nM2YeuL>t**%9CsEzp1dSS^Q4jKDNYt$@=hs-PFCHyt>kmh*`W_GRoh4=&R#W4T zMi7}4p!meE_CO#AEBq%}*MkZ-fjwBo?mTi=Je1eL z+E|XR>ol{8c|GVsiNW?JxKq_0>@W9E9P~iua2q)yUneSj6z=O%YfiFcq-vCdng>Y~ z564t%j!||V_|9+2KjEZrM&h7E9F*~%Vae?w;Rgo2RmfFA zPY)OqM=bcE!Ls$QJo2th`tZ6!5FY!wTAe-HU^oWa^kBRC4TK-F5K5nbOX(x{Z)tD1 M{$xV~$MV+y0tqv#F8}}l delta 3398 zcmbtWTWnj$5hW$+Nxdo3lq}l{?J{X3@sbiLi?-`z`4nmK{ibZoe2dcMBc>#ZbZyzK zg9b%D>Fhk*b^g0??Fku>N>0K;{h7AcA}H9EVOA|>1X zD7pae?#`S!GdnYTFMs+w@y*|hZzbD@M4h)YKVZ|N@6icf5d)i#w*8jP?@h13yyOI2 zpXq`^dz{HMGd#*(X6tt)57>OC;|n(5(v%o}zAGU?zA<>~n=*K(>t9THgR4Pj_X#+! z9Ry?d6r9m&;h}V(_xiIF^1d+x;&q@oW5%b48;ct<6O(|{@OJ@z^HLMuul$Vw|8QK{&g||V850(FjEmx3 zuj&IKmg6TT1o85jbK)JjL_R4$v4oRoL?jwIBWetBW0s5LBXY$w>WAc=^3Dlm_nt=C zy@yqZo_mTEwLOz<_>_pn?QQCNKN3&KB`hyv%Hfrjd@-HbUa|3+tqi}SU0IR0DI|)! zV#Nny^+#`u)vv!T{vUjSu2()d_Vrhg;Cv{e1$?k41D+mclMb7^z`-gJ)PG&37) zepkXv)pr$5{hf|H;>Vl&TbgRCk&begn9&XH2AkjSslhd^;P_tewp9HU*EHjbQP_A9nOJ^VMR`o61)yCM7HHG@DQH3)s$uSCeJ>wA#h562y1@GLOU?)@VUai991 z-)&}L_m0sg^hx{akg%YAa^b(!ubI&Qw|ue}6@OY*NYvem?=O#vL}Sy+&T^udPE?fl zCa$Q2W=WO!%|fxFIyJeh+H0U@0IJ>5v)}HA>kcW*ogF1l0ZdP-1j?0-S~-I(NV_qw zSE0PPSt!>Iia~20g z&vSI>bMWZg*|u{8&MqDAH|Z@oLQDjQy#&{mjx0KOV>S^ntjD9P*`z%@8;@*fT{dIZ z!D}lHvvVyuyRAz`40{+pRddWcY#S#X1;c7G63Tn>rp;NFVc`OP@BF+0OyYiz?~b{ezc1Kf4TS}7NqtK{Kx*Kznqd_=acbV(>!(cvFAib6l1PX{lI>`y9}SWo&}Ak7j6dfvIDBEbkw;Phlg?H zL04|30IB;5yyNYAA{4E2HEnm7(A9N`FO>FyColyc2d~1Qe+VMyWuWrO+Wj6M^tp?4 z4Z~qI>blx(p*-}v3slyjE2r%X7`Kuk{u;hU5uge7G#EzUe!Le(+^Ze-L}Z(XUoXoc z7&rz$3l0JnsUa<$mqT~(7%YbdDRz`$g9Y10JZ$u)3VhKYc2FlzSB(2>@L6~W{^F4} z_};v5XrIrWivs(4WDqXL>)?&{!dc%4JdDa9<$DQUUG9f}M+f1dPu3Z-+q4NQ^!V{0 zsD9l)j<8X<>o2jrGjL=NQV8rYOZoC3#@@7iCP*#&@PN;tWk39Ic>q2Ot}@wjNRTlmV7hK8?elu-}f!lr9p-V5BNj^5`IY%+hr;6@!a#oMmgycN|eRXM#!2HUw9qXps ztgkzYTD*xd{F$mTyo9l+3zsF_o2@1vAwCkqb=5|QB}Ol8870JCwc=7s_Z&gI%V$Bn zi&O~-)4i#RdrOJnMXwMCA&WUPyYjd<)4dz-{|+(7C|8UimpO-A7BnYhi#{BvASPsD zW`uZy_$pbzoz+Y|6ElOdCEXGeU1Ij0mQe=H5sU~ur!W9v^bmrF!r_5mUYZ{Y1lI++ zCGQr;&DP-oO%bY1K1_CLe_sb9yf#T$?Us`qHhcN_(p{P-mF(72%wVR-2zv?zUF zg|opqXF0A72Z)o}bH2yy*D0RaYv_uW0LPOAc*~i_VxnNO%DSD3j z+5&vOa#U=E2MI+u>=aDQ^#zVIRygm0xLbY&f&iWNyA9wwE*;Ui6WDwhVUF6Dm|G67 zV%K08^NWM`){#Bfa(rm05vpqq{Vq7TELanUfkvd31vgcxHt~JOc{onErQ49d|nRfmyW7vzO4JcP>l}_N~;u>LfG@E z<45)(bi$`9?!!*NQECv;?CdAfx$$d5-yvong=qzt?FktBm?F$VC=72N+T^gy$RdV- zax6geQdw>6u9|&tF*A9}m&Esf90^g6a#o&iiIm66iOVl-W;SM0A8^5s%>vhkN? blKq##zrFBj_62yD9qbVudQl|u62I<0QPISg