From 7892e0a734e06b0c8e17923b136740110a29d85f Mon Sep 17 00:00:00 2001 From: Tony Tung Date: Wed, 13 Nov 2019 10:04:28 -0600 Subject: [PATCH] Support multiple codewords for the same target (#1646) `to_json` walks through each of the targets/r/c in a codebook when trying to save. Unfortunately, selecting an array with the same target twice results in a 3D array (target/r/c) rather than a 2D array (r/c). This PR rewrites to_json so it can handle multiple codewords that decode to the same target. Rather than selecting by label, we walk through each of the targets by integer offset. This also optimizes the walkthrough of the codebook object by finding all the non-zero points in numpy code rather than in python code. With the seqFISH dataset, this is at least a 10x speedup in writing out the json file. With denser codebooks, the speedup probably is inconsequential. Test plan: wrote a test that has multiple codewords for the same target. It crashes without the fix and succeeds with the fix. Fixes: #1643 --- starfish/core/codebook/codebook.py | 25 +++++++++------- starfish/core/codebook/test/test_to_json.py | 33 +++++++++++++++++++++ 2 files changed, 47 insertions(+), 11 deletions(-) diff --git a/starfish/core/codebook/codebook.py b/starfish/core/codebook/codebook.py index 4ab640bfd..f9e8d5a06 100644 --- a/starfish/core/codebook/codebook.py +++ b/starfish/core/codebook/codebook.py @@ -400,20 +400,23 @@ def to_json(self, filename: Union[str, Path]) -> None: """ code_array = [] - for target in self[Features.TARGET]: + for target_index in range(self.sizes[Features.TARGET]): codeword = [] - for ch_label in self[Axes.CH.value]: - for round_label in self[Axes.ROUND.value]: - if self.loc[target, round_label, ch_label]: - codeword.append( - { - Axes.CH.value: int(ch_label), - Axes.ROUND.value: int(round_label), - Features.CODE_VALUE: float(self.loc[target, round_label, ch_label]) - }) + target_codeword = self[{Features.TARGET: target_index}] + nonzero_indices = 
np.nonzero(target_codeword.values) + for round_index, ch_index in zip(*nonzero_indices): + codeword.append( + { + Axes.ROUND.value: int(self.coords[Axes.ROUND.value][round_index]), + Axes.CH.value: int(self.coords[Axes.CH.value][ch_index]), + Features.CODE_VALUE: float(target_codeword[round_index, ch_index]) + }) + target_cell: np.ndarray = self.coords[Features.TARGET][target_index].values + assert len(target_cell.shape) == 0 + target_name = str(target_cell) code_array.append({ Features.CODEWORD: codeword, - Features.TARGET: str(target.values) + Features.TARGET: target_name, }) codebook_document = { DocumentKeys.VERSION_KEY: str(CURRENT_VERSION), diff --git a/starfish/core/codebook/test/test_to_json.py b/starfish/core/codebook/test/test_to_json.py index 893b623bc..ca849ae5e 100644 --- a/starfish/core/codebook/test/test_to_json.py +++ b/starfish/core/codebook/test/test_to_json.py @@ -1,5 +1,6 @@ import os +from starfish.types import Axes, Features from .factories import simple_codebook_array from ..codebook import Codebook @@ -12,3 +13,35 @@ def test_to_json(tmp_path): loaded_codebook = Codebook.open_json(os.fspath(codebook_path)) assert codebook.equals(loaded_codebook) + + +def test_to_json_multiple_codes_for_target(tmp_path): + code_array = [ + { + Features.CODEWORD: [ + {Axes.ROUND.value: 0, Axes.CH.value: 0, Features.CODE_VALUE: 1}, + {Axes.ROUND.value: 1, Axes.CH.value: 1, Features.CODE_VALUE: 1} + ], + Features.TARGET: "SCUBE2" + }, + { + Features.CODEWORD: [ + {Axes.ROUND.value: 0, Axes.CH.value: 1, Features.CODE_VALUE: 1}, + {Axes.ROUND.value: 1, Axes.CH.value: 1, Features.CODE_VALUE: 1} + ], + Features.TARGET: "BRCA" + }, + { + Features.CODEWORD: [ + {Axes.ROUND.value: 0, Axes.CH.value: 1, Features.CODE_VALUE: 1}, + {Axes.ROUND.value: 1, Axes.CH.value: 0, Features.CODE_VALUE: 1} + ], + Features.TARGET: "SCUBE2" + } + ] + codebook = Codebook.from_code_array(code_array) + codebook_path = tmp_path / "codebook.json" + codebook.to_json(codebook_path) + + 
loaded_codebook = Codebook.open_json(os.fspath(codebook_path)) + assert codebook.equals(loaded_codebook)