Skip to content

Commit

Permalink
Support multiple codewords for the same target
Browse files Browse the repository at this point in the history
`to_json` walks through each of the targets/r/c in a codebook when trying to save.  Unfortunately, selecting an array with the same target twice results in a 3D array (target/r/c) rather than a 2D array (r/c).

This PR rewrites `to_json` so it can handle multiple codewords that decode to the same target.  Rather than selecting by label, we walk through each of the targets by integer offset.  This also optimizes the walkthrough of the codebook object by finding all the non-zero points in NumPy code rather than in Python code.  With the seqFISH dataset, this is at least a 10x speedup in writing out the JSON file.  With denser codebooks, the speedup is probably inconsequential.

Test plan: wrote a test that has multiple codewords for the same target.  It crashes without the fix and succeeds with the fix.
Fixes: #1643
  • Loading branch information
Tony Tung committed Nov 13, 2019
1 parent 3072f3c commit ffbf5d5
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 11 deletions.
25 changes: 14 additions & 11 deletions starfish/core/codebook/codebook.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,20 +400,23 @@ def to_json(self, filename: Union[str, Path]) -> None:
"""
code_array = []
for target in self[Features.TARGET]:
for target_index in range(self.sizes[Features.TARGET]):
codeword = []
for ch_label in self[Axes.CH.value]:
for round_label in self[Axes.ROUND.value]:
if self.loc[target, round_label, ch_label]:
codeword.append(
{
Axes.CH.value: int(ch_label),
Axes.ROUND.value: int(round_label),
Features.CODE_VALUE: float(self.loc[target, round_label, ch_label])
})
target_codeword = self[{Features.TARGET: target_index}]
nonzero_indices = np.nonzero(target_codeword.values)
for round_index, ch_index in zip(*nonzero_indices):
codeword.append(
{
Axes.ROUND.value: int(self.coords[Axes.ROUND.value][round_index]),
Axes.CH.value: int(self.coords[Axes.CH.value][ch_index]),
Features.CODE_VALUE: float(target_codeword[round_index, ch_index])
})
target_cell: np.ndarray = self.coords[Features.TARGET][target_index].values
assert len(target_cell.shape) == 0
target_name = str(target_cell)
code_array.append({
Features.CODEWORD: codeword,
Features.TARGET: str(target.values)
Features.TARGET: target_name,
})
codebook_document = {
DocumentKeys.VERSION_KEY: str(CURRENT_VERSION),
Expand Down
33 changes: 33 additions & 0 deletions starfish/core/codebook/test/test_to_json.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os

from starfish.types import Axes, Features
from .factories import simple_codebook_array
from ..codebook import Codebook

Expand All @@ -12,3 +13,35 @@ def test_to_json(tmp_path):

loaded_codebook = Codebook.open_json(os.fspath(codebook_path))
assert codebook.equals(loaded_codebook)


def test_to_json_multiple_codes_for_target(tmp_path):
    """Round-trip a codebook in which one target is listed under two codewords.

    The target "SCUBE2" appears twice, each time with a different set of
    (round, channel) positions, alongside a single "BRCA" entry. The codebook
    is written with ``to_json``, read back with ``open_json``, and compared
    with the in-memory original.
    """
    # One row per codeword: (target name, (round, channel) positions that are "on").
    layout = (
        ("SCUBE2", ((0, 0), (1, 1))),
        ("BRCA", ((0, 1), (1, 1))),
        ("SCUBE2", ((0, 1), (1, 0))),
    )
    spec = [
        {
            Features.CODEWORD: [
                {Axes.ROUND.value: round_, Axes.CH.value: ch, Features.CODE_VALUE: 1}
                for round_, ch in positions
            ],
            Features.TARGET: target,
        }
        for target, positions in layout
    ]

    original = Codebook.from_code_array(spec)
    json_path = tmp_path / "codebook.json"
    original.to_json(json_path)

    # Reload from disk and confirm nothing was lost or altered on the way.
    reloaded = Codebook.open_json(os.fspath(json_path))
    assert original.equals(reloaded)

0 comments on commit ffbf5d5

Please sign in to comment.