adding DecodedIntensityTable

spacetx · Sep 19, 2019 · b73ab2f · b73ab2f
1 parent d293e13
commit b73ab2f
Show file tree

Hide file tree

Showing 19 changed files with 333 additions and 175 deletions.
diff --git a/docs/source/api/data_structures/decoded_intensity_table.rst b/docs/source/api/data_structures/decoded_intensity_table.rst
@@ -0,0 +1,7 @@
+.. _DecodedIntensityTable:
+
+DecodedIntensityTable
+=====================
+
+.. autoclass:: starfish.core.intensity_table.decoded_intensity_table.DecodedIntensityTable
+   :members:
diff --git a/docs/source/api/data_structures/index.rst b/docs/source/api/data_structures/index.rst
@@ -56,3 +56,6 @@ serialization for use in single-cell analysis environments such as Seurat_ and S
 
 .. toctree::
    intensity_table.rst
+
+.. toctree::
+   decoded_intensity_table.rst
diff --git a/docs/source/help_and_reference/glossary/glossary.md b/docs/source/help_and_reference/glossary/glossary.md
@@ -13,6 +13,9 @@ A feature that is the target of quantification by an image-based assays. Common
 ### IntensityTable
 An intensity Table contains the features identified in an ImageStack. It can be thought of as an array whose entries are the intensities of each feature across the imaging rounds and channels of a field of view. Starfish exposes several processing tools to decode the features of the table, estimate their qualities, and assign features to cells.
 
+### DecodedIntensityTable
+A representation of a decoded intensity table. Contains the features identified in an ImageStack as well as their associated target values.
+
 ### Codeword
 A codeword maps expected intensities across multiple image tiles within a field of view to the target that is encoded by the codeword.
 

diff --git a/starfish/__init__.py b/starfish/__init__.py
@@ -17,6 +17,7 @@
 from .core.experiment.experiment import Experiment, FieldOfView
 from .core.expression_matrix.expression_matrix import ExpressionMatrix
 from .core.imagestack.imagestack import ImageStack
+from .core.intensity_table.decoded_intensity_table import DecodedIntensityTable
 from .core.intensity_table.intensity_table import IntensityTable
 from .core.segmentation_mask import SegmentationMaskCollection
 from .core.starfish import starfish
diff --git a/starfish/core/codebook/codebook.py b/starfish/core/codebook/codebook.py
@@ -16,6 +16,7 @@
     MIN_SUPPORTED_VERSION,
 )
 from starfish.core.config import StarfishConfig
+from starfish.core.intensity_table.decoded_intensity_table import DecodedIntensityTable
 from starfish.core.intensity_table.intensity_table import IntensityTable
 from starfish.core.spacetx_format.util import CodebookValidator
 from starfish.core.types import Axes, Features, Number
@@ -506,7 +507,7 @@ def _validate_decode_intensity_input_matches_codebook_shape(
     def decode_metric(
             self, intensities: IntensityTable, max_distance: Number, min_intensity: Number,
             norm_order: int, metric: str='euclidean'
-    ) -> IntensityTable:
+    ) -> DecodedIntensityTable:
         """
         Assigns intensity patterns that have been extracted from an :py:class:`ImageStack` and
         stored in an :py:class:`IntensityTable` by a :py:class:`SpotFinder` to the gene targets that
@@ -552,10 +553,11 @@ def decode_metric(
 
         # add empty metadata fields and return
         if intensities.sizes[Features.AXIS] == 0:
-            intensities[Features.TARGET] = (Features.AXIS, np.empty(0, dtype='U'))
-            intensities[Features.DISTANCE] = (Features.AXIS, np.empty(0, dtype=float))
-            intensities[Features.PASSES_THRESHOLDS] = (Features.AXIS, np.empty(0, dtype=bool))
-            return intensities
+            return DecodedIntensityTable.from_intensity_table(
+                intensities,
+                targets=(Features.AXIS, np.empty(0, dtype='U')),
+                distances=(Features.AXIS, np.empty(0, dtype=np.float64)),
+                passes_threshold=(Features.AXIS, np.empty(0, dtype=bool)))
 
         # normalize both the intensities and the codebook
         norm_intensities, norms = self._normalize_features(intensities, norm_order=norm_order)
@@ -571,15 +573,14 @@ def decode_metric(
             dtype=np.bool
         )
 
-        # set targets, distances, and filtering results
-        norm_intensities[Features.TARGET] = (Features.AXIS, targets)
-        norm_intensities[Features.DISTANCE] = (Features.AXIS, metric_outputs)
-        norm_intensities[Features.PASSES_THRESHOLDS] = (Features.AXIS, passes_filters)
-
         # norm_intensities is a DataArray, make it back into an IntensityTable
-        return IntensityTable(norm_intensities)
+        return DecodedIntensityTable.from_intensity_table(
+            norm_intensities,
+            targets=(Features.AXIS, targets),
+            distances=(Features.AXIS, metric_outputs),
+            passes_threshold=(Features.AXIS, passes_filters))
 
-    def decode_per_round_max(self, intensities: IntensityTable) -> IntensityTable:
+    def decode_per_round_max(self, intensities: IntensityTable) -> DecodedIntensityTable:
         """
         Assigns intensity patterns that have been extracted from an :py:class:`ImageStack` and
         stored in an :py:class:`IntensityTable` by a :py:class:`SpotFinder` to the gene targets that
@@ -641,10 +642,11 @@ def _view_row_as_element(array: np.ndarray) -> np.ndarray:
 
         # add empty metadata fields and return
         if intensities.sizes[Features.AXIS] == 0:
-            intensities[Features.TARGET] = (Features.AXIS, np.empty(0, dtype='U'))
-            intensities[Features.DISTANCE] = (Features.AXIS, np.empty(0, dtype=float))
-            intensities[Features.PASSES_THRESHOLDS] = (Features.AXIS, np.empty(0, dtype=bool))
-            return intensities
+            return DecodedIntensityTable.from_intensity_table(
+                intensities,
+                targets=(Features.AXIS, np.empty(0, dtype='U')),
+                distances=(Features.AXIS, np.empty(0, dtype=np.float64)),
+                passes_threshold=(Features.AXIS, np.empty(0, dtype=bool)))
 
         max_channels = intensities.argmax(Axes.CH.value)
         codes = self.argmax(Axes.CH.value)
@@ -668,11 +670,11 @@ def _view_row_as_element(array: np.ndarray) -> np.ndarray:
         # a code passes filters if it decodes successfully
         passes_filters = ~pd.isnull(targets)
 
-        intensities[Features.TARGET] = (Features.AXIS, targets.astype('U'))
-        intensities[Features.DISTANCE] = (Features.AXIS, distance)
-        intensities[Features.PASSES_THRESHOLDS] = (Features.AXIS, passes_filters)
-
-        return intensities
+        return DecodedIntensityTable.from_intensity_table(
+            intensities,
+            targets=(Features.AXIS, targets.astype('U')),
+            distances=(Features.AXIS, distance),
+            passes_threshold=(Features.AXIS, passes_filters))
 
     @classmethod
     def synthetic_one_hot_codebook(

diff --git a/starfish/core/expression_matrix/test/test_serialization.py b/starfish/core/expression_matrix/test/test_serialization.py
@@ -1,7 +1,7 @@
 import random
 
-from starfish import IntensityTable
 from starfish.core.codebook.test.factories import codebook_array_factory
+from starfish.core.intensity_table.test import factories
 from starfish.core.types import Features
 
 NUMBER_SPOTS = 10
@@ -11,7 +11,7 @@ def test_save_expression_matrix():
 
     codebook = codebook_array_factory()
 
-    intensities = IntensityTable.synthetic_intensities(
+    decoded_intensities = factories.synthetic_decoded_intenisty_table(
         codebook,
         num_z=3,
         height=100,
@@ -20,9 +20,10 @@ def test_save_expression_matrix():
     )
     # mock out come cell_ids
     cell_ids = random.sample(range(1, 20), NUMBER_SPOTS)
-    intensities[Features.CELL_ID] = (Features.AXIS, cell_ids)
 
-    expression_matrix = intensities.to_expression_matrix()
+    decoded_intensities[Features.CELL_ID] = (Features.AXIS, cell_ids)
+
+    expression_matrix = decoded_intensities.to_expression_matrix()
 
     # test all saving methods
     expression_matrix.save("expression")
diff --git a/starfish/core/intensity_table/decoded_intensity_table.py b/starfish/core/intensity_table/decoded_intensity_table.py
@@ -0,0 +1,176 @@
+from typing import Optional, Tuple
+
+import numpy as np
+import pandas as pd
+
+from starfish.core.expression_matrix.expression_matrix import ExpressionMatrix
+from starfish.core.intensity_table.intensity_table import IntensityTable
+from starfish.core.types import (
+    Axes,
+    Coordinates,
+    DecodedSpots,
+    Features,
+)
+
+
+class DecodedIntensityTable(IntensityTable):
+    """
+    DecodedIntensityTable is a container for spot or pixel features extracted from image data
+    that have been decoded. It is the primary output from starfish :py:class:`Decode` methods.
+
+    An IntensityTable records the numeric intensity of a set of features in each
+    :code:`(round, channel)` tile in which the feature is identified.
+    The :py:class:`IntensityTable` has shape
+    :code:`(n_feature, n_channel, n_round)`.
+
+    Some :py:class:`SpotFinder` methods identify a position and search for Gaussian blobs in a
+    small radius, only recording intensities if they are found in a given tile. Other
+    :py:class:SpotFinder: approaches find blobs in a max-projection and measure them everywhere.
+    As a result, some IntensityTables will be dense, and others will contain :code:`np.nan`
+    entries where no feature was detected.
+
+    Examples
+    --------
+    Create an IntensityTable using the ``synthetic_intensities`` method::
+
+        >>> from starfish.core.test.factories import SyntheticData
+        >>> sd = SyntheticData(n_ch=3, n_round=4, n_codes=2)
+        >>> codes = sd.codebook()
+        >>> sd.intensities(codebook=codes)
+        <xarray.IntensityTable (features: 2, c: 3, h: 4)>
+        array([[[    0.,     0.,     0.,     0.],
+                [    0.,     0.,  8022., 12412.],
+                [11160.,  9546.,     0.,     0.]],
+
+               [[    0.,     0.,     0.,     0.],
+                [    0.,     0., 10506., 10830.],
+                [11172., 12331.,     0.,     0.]]])
+        Coordinates:
+        * features   (features) MultiIndex
+        - z          (features) int64 7 3
+        - y          (features) int64 14 32
+        - x          (features) int64 32 15
+        - r          (features) float64 nan nan
+        * c          (c) int64 0 1 2
+        * h          (h) int64 0 1 2 3
+         target     (features) object 08b1a822-a1b4-4e06-81ea-8a4bd2b004a9 ...
+
+        """
+
+    @classmethod
+    def from_intensity_table(
+            cls,
+            intensities: IntensityTable,
+            targets: Tuple[str, np.ndarray],
+            distances: Optional[Tuple[str, np.ndarray]] = None,
+            passes_threshold: Optional[Tuple[str, np.ndarray]] = None):
+        """
+        Assign target values to intensities.
+
+        Parameters
+        ----------
+        intensities : IntensityTable
+            intensity_table to assign target values to
+        targets : Tuple[str, np.ndarray]
+            Target values to assign
+        distances : Optional[Tuple[str, np.ndarray]]
+            Corresponding array of distances from target for each feature
+        passes_threshold : Optional[Tuple[str, np.ndarray]]
+            Corresponding array of boolean values indicating if each itensity passed
+            given thresholds.
+
+        Returns
+        -------
+        DecodedIntensityTable
+        """
+
+        intensities = cls(intensities)
+        intensities[Features.TARGET] = targets
+        if distances:
+            intensities[Features.DISTANCE] = distances
+        if passes_threshold:
+            intensities[Features.PASSES_THRESHOLDS] = passes_threshold
+        return intensities
+
+    def to_decoded_dataframe(self) -> DecodedSpots:
+        """
+        Generates a dataframe containing decoded spot information. Guaranteed to contain physical
+        spot coordinates (z, y, x) and gene target. Does not contain pixel coordinates.
+        """
+        df = self.to_features_dataframe()
+        pixel_coordinates = pd.Index([Axes.X, Axes.Y, Axes.ZPLANE])
+        df = df.drop(pixel_coordinates.intersection(df.columns), axis=1).drop(Features.AXIS, axis=1)
+        return DecodedSpots(df)
+
+    def to_mermaid(self, filename: str) -> pd.DataFrame:
+        """
+        Writes a .csv.gz file in columnar format that is readable by MERMAID visualization
+        software.
+
+        To run MERMAID, follow the installation instructions for that repository and simply
+        replace the data.csv.gz file with the output of this function.
+
+        Parameters
+        ----------
+        filename : str
+            Name for compressed-gzipped MERMAID data file. Should end in '.csv.gz'.
+
+        Notes
+        ------
+        See also https://github.com/JEFworks/MERmaid
+
+        """
+        # construct the MERMAID dataframe. As MERMAID adds support for non-categorical variables,
+        # additional columns can be added here
+        df = self.to_features_dataframe()
+        column_order = [
+            Axes.X,
+            Axes.Y,
+            Features.TARGET,
+            Features.TARGET,  # added twice to support simultaneous coding
+        ]
+        mermaid_data = df[column_order]
+
+        # write to disk
+        mermaid_data.to_csv(filename, compression='gzip', index=False)
+
+    def to_expression_matrix(self) -> ExpressionMatrix:
+        """
+        Generates a cell x gene count matrix where each cell is annotated with spatial metadata.
+
+        Requires that spots in the IntensityTable have been assigned to cells.
+
+        Returns
+        -------
+        ExpressionMatrix :
+            cell x gene expression table
+        """
+        if Features.CELL_ID not in self.coords:
+            raise KeyError(
+                "IntensityTable must have 'cell_id' assignments for each cell before this function "
+                "can be called. See starfish.spots.AssignTargets.Label.")
+        grouped = self.to_features_dataframe().groupby([Features.CELL_ID, Features.TARGET])
+        counts = grouped.count().iloc[:, 0].unstack().fillna(0)
+        if self.has_physical_coords:
+            grouped = self.to_features_dataframe().groupby([Features.CELL_ID])[[
+                Axes.X.value, Axes.Y.value, Axes.ZPLANE.value, Coordinates.X.value,
+                Coordinates.Y.value, Coordinates.Z.value]]
+        else:
+            grouped = self.to_features_dataframe().groupby([Features.CELL_ID])[[
+                Axes.X.value, Axes.Y.value, Axes.ZPLANE.value]]
+        min_ = grouped.min()
+        max_ = grouped.max()
+        coordinate_df = min_ + (max_ - min_) / 2
+        metadata = {name: (Features.CELLS, data.values) for name, data in coordinate_df.items()}
+        metadata[Features.AREA] = (Features.CELLS, np.full(counts.shape[0], fill_value=np.nan))
+        # add genes to the metadata
+        metadata.update({Features.GENES: counts.columns.values})
+        metadata.update({Features.CELL_ID: (Features.CELLS, counts.index.values)})
+
+        mat = ExpressionMatrix(
+            data=counts.values,
+            dims=(Features.CELLS, Features.GENES),
+            coords=metadata,
+            name='expression_matrix'
+        )
+        return mat