From e3c70bff6a31ae9d867df5fabd5f3749898ecb5a Mon Sep 17 00:00:00 2001 From: nickeener Date: Sun, 10 Oct 2021 21:42:32 -0700 Subject: [PATCH 01/30] Initial commit --- .../decoded_intensity_table.py | 22 +- starfish/core/spots/DecodeSpots/__init__.py | 1 + .../spots/DecodeSpots/check_all_decoder.py | 190 +++++ .../core/spots/DecodeSpots/check_all_funcs.py | 695 ++++++++++++++++++ 4 files changed, 893 insertions(+), 15 deletions(-) create mode 100644 starfish/core/spots/DecodeSpots/check_all_decoder.py create mode 100644 starfish/core/spots/DecodeSpots/check_all_funcs.py diff --git a/starfish/core/intensity_table/decoded_intensity_table.py b/starfish/core/intensity_table/decoded_intensity_table.py index 7a062ab8b..0d17097fe 100644 --- a/starfish/core/intensity_table/decoded_intensity_table.py +++ b/starfish/core/intensity_table/decoded_intensity_table.py @@ -17,22 +17,18 @@ class DecodedIntensityTable(IntensityTable): """ DecodedIntensityTable is a container for spot or pixel features extracted from image data that have been decoded. It is the primary output from starfish :py:class:`Decode` methods. - An IntensityTable records the numeric intensity of a set of features in each :code:`(round, channel)` tile in which the feature is identified. The :py:class:`IntensityTable` has shape :code:`(n_feature, n_channel, n_round)`. - Some :py:class:`SpotFinder` methods identify a position and search for Gaussian blobs in a small radius, only recording intensities if they are found in a given tile. Other :py:class:SpotFinder: approaches find blobs in a max-projection and measure them everywhere. As a result, some IntensityTables will be dense, and others will contain :code:`np.nan` entries where no feature was detected. - Examples -------- Create an IntensityTable using the ``synthetic_intensities`` method:: - >>> from starfish.core.test.factories import SyntheticData >>> sd = SyntheticData(n_ch=3, n_round=4, n_codes=2) >>> codes = sd.codebook() @@ -41,7 +37,6 @@ class DecodedIntensityTable(IntensityTable): array([[[ 0., 0., 0., 0.], [ 0., 0., 8022., 12412.], [11160., 9546., 0., 0.]], - [[ 0., 0., 0., 0.], [ 0., 0., 10506., 10830.], [11172., 12331., 0., 0.]]]) @@ -54,7 +49,6 @@ class DecodedIntensityTable(IntensityTable): * c (c) int64 0 1 2 * h (h) int64 0 1 2 3 target (features) object 08b1a822-a1b4-4e06-81ea-8a4bd2b004a9 ... - """ __slots__ = () @@ -65,10 +59,10 @@ def from_intensity_table( intensities: IntensityTable, targets: Tuple[str, np.ndarray], distances: Optional[Tuple[str, np.ndarray]] = None, - passes_threshold: Optional[Tuple[str, np.ndarray]] = None): + passes_threshold: Optional[Tuple[str, np.ndarray]] = None, + filter_tally: Optional[Tuple[str, np.ndarray]] = None): """ Assign target values to intensities. - Parameters ---------- intensities : IntensityTable @@ -80,7 +74,9 @@ def from_intensity_table( passes_threshold : Optional[Tuple[str, np.ndarray]] Corresponding array of boolean values indicating if each itensity passed given thresholds. 
-
+        filter_tally : Optional[Tuple[str, np.ndarray]]
+            Corresponding array of integers indicating the number of rounds this
+            decoded intensity was found in.
         Returns
         -------
         DecodedIntensityTable
@@ -92,6 +88,8 @@ def from_intensity_table(
             intensities[Features.DISTANCE] = distances
         if passes_threshold:
             intensities[Features.PASSES_THRESHOLDS] = passes_threshold
+        if filter_tally:
+            intensities['filter_tally'] = filter_tally
         return intensities
 
     def to_decoded_dataframe(self) -> DecodedSpots:
@@ -108,19 +106,15 @@ def to_mermaid(self, filename: str) -> pd.DataFrame:
         """
         Writes a .csv.gz file in columnar format that is readable by MERMAID visualization
         software.
-
         To run MERMAID, follow the installation instructions for that repository and simply
         replace the data.csv.gz file with the output of this function.
-
         Parameters
         ----------
         filename : str
             Name for compressed-gzipped MERMAID data file. Should end in '.csv.gz'.
-
         Notes
         ------
         See also https://github.com/JEFworks/MERmaid
-
         """
         # construct the MERMAID dataframe. As MERMAID adds support for non-categorical variables,
         # additional columns can be added here
@@ -139,9 +133,7 @@ def to_mermaid(self, filename: str) -> pd.DataFrame:
     def to_expression_matrix(self) -> ExpressionMatrix:
         """
         Generates a cell x gene count matrix where each cell is annotated with spatial metadata.
-
         Requires that spots in the IntensityTable have been assigned to cells.
-
         Returns
         -------
         ExpressionMatrix :
diff --git a/starfish/core/spots/DecodeSpots/__init__.py b/starfish/core/spots/DecodeSpots/__init__.py
index 62803d464..71ed89c2e 100644
--- a/starfish/core/spots/DecodeSpots/__init__.py
+++ b/starfish/core/spots/DecodeSpots/__init__.py
@@ -2,6 +2,7 @@ from .metric_decoder import MetricDistance
 from .per_round_max_channel_decoder import PerRoundMaxChannel
 from .simple_lookup_decoder import SimpleLookupDecoder
+from .check_all_decoder import CheckAll
 
 # autodoc's automodule directive only captures the modules explicitly listed in __all__.
 __all__ = list(set(
diff --git a/starfish/core/spots/DecodeSpots/check_all_decoder.py b/starfish/core/spots/DecodeSpots/check_all_decoder.py
new file mode 100644
index 000000000..274d37ec8
--- /dev/null
+++ b/starfish/core/spots/DecodeSpots/check_all_decoder.py
@@ -0,0 +1,190 @@
+from typing import Any, Callable, Hashable, Mapping, Optional, Tuple
+import ray
+import pandas as pd
+import numpy as np
+from copy import deepcopy
+
+from starfish.core.codebook.codebook import Codebook
+from starfish.core.intensity_table.decoded_intensity_table import DecodedIntensityTable
+from starfish.core.intensity_table.intensity_table_coordinates import \
+    transfer_physical_coords_to_intensity_table
+from starfish.core.intensity_table.intensity_table import IntensityTable
+from starfish.core.types import SpotFindingResults
+from starfish.types import Axes, Features
+from ._base import DecodeSpotsAlgorithm
+
+from .check_all_funcs import findNeighbors, buildBarcodes, decoder, distanceFilter, cleanup, removeUsedSpots
+from .util import _merge_spots_by_round
+
+
+class CheckAll(DecodeSpotsAlgorithm):
+    """
+    Decode spots by selecting the max-valued channel in each sequencing round.
+
+    Note that this assumes that the codebook contains only one "on" channel per sequencing round,
+    a common pattern in experiments that assign one fluorophore to each DNA nucleotide and
+    read DNA sequentially. It is also a characteristic of single-molecule FISH and RNAscope
+    codebooks.
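+
+    A minimal usage sketch (``codebook`` is assumed to be an existing
+    :py:class:`Codebook` and ``spots`` a :py:class:`SpotFindingResults`;
+    parameter values are illustrative only)::
+
+        >>> from starfish.spots import DecodeSpots
+        >>> decoder = DecodeSpots.CheckAll(codebook=codebook, search_radius=3)
+        >>> decoded = decoder.run(spots=spots, n_processes=4)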
+ + Parameters + ---------- + codebook : Codebook + Contains codes to decode IntensityTable + trace_building_strategy: TraceBuildingStrategies + Defines the strategy for building spot traces to decode across rounds and chs of spot + finding results. + search_radius : Optional[int] + Only applicable if trace_building_strategy is TraceBuildingStrategies.NEAREST_NEIGHBORS. + Number of pixels over which to search for spots in other rounds and channels. + anchor_round : Optional[int] + Only applicable if trace_building_strategy is TraceBuildingStrategies.NEAREST_NEIGHBORS. + The imaging round against which other rounds will be checked for spots in the same + approximate pixel location. + """ + + def __init__( + self, + codebook: Codebook, + filter_rounds: Optional[int]=None, + search_radius: Optional[float]=3, + round_omit_num: Optional[int]=0): + self.codebook = codebook + self.filterRounds = filter_rounds + self.searchRadius = search_radius + self.roundOmitNum = round_omit_num + + def run(self, spots: SpotFindingResults, n_processes: int=1, *args) -> DecodedIntensityTable: + """Decode spots by selecting the max-valued channel in each sequencing round + + Parameters + ---------- + spots: SpotFindingResults + A Dict of tile indices and their corresponding measured spots + + Returns + ------- + DecodedIntensityTable : + IntensityTable decoded and appended with Features.TARGET and Features.QUALITY values. + + """ + + # Rename n_processes (trying to stay consistent between starFISH's _ variables and my camel case ones) + numJobs = n_processes + + # If using an search radius exactly equal to a possible distance between two pixels (ex: 1), some + # distances will be calculated as slightly less than their exact distance (either due to rounding or + # precision) so search radius needs to be slightly increased to ensure this doesn't happen + self.searchRadius += 0.001 + + # Initialize ray for multi_processing + ray.init(num_cpus=numJobs) + + # Create dictionary where keys are round labels and the values are pandas dataframes containing information on + # the spots found in that round + spotTables = _merge_spots_by_round(spots) + + # If user did not specify the filterRounds variable (it will have default value -1) change it to either one less + # than the number of rounds if roundOmitNum is 0 or the number of rounds minus the roundOmitNum if roundOmitNum > 0 + if self.filterRounds == None: + if self.roundOmitNum == 0: + self.filterRounds = len(spotTables) - 1 + else: + self.filterRounds = len(spotTables) - self.roundOmitNum + + + # Create dictionary of neighbors (within the search radius) in other rounds for each spot + neighborDict = findNeighbors(spotTables, self.searchRadius) + + # Create dictionary with mapping from spot id in spotTables to channel number and one with spot + # coordinates for fast access + channelDict = {} + spotCoords = {} + for r in [*spotTables]: + channelDict[r] = spotTables[r]['c'].to_dict() + spotCoords[r] = spotTables[r][['z','y','x']].T.to_dict() + + # Set list of round omission numbers to loop through + roundOmits = range(self.roundOmitNum+1) + + # Decode for each round omission number + allCodes = pd.DataFrame() + for currentRoundOmitNum in roundOmits: + decodedTables = {} + for r in range(len(spotTables)): + roundData = deepcopy(spotTables[r]) + + # Create dictionary of dataframes (based on perRoundSpotTables data) that contains additional columns for each spot + # containing all the possible barcodes that could be constructed from the neighbors of that spot + roundData = 
buildBarcodes(roundData, neighborDict, currentRoundOmitNum, channelDict, r, numJobs) + + # Match possible barcodes to codebook and add new columns with info about barcodes that had a codebook match + roundData = decoder(roundData, self.codebook, currentRoundOmitNum, r, numJobs) + + # Choose most likely barcode for each spot in each round by find the possible decodable barcode with the least + # spatial variance between the spots that made up the barcode + roundData = distanceFilter(roundData, spotCoords, r, numJobs) + + # Assign to DecodedTables dictionary + decodedTables[r] = roundData + + # Turn spot table dictionary into single table, filter barcodes by round frequency, add additional information, + # and choose between barcodes that use the same spot(s) + finalCodes = cleanup(decodedTables, spotCoords, self.filterRounds) + + # If this is not the last round omission number to run, remove spots that have just been found to be in + # passing barcodes from neighborDict so they are not used for the next round omission number + if currentRoundOmitNum != roundOmits[-1]: + neighborDict = removeUsedSpots(finalCodes, neighborDict) + + # Append found codes to allCodes table + allCodes = allCodes.append(finalCodes).reset_index(drop=True) + + # Shutdown ray + ray.shutdown() + + + # Create and fill in intensity table + channels=spots.ch_labels + rounds=spots.round_labels + + # create empty IntensityTable filled with np.nan + data = np.full((len(allCodes), len(channels), len(rounds)), fill_value=np.nan) + dims = (Features.AXIS, Axes.CH.value, Axes.ROUND.value) + centers = allCodes['center'] + coords: Mapping[Hashable, Tuple[str, Any]] = { + Features.SPOT_RADIUS: (Features.AXIS, np.full(len(allCodes), 1)), + Axes.ZPLANE.value: (Features.AXIS, np.asarray([round(c[2]) for c in centers])), + Axes.Y.value: (Features.AXIS, np.asarray([round(c[1]) for c in centers])), + Axes.X.value: (Features.AXIS, np.asarray([round(c[0]) for c in centers])), + Features.SPOT_ID: (Features.AXIS, np.arange(len(allCodes))), + Features.AXIS: (Features.AXIS, np.arange(len(allCodes))), + Axes.ROUND.value: (Axes.ROUND.value, rounds), + Axes.CH.value: (Axes.CH.value, channels) + } + intensity_table = IntensityTable(data=data, dims=dims, coords=coords) + + # Fill in data values + table_codes = [] + for i in range(len(allCodes)): + code = [] + for ch in allCodes.loc[i, 'best_barcodes']: + # If a round is not used, row will be all zeros + code.append(np.asarray([0 if j != ch else 1 for j in range(len(channels))])) + table_codes.append(np.asarray(code).T) + intensity_table.values = np.asarray(table_codes) + intensity_table = transfer_physical_coords_to_intensity_table(intensity_table=intensity_table, spots=spots) + intensities = intensity_table.transpose('features', 'r', 'c') + + self.codebook._validate_decode_intensity_input_matches_codebook_shape(intensities) + + # Create DecodedIntensityTable + result=DecodedIntensityTable.from_intensity_table( + intensities, + targets=(Features.AXIS, allCodes['best_targets'].astype('U')), + distances=(Features.AXIS, allCodes["best_distances"]), + passes_threshold=(Features.AXIS, np.full(len(allCodes), True)), + filter_tally=(Features.AXIS, allCodes['rounds_used'])) + + + return result diff --git a/starfish/core/spots/DecodeSpots/check_all_funcs.py b/starfish/core/spots/DecodeSpots/check_all_funcs.py new file mode 100644 index 000000000..d95197a94 --- /dev/null +++ b/starfish/core/spots/DecodeSpots/check_all_funcs.py @@ -0,0 +1,695 @@ +# General modules +import time # TODO: delete for final version 
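+# A note on the parallel pattern used throughout this module: each public function below
+# chunks its spot table into numJobs slices, scatters large shared inputs into ray's object
+# store once, and gathers per-chunk results. A schematic sketch (names are illustrative,
+# not part of this module):
+#
+#     sharedID = ray.put(bigSharedDict)                              # scatter shared state once
+#     futures = [func.remote(chunk, sharedID) for chunk in chunks]   # one task per chunk
+#     results = ray.get(futures)                                     # gather, in submission order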
+import pickle # TODO: delete for final version +from collections import Counter +from scipy.spatial import cKDTree +from copy import deepcopy +from itertools import product, chain, permutations, combinations +import math +from collections import defaultdict +from typing import Any, Hashable, Mapping, Tuple +import ray +import numpy as np +import pandas as pd +import xarray as xr +import warnings +warnings.filterwarnings('ignore') + +# starFISH stuff +from starfish.types import Axes, Coordinates, CoordinateValue, Features +from starfish.core.codebook.codebook import Codebook + + +def findNeighbors(spotTables: pd.DataFrame, + searchRadius: float) -> dict: + + ''' + Function that takes spatial information from the spot tables from each round and creates a dictionary that contains + all the neighbors for each spot in other rounds that are within the search radius. + + Parameters + ---------- + spotTables : pd.DataFrame + Dictionary with round labels as keys and pandas dataframes containing spot information for its key round + as values (result of _merge_spots_by_round function) + + searchRadius : float + Distance that spots can be from each other and still form a barcode + + roundLabels : list + List of round index labels (can extract using .round_lables on a SpotFindingResults object) + + Returns + ------- + dict: a dictionary with the following structure: + {round: { + spotID in round: { + neighborRound: + [list of spotIDs in neighborRound within searchRadius of spotID in round] + } + } + } + ''' + + # Create empty neighbor dictionary + neighborDict = {} + for r in spotTables: + neighborDict[r] = {i: defaultdict(list, {r: [i]}) for i in range(len(spotTables[r]))} + + # For each pairing of rounds, find all mutual neighbors within the search radius for each spot and assigns them + # in the neighborDict dictionary + # Number assigned each spot in neighborDict is the index of it's original location in spotTables (also its spot_id) + # and is used to track each spot uniquely throughout + for i,r1 in enumerate(range((len(spotTables)))): + tree = cKDTree(spotTables[r1][['z', 'y', 'x']]) + for r2 in list(range((len(spotTables))))[i+1:]: + allNeighbors = tree.query_ball_point(spotTables[r2][['z', 'y', 'x']], searchRadius) + for j,neighbors in enumerate(allNeighbors): + if neighbors != []: + for neighbor in neighbors: + neighborDict[r1][neighbor][r2].append(j) + neighborDict[r2][j][r1].append(neighbor) + + return neighborDict + + +def buildBarcodes(roundData: dict, + neighborDict: dict, + roundOmitNum: int, + channelDict: dict, + currentRound: int, + numJobs: int) -> dict: + + ''' + Function that creates a copy of the spotTables dictionary and adds to it's tables all the possible barcodes + that could be formed using the neighbors of each spot, spots without enough neighbors to form a barcode are + dropped. + + Parameters + ---------- + spotTables : dict + Dictionary with round labels as keys and pandas dataframes containing spot information for its key round + as values (result of _merge_spots_by_round function) + + neighborDict : dict + Dictionary that contains all the neighbors for each spot in other rounds that are within the search radius + + roundOmitNum : int + Maximum hamming distance a barcode can be from it's target in the codebook and still be uniquely identified + (i.e. 
number of error correction rounds in each the experiment) + + numJobs : int + Number of CPU threads to use in parallel + + Returns + ------- + dict : Copy of spotTables with additional columns in each table which lists all possible barcodes + that could be made from each spot's neighbors + + ''' + + + @ray.remote + def barcodeBuildFunc(data: pd.DataFrame, + channelDict: dict, + rang: tuple, + roundOmitNum: int, + roundNum: int): + ''' + Subfunction to buildBarcodes that allows it to run in parallel chunks using ray + + Parameters + ---------- + data : pd.DataFrame + Spot table from barcodeTables for the current round + + channelDict : dict + Dictionary mapping spot IDs to their channels labels + + rang : tuple + Range of indices to build barcodes for in the current data object + + roundOmitNum : int + Maximum hamming distance a barcode can be from it's target in the codebook and still be uniquely + identified (i.e. number of error correction rounds in each the experiment) + + Returns + ------- + tuple : First element is a list of the possible spot codes while the second element is a list of the + possible barcodes + ''' + + # Build barcodes from neighbors + # spotCodes are the ordered spot IDs of the spots making up each barcode while barcodes are the corresponding + # channel labels, need spotCodes so each barcode can have a unique identifier + allSpotCodes = [] + allBarcodes = [] + allNeighbors = list(data['neighbors'])[rang[0]: rang[1]] + for i in range(len(allNeighbors)): + neighbors = deepcopy(allNeighbors[i]) + neighborLists = [] + for rnd in range(roundNum): + # Adds a -1 to each round of the neighbors dictionary (allows barcodes with dropped rounds to + # be created) + if roundOmitNum > 0: + neighbors[rnd].append(-1) + neighborLists.append(neighbors[rnd]) + # Creates all possible spot code combinations from neighbors + codes = list(product(*neighborLists)) + # Only save the ones with the correct number of dropped rounds + spotCodes = [code for code in codes if Counter(code)[-1] == roundOmitNum] + # Create barcodes from spot codes using the mapping from spot ID to channel + barcodes = [] + for spotCode in spotCodes: + barcode = [] + for spotInd in range(len(spotCode)): + if spotCode[spotInd] == -1: + barcode.append(-1) + else: + barcode.append(channelDict[spotInd][spotCode[spotInd]]) + barcodes.append(tuple(barcode)) + + allBarcodes.append(barcodes) + allSpotCodes.append(spotCodes) + + return (allSpotCodes, allBarcodes) + + # Only keep spots that have enough neighbors to form a barcode (determined by the total number of round and the + # number of rounds that can be omitted from each code) + passingSpots = {} + roundNum = len(neighborDict) + for key in neighborDict[currentRound]: + if len(neighborDict[currentRound][key]) >= roundNum-roundOmitNum: + passingSpots[key] = neighborDict[currentRound][key] + passed = list(passingSpots.keys()) + roundData = roundData.iloc[passed] + roundData['neighbors'] = [passingSpots[i] for i in roundData.index] + roundData = roundData.reset_index(drop=True) + + + # Find all possible barcodes for the spots in each round by splitting each round's spots into numJob chunks and + # constructing each chunks barcodes in parallel + + + # Save the current round's data table and the channelDict to ray memory + dataID = ray.put(roundData) + channelDictID = ray.put(channelDict) + + # Calculates index ranges to chunk data by + ranges = [0] + for i in range(1, numJobs+1): + ranges.append(int((len(roundData)/numJobs)*i)) + + # Run in parallel + results = 
[barcodeBuildFunc.remote(dataID, channelDictID, (ranges[i], ranges[i+1]), roundOmitNum, roundNum) for i in range(len(ranges[:-1]))] + rayResults = ray.get(results) + + # Add possible barcodes and spot codes (same order) to spot dictionary (must chain results rom different jobs + # together) + roundData['spot_codes'] = list(chain(*[job[0] for job in rayResults])) + roundData['barcodes'] = list(chain(*[job[1] for job in rayResults])) + + return roundData + +def decoder(roundData: dict, + codebook: Codebook, + roundOmitNum: int, + currentRound: int, + numJobs: int): + + ''' + Function that takes spots tables with possible barcodes added and matches each to the codebook to identify any + matches. Matches are added to the spot tables and spots without any matches are dropped + + Parameters + ---------- + barcodeTables : dict + Dictionary with modified spot tables containing all possible barcodes that can be made from each spot + + codebook : Codebook + starFISH Codebook object containg the barcode information for the experiment + + roundOmitNum : int + Number of rounds that can be dropped from each barcode + + numJobs : int + Number of CPU threads to use in parallel + + Returns + ------- + dict : barcodeTables dictionary with added columns with information on decodable barcodes + ''' + + def generateRoundPermutations(size: int, roundOmitNum: int) -> list: + ''' + Creates list of lists of logicals detailing the rounds to be used for decoding based on the current roundOmitNum + + Parameters + ---------- + size : int + Number of rounds in experiment + + roundOmitNum: int + Number of rounds that can be dropped from each barcode + + Returns + ------- + list : list of lists of logicals detailing the rounds to be used for decoding based on the current roundOmitNum + ''' + if roundOmitNum == 0: + return [tuple([True]*size)] + else: + return sorted(set(list(permutations([False]*roundOmitNum + [True]*(size-roundOmitNum))))) + + + @ray.remote + def decodeFunc(data: pd.DataFrame, + roundPermutations: list, + permutationCodes: dict, + rnd: int) -> tuple: + + ''' + Subfunction for decoder that allows it to run in parallel chunks using ray + + Parameters + ---------- + data : pd.DataFrame + Spot table from barcodeTables for the current round + + roundPermutations : list + List of logicals from generateRoundPermutations that details the rounds to use in decoding + + permutationCodes : dict + Dictionary containing barcode information for each roundPermutation + + rang : tuple + Range of indices to decode barcodes for in the current data object + + rnd : int + Current round being decoded + + Returns + ------- + tuple : First element is a list of all decoded targets, second element is a list of all decoded barcodes, + third element is a list of all decoded spot codes, and the fourth element is a list of rounds + that were omitted for each decoded barcode + ''' + + # Goes through all possible decodings of each spot (ensures each spot is only looked up once) + allTargets = [] + allDecodedBarcodes = [] + allDecodedSpotCodes = [] + allRoundOmit = [] + allBarcodes = list(data['barcodes']) + allSpotCodes = list(data['spot_codes']) + for i in range(len(allBarcodes)): + targets = [] + decodedBarcodes = [] + decodedSpotCodes = [] + roundOmit = [] + fullBarcodes = allBarcodes[i] + fullSpotCodes = allSpotCodes[i] + + for currentRounds in roundPermutations: + + # Set omittedRound to the round being dropped, if no round is dropped omittedRound becomes -1 + if 0 in currentRounds: + omittedRound = np.argwhere([not cr for cr in 
currentRounds])[0][0] + else: + omittedRound = -1 + + # Only try to decode barcodes for this spot if the current round is being omitted from the barcodes, we + # don't want to try to assign barcodes for spots from that round + if rnd != omittedRound: + # Modify spot codes and barcodes so that they match the current set of rounds being used for decoding + if omittedRound != -1: + spotCodes = [code for code in np.asarray([np.asarray(spotCode)[list(currentRounds)] for spotCode in fullSpotCodes]) if -1 not in code] + barcodes = [code for code in np.asarray([np.asarray(barcode)[list(currentRounds)] for barcode in fullBarcodes]) if -1 not in code] + else: + spotCodes = fullSpotCodes + barcodes = fullBarcodes + # If all barcodes omit a round other than omittedRound, barcodes will be empty + if len(barcodes) > 0: + # Tries to find a match to each possible barcode from the spot + for j,barcode in enumerate(barcodes): + try: + # Try to assign target by using barcode as key in permutationsCodes dictionary for + # current set of rounds. If there is no barcode match, it will error and go to the except + # and if it succeeds it will add the data to the other lists for this barcode + targets.append(permutationCodes[currentRounds][tuple(barcode)]) + decodedBarcodes.append(barcode) + decodedSpotCodes.append(list(spotCodes[j])) + roundOmit.append(omittedRound) + except: + pass + allTargets.append(targets) + allDecodedBarcodes.append(decodedBarcodes) + allDecodedSpotCodes.append(decodedSpotCodes) + allRoundOmit.append(roundOmit) + + return (allTargets, allDecodedBarcodes, allDecodedSpotCodes, allRoundOmit) + + # Create list of logical arrays corresponding to the round sets being used to decode + roundPermutations = generateRoundPermutations(codebook.sizes[Axes.ROUND], roundOmitNum) + + + # Create dictionary where the keys are the different round sets that can be used for decoding and the values + # are the modified codebooks corresponding to the rounds used + permCodeDict = {} + for currentRounds in roundPermutations: + codes = codebook.argmax(Axes.CH.value) + currentCodes = codes.sel(r=list(currentRounds)) + currentCodes.values = np.ascontiguousarray(currentCodes.values) + permCodeDict[currentRounds] = dict(zip([tuple(code) for code in currentCodes.data], currentCodes['target'].data)) + + + # Goes through each round in filtered_prsr and tries to decode each spot's barcodes + roundNum = len(codebook['r']) + + + # Put data table and permutations codes dictionary in ray storage + permutationCodesID = ray.put(permCodeDict) + + # Calculates index ranges to chunk data by + ranges = [0] + for i in range(1, numJobs+1): + ranges.append(int((len(roundData)/numJobs)*i)) + chunkedData = [] + for i in range(len(ranges[:-1])): + chunkedData.append(deepcopy(roundData[ranges[i]:ranges[i+1]])) + + # Run in parallel + results = [decodeFunc.remote(chunkedData[i], roundPermutations, permutationCodesID, currentRound) for i in range(len(ranges[:-1]))] + rayResults = ray.get(results) + + # Update table + roundData['targets'] = list(chain(*[job[0] for job in rayResults])) + roundData['decoded_barcodes'] = list(chain(*[job[1] for job in rayResults])) + roundData['decoded_spot_codes'] = list(chain(*[job[2] for job in rayResults])) + roundData['omitted_round'] = list(chain(*[job[3] for job in rayResults])) + + # Drop barcodes and spot_codes column (saves memory) + roundData = roundData.drop(['neighbors', 'spot_codes', 'barcodes'], axis=1) + + + # Remove rows that have no decoded barcodes and add -1 spacer back into partial 
barcodes/spot codes so we can + # easily tell which round each spot ID is from + keep = [] + allBarcodes = [] + allSpotCodes = [] + roundData = roundData[roundData['targets'].astype(bool)].reset_index(drop=True) + dataBarcodes = roundData['decoded_barcodes'] + dataSpotCodes = roundData['decoded_spot_codes'] + dataOmittedRounds = roundData['omitted_round'] + for i in range(len(roundData)): + barcodes = [list(code) for code in dataBarcodes[i]] + spotCodes = [list(code) for code in dataSpotCodes[i]] + omittedRounds = dataOmittedRounds[i] + if omittedRounds[0] != -1: + barcodes = [barcodes[j][:omittedRounds[j]] + [-1] + barcodes[j][omittedRounds[j]:] for j in range(len(barcodes))] + spotCodes = [spotCodes[j][:omittedRounds[j]] + [-1] + spotCodes[j][omittedRounds[j]:] for j in range(len(barcodes))] + allBarcodes.append(barcodes) + allSpotCodes.append(spotCodes) + roundData['decoded_barcodes'] = allBarcodes + roundData['decoded_spot_codes'] = allSpotCodes + + return roundData + +def distanceFilter(roundData: dict, + spotCoords: dict, + currentRound: int, + numJobs: int) -> tuple: + ''' + Function that chooses between the best barcode for each spot from the set of decodable barcodes. Does this by + choosing the barcode with the least spatial variance among the spots that make it up. If there is a tie, the spot + is dropped as ambiguous + + Parameters + ---------- + decodedTables : dict + Dictionary containing modified spot tables with decoded barcode information + + spotTables : dict + Original spot tables dictionary without any added columns + + numJobs : int + Number of CPU threads to use in parallel + + Returns + ------- + tuple : First element is a modified version of decodedTables with added columns to tables with info on the + "best" barcode found for each spot and the second element is a dictionary containing spatial locations + for spots by their IDs in the original spotTables object + ''' + + @ray.remote + def distanceFunc(subSpotCodes: list, spotCoords: dict) -> list: + ''' + Subfunction for distanceFilter to allow it to run in parallel using ray + + Parameters + ---------- + subSpotCodes : list + Chunk of full list of spot codes for the current round to calculate the spatial variance for + + spotCoords : dict + Dictionary containing spatial locations for spots by their IDs in the original spotTables object + + + Returns + ------- + list: list of spatial variances for the current chunk of spot codes + + ''' + + # Calculate spatial variances for current chunk of spot codes + allDistances = [] + for spotCodes in subSpotCodes: + distances = [] + for s,spotCode in enumerate(spotCodes): + coords = [] + for j,spot in enumerate(spotCode): + if spot != -1: + # Extract spot coordinates from spotCoords + z = spotCoords[j][spot]['z'] + y = spotCoords[j][spot]['y'] + x = spotCoords[j][spot]['x'] + coords.append([z, y, x]) + coords = np.asarray(coords) + # Distance is calculate as the sum of variances of the coordinates along each axis + distances.append(sum(np.var(coords, axis = 0))) + allDistances.append(distances) + return allDistances + + + # Calculate the spatial variance for each decodable barcode for each spot in each round + allSpotCodes = roundData['decoded_spot_codes'] + + + # Put spotCoords dictionary into ray memory + spotCoordsID = ray.put(spotCoords) + + # Calculates index ranges to chunk data by + ranges = [0] + for i in range(1, numJobs): + ranges.append(int((len(roundData)/numJobs)*i)) + ranges.append(len(roundData)) + chunkedSpotCodes = [allSpotCodes[ranges[i]:ranges[i+1]] for i in 
range(len(ranges[:-1]))] + + # Run in parallel using ray + results = [distanceFunc.remote(subSpotCodes, spotCoordsID) for subSpotCodes in chunkedSpotCodes] + rayResults = ray.get(results) + + # Add distances to decodedTables as new column + roundData['distance'] = list(chain(*[job for job in rayResults])) + + + + # Pick minimum distance barcode(s) for each spot + bestSpotCodes = [] + bestBarcodes = [] + bestTargets = [] + bestDistances = [] + dataSpotCodes = list(roundData['decoded_spot_codes']) + dataBarcodes = list(roundData['decoded_barcodes']) + dataDistances = list(roundData['distance']) + dataTargets = list(roundData['targets']) + for i in range(len(roundData)): + spotCodes = dataSpotCodes[i] + barcodes = dataBarcodes[i] + distances = dataDistances[i] + targets = dataTargets[i] + # If only one barcode to choose from, that one is picked as best + if len(distances) == 1: + bestSpotCodes.append(spotCodes) + bestBarcodes.append(barcodes) + bestTargets.append(targets) + bestDistances.append(distances) + # Otherwise find the minimum, and if there are multiple minimums + else: + minDist = 100 + minCount = 0 + for d,distance in enumerate(distances): + if distance < minDist: + minDist = distance + minCount = 1 + minInds = [] + minInds.append(d) + elif distance == minDist: + minCount += 1 + minInds.append(d) + bestSpotCodes.append([spotCodes[i] for i in range(len(spotCodes)) if i in minInds]) + bestBarcodes.append([barcodes[i] for i in range(len(barcodes)) if i in minInds]) + bestTargets.append([targets[i] for i in range(len(targets)) if i in minInds]) + bestDistances.append([distances[i] for i in range(len(distances)) if i in minInds]) + # Create new columns with minimum distance barcode information + roundData['best_spot_codes'] = bestSpotCodes + roundData['best_barcodes'] = bestBarcodes + roundData['best_targets'] = bestTargets + roundData['best_distances'] = bestDistances + + # Drop old columns + roundData = roundData.drop(['targets', 'decoded_barcodes', 'decoded_spot_codes', 'omitted_round'], axis=1) + + + # Only keep barcodes with only one minimum distance + keep = [] + barcodes = roundData['best_barcodes'] + for i in range(len(roundData)): + if len(barcodes[i]) == 1: + keep.append(i) + roundData = roundData.iloc[keep] + + return roundData + + +def cleanup(bestPerSpotTables: dict, + spotCoords: dict, + filterRounds: int) -> pd.DataFrame: + + ''' + Function that combines all "best" codes for each spot in each round into a single table, filters them by their + frequency (with a user-defined threshold), chooses between overlapping codes (using the same distance function + as used earlier), and finally adds some additional information to the final set of barcodes + + Parameters + ---------- + bestPerSpotTables : dict + Spot tables dictionary containing columns with information on the "best" barcode found for each spot + + spotCoords : dict + Dictionary containing spatial locations of spots + + filterRounds : int + Number of rounds that a barcode must be identified in to pass filters (higher = more stringent filtering), + default = 1 - #rounds or 1 - roundOmitNum if roundOmitNum > 0 + + Returns + ------- + pd.DataFrame : Dataframe containing final set of codes that have passed all filters + + ''' + + # Create merged spot results dataframe containing the passing barcodes found in all the rounds + mergedCodes = pd.DataFrame() + roundNum = len(bestPerSpotTables) + for r in range(roundNum): + barcodes = bestPerSpotTables[r]['best_barcodes'] + spotCodes = bestPerSpotTables[r]['best_spot_codes'] + 
targets = bestPerSpotTables[r]['best_targets'] + distances = bestPerSpotTables[r]['best_distances'] + # Turn each barcode and spot code into a tuple so they can be used as dictionary keys + bestPerSpotTables[r]['best_barcodes'] = [tuple(barcode[0]) for barcode in barcodes] + bestPerSpotTables[r]['best_spot_codes'] = [tuple(spotCode[0]) for spotCode in spotCodes] + bestPerSpotTables[r]['best_targets'] = [target[0] for target in targets] + bestPerSpotTables[r]['best_distances'] = [distance[0] for distance in distances] + mergedCodes = mergedCodes.append(bestPerSpotTables[r]) + mergedCodes = mergedCodes.reset_index(drop=True) + + # Only use codes that were found in >= filterRounds rounds + spotCodes = mergedCodes['best_spot_codes'] + counts = defaultdict(int) + for code in spotCodes: + counts[code] += 1 + passing = list(set(code for code in counts if counts[code] >= filterRounds)) + finalCodes = mergedCodes[mergedCodes['best_spot_codes'].isin(passing)].reset_index(drop=True) + finalCodes = finalCodes.iloc[finalCodes['best_spot_codes'].drop_duplicates().index].reset_index(drop=True) + + # Choose between overlapping spot codes based on which has the smaller spatial variance + for r in range(roundNum): + roundSpots = [code[r] for code in finalCodes['best_spot_codes'] if code[r] != -1] + dupSpots = set([spot for spot in roundSpots if Counter(roundSpots)[spot] > 1]) + nonDupSpots = [spot for spot in roundSpots if spot not in dupSpots] + drop = [] + for spot in dupSpots: + locs = np.where(np.asarray(roundSpots) == spot)[0] + distances = [finalCodes.loc[loc, 'best_distances'] for loc in locs] + minInd = np.where(distances == min(distances))[0] + if len(minInd) > 1: + drop.extend([ind for ind in minInd]) + else: + drop.extend([locs[i] for i in range(len(locs)) if i != minInd]) + finalCodes = finalCodes.iloc[[i for i in range(len(finalCodes)) if i not in drop]].reset_index(drop=True) + + # Add spot coordinates, barcode center coordinates, and number of rounds used for each barcode to table + allCoords = [] + centers = [] + distance = [] + roundsUsed = [] + for i in range(len(finalCodes)): + coords = [] + spotCode = finalCodes.iloc[i]['best_spot_codes'] + roundsUsed.append(roundNum-Counter(spotCode)[-1]) + for r in range(roundNum): + if spotCode[r] != -1: + z = spotCoords[r][spotCode[r]]['z'] + y = spotCoords[r][spotCode[r]]['y'] + x = spotCoords[r][spotCode[r]]['x'] + coords.append((x,y,z)) + else: + coords.append(-1) + allCoords.append(coords) + coords = np.asarray([coord for coord in coords if coord != -1]) + center = np.asarray(coords).mean(axis=0) + centers.append(center) + finalCodes['coords'] = allCoords + finalCodes['center'] = centers + finalCodes['rounds_used'] = roundsUsed + + return finalCodes + +def removeUsedSpots(finalCodes: pd.DataFrame, neighborDict: dict) -> dict: + ''' + Remove spots found to be in barcodes for the current round omission number so they are not used for the next + + Parameters + ---------- + finalCodes : pd.DataFrame + Dataframe containing final set of codes that have passed all filters + + neighborDict : dict + Dictionary that contains all the neighbors for each spot in other rounds that are within the search radius + + Returns + ------- + dict : Modified version of neighborDict with spots that have been used in the current round omission removed + ''' + + # Remove used spots + roundNum = len(neighborDict) + for r in range(roundNum): + usedSpots = list(set([passed[r] for passed in finalCodes['best_spot_codes'] if passed[r] != -1])) + for spot in usedSpots: + for key 
in neighborDict[r][spot]:
+                for neighbor in neighborDict[r][spot][key]:
+                    neighborDict[key][neighbor][r] = [i for i in neighborDict[key][neighbor][r] if i != spot]
+            del neighborDict[r][spot]
+
+    # Remove empty lists
+    for r in range(roundNum):
+        for spot in neighborDict[r]:
+            for key in [*neighborDict[r][spot]]:
+                if neighborDict[r][spot][key] == []:
+                    del neighborDict[r][spot][key]
+
+    return neighborDict
\ No newline at end of file

From 9386a03c6a4fd7f6dc2c9993a66e1de98edc80e3 Mon Sep 17 00:00:00 2001
From: nickeener
Date: Tue, 12 Oct 2021 14:19:49 -0700
Subject: [PATCH 02/30] Updated comments

---
 .../spots/DecodeSpots/check_all_decoder.py    |  73 ++++++----
 .../core/spots/DecodeSpots/check_all_funcs.py | 137 +++++++++---------
 2 files changed, 114 insertions(+), 96 deletions(-)

diff --git a/starfish/core/spots/DecodeSpots/check_all_decoder.py b/starfish/core/spots/DecodeSpots/check_all_decoder.py
index 274d37ec8..af16c4684 100644
--- a/starfish/core/spots/DecodeSpots/check_all_decoder.py
+++ b/starfish/core/spots/DecodeSpots/check_all_decoder.py
@@ -20,27 +20,40 @@ class CheckAll(DecodeSpotsAlgorithm):
     """
-    Decode spots by selecting the max-valued channel in each sequencing round.
-
-    Note that this assumes that the codebook contains only one "on" channel per sequencing round,
-    a common pattern in experiments that assign one fluorophore to each DNA nucleotide and
-    read DNA sequentially. It is also a characteristic of single-molecule FISH and RNAscope
-    codebooks.
+    Decode spots by generating all possible combinations of spots that could form barcodes, given
+    a search radius within which spots must lie to be combined, then choosing the best set of
+    non-overlapping spot combinations: those with the least spatial variance among their spot
+    coordinates that are also found to be best for multiple spots in the barcode (see algorithm
+    below). Allows for error-correction rounds.
+
+    (see input parameters below)
+    1. For each spot in each round, find all neighbors in other rounds that are within the search
+       radius.
+    2. For each spot in each round, build all possible full-length barcodes based on the channel
+       labels of the spot's neighbors and itself.
+    3. Drop barcodes that don't have a matching target in the codebook.
+    4. Choose the "best" barcode among each spot's possible target-matching barcodes by
+       calculating the sum of variances for each of the spatial coordinates of the spots that make
+       up each barcode and choosing the minimum-variance barcode (if there is a tie, they are all
+       dropped as ambiguous). Each spot is assigned a "best" barcode in this way.
+    5. Only keep barcodes/targets that were found as "best" in a certain number of rounds
+       (determined by the filter_rounds parameter).
+    6. If a specific spot is used in more than one of the remaining barcodes, the barcode with the
+       higher spatial variance between its spots is dropped (this ensures each spot is used at
+       most once).
+    (End here if error_rounds = 0.)
+    7. Remove all spots used in decoded targets that passed the previous filtering steps from the
+       original set of spots.
+    8. Rerun steps 2-5 for barcodes that use fewer than the full set of rounds for codebook
+       matching (how many rounds can be dropped is determined by the error_rounds parameter).
 
     Parameters
     ----------
     codebook : Codebook
         Contains codes to decode IntensityTable
-    trace_building_strategy: TraceBuildingStrategies
-        Defines the strategy for building spot traces to decode across rounds and chs of spot
-        finding results.
- search_radius : Optional[int] - Only applicable if trace_building_strategy is TraceBuildingStrategies.NEAREST_NEIGHBORS. + search_radius : Optional[float] Number of pixels over which to search for spots in other rounds and channels. - anchor_round : Optional[int] - Only applicable if trace_building_strategy is TraceBuildingStrategies.NEAREST_NEIGHBORS. - The imaging round against which other rounds will be checked for spots in the same - approximate pixel location. + filterRounds : Optional[int] + Number of rounds that a barcode must be identified in to pass filters (higher = more stringent filtering), + default = #rounds - 1 or #rounds - error_rounds if error_rounds > 0 + error_rounds : Optional[int] + Maximum hamming distance a barcode can be from it's target in the codebook and still be uniquely identified + (i.e. number of error correction rounds in each the experiment) """ def __init__( @@ -48,14 +61,15 @@ def __init__( codebook: Codebook, filter_rounds: Optional[int]=None, search_radius: Optional[float]=3, - round_omit_num: Optional[int]=0): + error_rounds: Optional[int]=0): self.codebook = codebook self.filterRounds = filter_rounds self.searchRadius = search_radius - self.roundOmitNum = round_omit_num + self.errorRounds = error_rounds - def run(self, spots: SpotFindingResults, n_processes: int=1, *args) -> DecodedIntensityTable: - """Decode spots by selecting the max-valued channel in each sequencing round + def run(self, spots: SpotFindingResults, n_processes: Optional[int]=1, *args) -> DecodedIntensityTable: + """ + Decode spots by finding the set of nonoverlapping barcodes that have the minimum spatial variance within each barcode Parameters ---------- @@ -74,7 +88,7 @@ def run(self, spots: SpotFindingResults, n_processes: int=1, *args) -> DecodedIn # If using an search radius exactly equal to a possible distance between two pixels (ex: 1), some # distances will be calculated as slightly less than their exact distance (either due to rounding or - # precision) so search radius needs to be slightly increased to ensure this doesn't happen + # precision errors) so search radius needs to be slightly increased to ensure this doesn't happen self.searchRadius += 0.001 # Initialize ray for multi_processing @@ -84,19 +98,19 @@ def run(self, spots: SpotFindingResults, n_processes: int=1, *args) -> DecodedIn # the spots found in that round spotTables = _merge_spots_by_round(spots) - # If user did not specify the filterRounds variable (it will have default value -1) change it to either one less + # If user did not specify the filterRounds variable (it will have default value None), change it to either one less # than the number of rounds if roundOmitNum is 0 or the number of rounds minus the roundOmitNum if roundOmitNum > 0 if self.filterRounds == None: if self.roundOmitNum == 0: self.filterRounds = len(spotTables) - 1 else: - self.filterRounds = len(spotTables) - self.roundOmitNum + self.filterRounds = len(spotTables) - self.errorRounds # Create dictionary of neighbors (within the search radius) in other rounds for each spot neighborDict = findNeighbors(spotTables, self.searchRadius) - # Create dictionary with mapping from spot id in spotTables to channel number and one with spot + # Create dictionaries with mapping from spot id (row index) in spotTables to channel number and one with spot # coordinates for fast access channelDict = {} spotCoords = {} @@ -105,16 +119,20 @@ def run(self, spots: SpotFindingResults, n_processes: int=1, *args) -> DecodedIn spotCoords[r] = 
spotTables[r][['z','y','x']].T.to_dict() # Set list of round omission numbers to loop through - roundOmits = range(self.roundOmitNum+1) + roundOmits = range(self.errorRounds+1) - # Decode for each round omission number + # Decode for each round omission number, store results in allCodes table allCodes = pd.DataFrame() for currentRoundOmitNum in roundOmits: + + # Chooses best barcode for all spots in each round sequentially (possible barcode space can become quite large which + # can increase memory needs so I do it this way so we only need to store all potential barcodes that originate from + # one round at a time) decodedTables = {} for r in range(len(spotTables)): roundData = deepcopy(spotTables[r]) - # Create dictionary of dataframes (based on perRoundSpotTables data) that contains additional columns for each spot + # Create dictionary of dataframes (based on spotTables data) that contains additional columns for each spot # containing all the possible barcodes that could be constructed from the neighbors of that spot roundData = buildBarcodes(roundData, neighborDict, currentRoundOmitNum, channelDict, r, numJobs) @@ -129,7 +147,7 @@ def run(self, spots: SpotFindingResults, n_processes: int=1, *args) -> DecodedIn decodedTables[r] = roundData # Turn spot table dictionary into single table, filter barcodes by round frequency, add additional information, - # and choose between barcodes that use the same spot(s) + # and choose between barcodes that have overlapping spots finalCodes = cleanup(decodedTables, spotCoords, self.filterRounds) # If this is not the last round omission number to run, remove spots that have just been found to be in @@ -176,6 +194,7 @@ def run(self, spots: SpotFindingResults, n_processes: int=1, *args) -> DecodedIn intensity_table = transfer_physical_coords_to_intensity_table(intensity_table=intensity_table, spots=spots) intensities = intensity_table.transpose('features', 'r', 'c') + # Validate results are correct shape self.codebook._validate_decode_intensity_input_matches_codebook_shape(intensities) # Create DecodedIntensityTable diff --git a/starfish/core/spots/DecodeSpots/check_all_funcs.py b/starfish/core/spots/DecodeSpots/check_all_funcs.py index d95197a94..2485a26e3 100644 --- a/starfish/core/spots/DecodeSpots/check_all_funcs.py +++ b/starfish/core/spots/DecodeSpots/check_all_funcs.py @@ -20,8 +20,7 @@ from starfish.core.codebook.codebook import Codebook -def findNeighbors(spotTables: pd.DataFrame, - searchRadius: float) -> dict: +def findNeighbors(spotTables: dict, searchRadius: float) -> dict: ''' Function that takes spatial information from the spot tables from each round and creates a dictionary that contains @@ -29,15 +28,12 @@ def findNeighbors(spotTables: pd.DataFrame, Parameters ---------- - spotTables : pd.DataFrame + spotTables : dict Dictionary with round labels as keys and pandas dataframes containing spot information for its key round as values (result of _merge_spots_by_round function) searchRadius : float Distance that spots can be from each other and still form a barcode - - roundLabels : list - List of round index labels (can extract using .round_lables on a SpotFindingResults object) Returns ------- @@ -58,7 +54,7 @@ def findNeighbors(spotTables: pd.DataFrame, # For each pairing of rounds, find all mutual neighbors within the search radius for each spot and assigns them # in the neighborDict dictionary - # Number assigned each spot in neighborDict is the index of it's original location in spotTables (also its spot_id) + # Number assigned each 
spot in neighborDict is the index of its original location in spotTables
     # and is used to track each spot uniquely throughout
     for i,r1 in enumerate(range((len(spotTables)))):
         tree = cKDTree(spotTables[r1][['z', 'y', 'x']])
@@ -73,55 +69,58 @@ def findNeighbors(spotTables: pd.DataFrame,
 
     return neighborDict
 
-def buildBarcodes(roundData: dict,
+def buildBarcodes(roundData: pd.DataFrame,
                   neighborDict: dict,
                   roundOmitNum: int,
                   channelDict: dict,
                   currentRound: int,
-                  numJobs: int) -> dict:
+                  numJobs: int) -> pd.DataFrame:
 
     '''
-    Function that creates a copy of the spotTables dictionary and adds to it's tables all the possible barcodes
-    that could be formed using the neighbors of each spot, spots without enough neighbors to form a barcode are
-    dropped.
+    Function that adds to the current round's spot table all the possible barcodes that could be
+    formed using the neighbors of each spot; spots without enough neighbors to form a barcode are
+    dropped.
 
     Parameters
    ----------
-    spotTables : dict
-        Dictionary with round labels as keys and pandas dataframes containing spot information for its key round
-        as values (result of _merge_spots_by_round function)
+    roundData : pd.DataFrame
+        Spot data table for the current round
 
     neighborDict : dict
         Dictionary that contains all the neighbors for each spot in other rounds that are within the search radius
 
     roundOmitNum : int
         Maximum hamming distance a barcode can be from its target in the codebook and still be uniquely identified
-        (i.e. number of error correction rounds in each the experiment)
+        (i.e. number of error correction rounds in the experiment)
+
+    channelDict : dict
+        Dictionary with mappings between spot IDs and their channel labels
+
+    currentRound : int
+        Current round to build barcodes for (same round that roundData is from)
 
     numJobs : int
         Number of CPU threads to use in parallel
 
     Returns
     -------
-    dict : Copy of spotTables with additional columns in each table which lists all possible barcodes
+    pd.DataFrame : Copy of roundData with additional columns which list all possible barcodes
         that could be made from each spot's neighbors
 
     '''
 
     @ray.remote
     def barcodeBuildFunc(data: pd.DataFrame,
                          channelDict: dict,
                          rang: tuple,
                          roundOmitNum: int,
-                         roundNum: int):
+                         roundNum: int) -> tuple:
         '''
         Subfunction to buildBarcodes that allows it to run in parallel chunks using ray
 
         Parameters
         ----------
         data : pd.DataFrame
-            Spot table from barcodeTables for the current round
+            Spot table for the current round
 
         channelDict : dict
             Dictionary mapping spot IDs to their channel labels
 
         rang : tuple
             Range of indices to build barcodes for in the current data object
 
         roundOmitNum : int
             Maximum hamming distance a barcode can be from its target in the codebook and still be uniquely
             identified (i.e.
number of error correction rounds in the experiment)
+
+        roundNum : int
+            Current round
 
         Returns
         -------
         tuple : First element is a list of the possible spot codes while the second element is a list of the
                 possible barcodes
         '''
 
-        # Build barcodes from neighbors 
+        # Build barcodes from neighbors
         # spotCodes are the ordered spot IDs of the spots making up each barcode while barcodes are the corresponding
         # channel labels, need spotCodes so each barcode can have a unique identifier
         allSpotCodes = []
@@ -174,7 +176,8 @@ def barcodeBuildFunc(data: pd.DataFrame,
 
         return (allSpotCodes, allBarcodes)
 
-    # Only keep spots that have enough neighbors to form a barcode (determined by the total number of round and the
+
+    # Only keep spots that have enough neighbors to form a barcode (determined by the total number of rounds and the
     # number of rounds that can be omitted from each code)
     passingSpots = {}
     roundNum = len(neighborDict)
@@ -204,18 +207,19 @@ def barcodeBuildFunc(data: pd.DataFrame,
     results = [barcodeBuildFunc.remote(dataID, channelDictID, (ranges[i], ranges[i+1]), roundOmitNum, roundNum) for i in range(len(ranges[:-1]))]
     rayResults = ray.get(results)
 
-    # Add possible barcodes and spot codes (same order) to spot dictionary (must chain results from different jobs
+    # Add possible barcodes and spot codes (same order) to spot table (must chain results from different jobs
     # together)
     roundData['spot_codes'] = list(chain(*[job[0] for job in rayResults]))
     roundData['barcodes'] = list(chain(*[job[1] for job in rayResults]))
 
     return roundData
 
-def decoder(roundData: dict,
+
+def decoder(roundData: pd.DataFrame,
             codebook: Codebook,
             roundOmitNum: int,
             currentRound: int,
-            numJobs: int):
+            numJobs: int) -> pd.DataFrame:
 
     '''
     Function that takes spot tables with possible barcodes added and matches each to the codebook to identify any
@@ -223,21 +227,24 @@ def decoder(roundData: dict,
 
     Parameters
     ----------
-    barcodeTables : dict
-        Dictionary with modified spot tables containing all possible barcodes that can be made from each spot
+    roundData : pd.DataFrame
+        Modified spot table containing all possible barcodes that can be made from each spot for the current round
 
     codebook : Codebook
         starFISH Codebook object containing the barcode information for the experiment
 
     roundOmitNum : int
         Number of rounds that can be dropped from each barcode
+
+    currentRound : int
+        Current round for which spots are being decoded
 
     numJobs : int
         Number of CPU threads to use in parallel
 
     Returns
     -------
-    dict : barcodeTables dictionary with added columns with information on decodable barcodes
+    pd.DataFrame : Modified spot table with added columns with information on decodable barcodes
     '''
 
     def generateRoundPermutations(size: int, roundOmitNum: int) -> list:
@@ -274,16 +281,13 @@ def decodeFunc(data: pd.DataFrame,
         Parameters
         ----------
         data : pd.DataFrame
-            Spot table from barcodeTables for the current round
+            Spot table for the current round
 
         roundPermutations : list
             List of logicals from generateRoundPermutations that details the rounds to use in decoding
 
         permutationCodes : dict
             Dictionary containing barcode information for each roundPermutation
-
-        rang : tuple
-            Range of indices to decode barcodes for in the current data object
 
         rnd : int
             Current round being decoded
@@ -318,8 +322,7 @@ def decodeFunc(data: pd.DataFrame,
             else:
                 omittedRound = -1
 
-            # Only try to decode barcodes for this spot if the current round is being omitted from the barcodes, we
-            # don't want to try to assign barcodes for spots from that round
+            # Only try to decode barcodes for this spot if the
current round is not the omitted round if rnd != omittedRound: # Modify spot codes and barcodes so that they match the current set of rounds being used for decoding if omittedRound != -1: @@ -349,10 +352,10 @@ def decodeFunc(data: pd.DataFrame, return (allTargets, allDecodedBarcodes, allDecodedSpotCodes, allRoundOmit) + # Create list of logical arrays corresponding to the round sets being used to decode roundPermutations = generateRoundPermutations(codebook.sizes[Axes.ROUND], roundOmitNum) - # Create dictionary where the keys are the different round sets that can be used for decoding and the values # are the modified codebooks corresponding to the rounds used permCodeDict = {} @@ -362,15 +365,12 @@ def decodeFunc(data: pd.DataFrame, currentCodes.values = np.ascontiguousarray(currentCodes.values) permCodeDict[currentRounds] = dict(zip([tuple(code) for code in currentCodes.data], currentCodes['target'].data)) - # Goes through each round in filtered_prsr and tries to decode each spot's barcodes - roundNum = len(codebook['r']) - # Put data table and permutations codes dictionary in ray storage permutationCodesID = ray.put(permCodeDict) - # Calculates index ranges to chunk data by + # Calculates index ranges to chunk data by and creates list of chunked data to loop through ranges = [0] for i in range(1, numJobs+1): ranges.append(int((len(roundData)/numJobs)*i)) @@ -392,33 +392,34 @@ def decodeFunc(data: pd.DataFrame, roundData = roundData.drop(['neighbors', 'spot_codes', 'barcodes'], axis=1) - # Remove rows that have no decoded barcodes and add -1 spacer back into partial barcodes/spot codes so we can - # easily tell which round each spot ID is from - keep = [] - allBarcodes = [] - allSpotCodes = [] + # Remove rows that have no decoded barcodes roundData = roundData[roundData['targets'].astype(bool)].reset_index(drop=True) - dataBarcodes = roundData['decoded_barcodes'] - dataSpotCodes = roundData['decoded_spot_codes'] - dataOmittedRounds = roundData['omitted_round'] - for i in range(len(roundData)): - barcodes = [list(code) for code in dataBarcodes[i]] - spotCodes = [list(code) for code in dataSpotCodes[i]] - omittedRounds = dataOmittedRounds[i] - if omittedRounds[0] != -1: + + # Add -1 spacer back into partial barcodes/spot codes so we can easily tell which round each spot ID is from + if roundOmitNum > 0: + allBarcodes = [] + allSpotCodes = [] + dataBarcodes = roundData['decoded_barcodes'] + dataSpotCodes = roundData['decoded_spot_codes'] + dataOmittedRounds = roundData['omitted_round'] + for i in range(len(roundData)): + barcodes = [list(code) for code in dataBarcodes[i]] + spotCodes = [list(code) for code in dataSpotCodes[i]] + omittedRounds = dataOmittedRounds[i] barcodes = [barcodes[j][:omittedRounds[j]] + [-1] + barcodes[j][omittedRounds[j]:] for j in range(len(barcodes))] spotCodes = [spotCodes[j][:omittedRounds[j]] + [-1] + spotCodes[j][omittedRounds[j]:] for j in range(len(barcodes))] - allBarcodes.append(barcodes) - allSpotCodes.append(spotCodes) - roundData['decoded_barcodes'] = allBarcodes - roundData['decoded_spot_codes'] = allSpotCodes + allBarcodes.append(barcodes) + allSpotCodes.append(spotCodes) + roundData['decoded_barcodes'] = allBarcodes + roundData['decoded_spot_codes'] = allSpotCodes return roundData -def distanceFilter(roundData: dict, + +def distanceFilter(roundData: pd.DataFrame, spotCoords: dict, currentRound: int, - numJobs: int) -> tuple: + numJobs: int) -> pd.DataFrame: ''' Function that chooses between the best barcode for each spot from the set of decodable 
barcodes. Does this by choosing the barcode with the least spatial variance among the spots that make it up. If there is a tie, the spot @@ -426,20 +427,21 @@ def distanceFilter(roundData: dict, Parameters ---------- - decodedTables : dict - Dictionary containing modified spot tables with decoded barcode information + roundData : pd.DataFrame + Modified spot table containing info on decodable barcodes for the spots in the current round + + spotCoords : dict + Dictionary containing spatial coordinates of spots in each round indexed by their IDs - spotTables : dict - Original spot tables dictionary without any added columns + currentRound : int + Current round number to calculate distances for numJobs : int Number of CPU threads to use in parallel Returns ------- - tuple : First element is a modified version of decodedTables with added columns to tables with info on the - "best" barcode found for each spot and the second element is a dictionary containing spatial locations - for spots by their IDs in the original spotTables object + pd.DataFrame : Modified spot table with added columns to with info on the "best" barcode found for each spot ''' @ray.remote @@ -485,7 +487,6 @@ def distanceFunc(subSpotCodes: list, spotCoords: dict) -> list: # Calculate the spatial variance for each decodable barcode for each spot in each round allSpotCodes = roundData['decoded_spot_codes'] - # Put spotCoords dictionary into ray memory spotCoordsID = ray.put(spotCoords) @@ -503,8 +504,6 @@ def distanceFunc(subSpotCodes: list, spotCoords: dict) -> list: # Add distances to decodedTables as new column roundData['distance'] = list(chain(*[job for job in rayResults])) - - # Pick minimum distance barcode(s) for each spot bestSpotCodes = [] bestBarcodes = [] @@ -551,7 +550,6 @@ def distanceFunc(subSpotCodes: list, spotCoords: dict) -> list: # Drop old columns roundData = roundData.drop(['targets', 'decoded_barcodes', 'decoded_spot_codes', 'omitted_round'], axis=1) - # Only keep barcodes with only one minimum distance keep = [] barcodes = roundData['best_barcodes'] @@ -658,6 +656,7 @@ def cleanup(bestPerSpotTables: dict, return finalCodes + def removeUsedSpots(finalCodes: pd.DataFrame, neighborDict: dict) -> dict: ''' Remove spots found to be in barcodes for the current round omission number so they are not used for the next From 5d3e8d02ced2f6c16c2fcff5e3ef22f221ed04d2 Mon Sep 17 00:00:00 2001 From: nickeener Date: Tue, 12 Oct 2021 15:25:23 -0700 Subject: [PATCH 03/30] Changed filter_tally to rounds_used in decoded_intensity_table --- starfish/core/intensity_table/decoded_intensity_table.py | 6 +++--- starfish/core/spots/DecodeSpots/check_all_decoder.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/starfish/core/intensity_table/decoded_intensity_table.py b/starfish/core/intensity_table/decoded_intensity_table.py index 0d17097fe..1f4b12a46 100644 --- a/starfish/core/intensity_table/decoded_intensity_table.py +++ b/starfish/core/intensity_table/decoded_intensity_table.py @@ -60,7 +60,7 @@ def from_intensity_table( targets: Tuple[str, np.ndarray], distances: Optional[Tuple[str, np.ndarray]] = None, passes_threshold: Optional[Tuple[str, np.ndarray]] = None, - filter_tally: Optional[Tuple[str, np.ndarray]] = None): + rounds_used: Optional[Tuple[str, np.ndarray]] = None): """ Assign target values to intensities. 
        Parameters
@@ -74,7 +74,7 @@ def from_intensity_table(
         passes_threshold : Optional[Tuple[str, np.ndarray]]
             Corresponding array of boolean values indicating if each intensity passed given
             thresholds.
-        filter_tally: Optional[Tuple[str, np.ndarray]]
+        rounds_used: Optional[Tuple[str, np.ndarray]]
             Corresponding array of integers indicating the number of rounds this
             decoded intensity was found in
         Returns
         -------
@@ -89,7 +89,7 @@ def from_intensity_table(
             intensities[Features.DISTANCE] = distances
         if passes_threshold:
             intensities[Features.PASSES_THRESHOLDS] = passes_threshold
         if filter_tally:
-            intensities['filter_tally'] = filter_tally
+            intensities['rounds_used'] = rounds_used
         return intensities

diff --git a/starfish/core/spots/DecodeSpots/check_all_decoder.py b/starfish/core/spots/DecodeSpots/check_all_decoder.py
index 58dc5fa3e..40f2522c1 100644
--- a/starfish/core/spots/DecodeSpots/check_all_decoder.py
+++ b/starfish/core/spots/DecodeSpots/check_all_decoder.py
@@ -203,7 +203,7 @@ def run(self, spots: SpotFindingResults, n_processes: Optional[int]=1, *args) ->
             targets=(Features.AXIS, allCodes['best_targets'].astype('U')),
             distances=(Features.AXIS, allCodes["best_distances"]),
             passes_threshold=(Features.AXIS, np.full(len(allCodes), True)),
-            filter_tally=(Features.AXIS, allCodes['rounds_used']))
+            rounds_used=(Features.AXIS, allCodes['rounds_used']))

         return result

From b21b57f725cd74d2852e3cf89c8155d342ba902a Mon Sep 17 00:00:00 2001
From: nickeener
Date: Wed, 13 Oct 2021 13:21:58 -0700
Subject: [PATCH 04/30] Fixed validation errors

---
 .../decoded_intensity_table.py                |   4 +-
 .../spots/DecodeSpots/check_all_decoder.py    | 181 ++++----
 .../core/spots/DecodeSpots/check_all_funcs.py | 407 +++++++++---------
 3 files changed, 315 insertions(+), 277 deletions(-)

diff --git a/starfish/core/intensity_table/decoded_intensity_table.py b/starfish/core/intensity_table/decoded_intensity_table.py
index 1f4b12a46..73ee10caa 100644
--- a/starfish/core/intensity_table/decoded_intensity_table.py
+++ b/starfish/core/intensity_table/decoded_intensity_table.py
@@ -75,7 +75,7 @@ def from_intensity_table(
             Corresponding array of boolean values indicating if each intensity passed given
             thresholds.
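For orientation, a minimal sketch of how the renamed field is attached and read back. Here `intensities` and `all_codes` are placeholder names for an IntensityTable and the decoder's per-barcode summary table, so this is illustrative only, not part of the patch:

    import numpy as np
    from starfish.core.intensity_table.decoded_intensity_table import DecodedIntensityTable
    from starfish.types import Features

    # attach per-feature metadata as (dimension, values) tuples
    decoded = DecodedIntensityTable.from_intensity_table(
        intensities,
        targets=(Features.AXIS, all_codes['best_targets'].astype('U')),
        rounds_used=(Features.AXIS, all_codes['rounds_used']))

    # features decoded with a dropped round show rounds_used < total rounds
    print(decoded['rounds_used'].data)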
         rounds_used: Optional[Tuple[str, np.ndarray]]
-            Corresponding array of integers indicating the number of rounds this 
+            Corresponding array of integers indicating the number of rounds this
             decoded intensity was found in
         Returns
         -------
@@ -88,7 +88,7 @@ def from_intensity_table(
             intensities[Features.DISTANCE] = distances
         if passes_threshold:
             intensities[Features.PASSES_THRESHOLDS] = passes_threshold
-        if filter_tally:
+        if rounds_used:
             intensities['rounds_used'] = rounds_used
         return intensities

diff --git a/starfish/core/spots/DecodeSpots/check_all_decoder.py b/starfish/core/spots/DecodeSpots/check_all_decoder.py
index 58dc5fa3e..40f2522c1 100644
--- a/starfish/core/spots/DecodeSpots/check_all_decoder.py
+++ b/starfish/core/spots/DecodeSpots/check_all_decoder.py
@@ -1,4 +1,4 @@
-from typing import Callable, Optional
+from typing import Mapping, Hashable, Tuple, Any
 import ray
 import pandas as pd
 import numpy as np
@@ -14,68 +14,80 @@ from ._base import DecodeSpotsAlgorithm
 from ._base import DecodeSpotsAlgorithm

-from .check_all_funcs import findNeighbors, buildBarcodes, decoder, distanceFilter, cleanup, removeUsedSpots
+from .check_all_funcs import findNeighbors, buildBarcodes, decoder, distanceFilter, cleanup, \
+    removeUsedSpots
 from .util import _merge_spots_by_round


 class CheckAll(DecodeSpotsAlgorithm):
     """
-    Decode spots by generating all possible combinations of spots to form barcodes given a radius distance that
-    spots must be from each other in order to form a barcode. Then chooses the best set of nonoverlapping spot
-    combinations by choosing the ones with the least spatial variance of their spot coordinates and are also found
-    to be best for multiple spots in the barcode (see algorithm below). Allows for error correction rounds.
+    Decode spots by generating all possible combinations of spots to form barcodes given a radius
+    distance that spots must be from each other in order to form a barcode. Then chooses the best
+    set of nonoverlapping spot combinations by choosing the ones with the least spatial variance
+    of their spot coordinates and are also found to be best for multiple spots in the barcode
+    (see algorithm below). Allows for error correction rounds. (see input parameters below)

-    1. For each spot in each round, find all neighbors in other rounds that are within the search radius
-    2. For each spot in each round, build all possible full length barcodes based on the channel labels of the spot's
-    neighbors and itself
-    3. Drop barcodes that don't have a matching target in the codebook
-    4. Choose the "best" barcode of each spot's possible target matching barcodes by calculating the sum of variances
-    for each of the spatial coordinates of the spots that make up each barcode and choosing the minimum distance barcode
-    (if there is a tie, they are all dropped as ambiguous). Each spot is assigned a "best" barcode in this way.
-    5. Only keep barcodes/targets that were found as "best" in a certain number of the rounds (determined by filter_rounds
-    parameter)
-    6. If a specific spot is used in more than one of the remaining barcodes, the barcode with the higher spatial variance
-    between it's spots is dropped (ensures each spot is only used once)
-    (End here if number of error_rounds = 0)
-    7. Remove all spots used in decoded targets that passed the previous filtering steps from the original set of spots
-    8. Rerun steps 2-5 for barcodes that use less than the full set of rounds for codebook matching (how many rounds can be
-    dropped determined by error_rounds parameter)
+    1. For each spot in each round, find all neighbors in other rounds that are within the search
+    radius
+    2. For each spot in each round, build all possible full-length barcodes based on the channel
+    labels of the spot's neighbors and itself
+    3. Drop barcodes that don't have a matching target in the codebook
+    4. Choose the "best" barcode of each spot's possible target-matching barcodes by calculating
+    the sum of variances for each of the spatial coordinates of the spots that make up each barcode
+    and choosing the minimum distance barcode (if there is a tie, they are all dropped as
+    ambiguous). Each spot is assigned a "best" barcode in this way.
+    5. Only keep barcodes/targets that were found as "best" in a certain number of the rounds
+    (determined by the filter_rounds parameter)
+    6. If a specific spot is used in more than one of the remaining barcodes, the barcode with the
+    higher spatial variance between its spots is dropped (ensures each spot is only used once)
+    (End here if number of error_rounds = 0)
+    7. Remove all spots used in decoded targets that passed the previous filtering steps from the
+    original set of spots
+    8. Rerun steps 2-5 for barcodes that use fewer than the full set of rounds for codebook
+    matching (how many rounds can be dropped is determined by the error_rounds parameter)

     Parameters
     ----------
     codebook : Codebook
         Contains codes to decode IntensityTable
-    search_radius : Optional[float]
+    search_radius : float
         Number of pixels over which to search for spots in other rounds and channels.
-    filterRounds : Optional[int]
-        Number of rounds that a barcode must be identified in to pass filters (higher = more stringent filtering),
-        default = #rounds - 1 or #rounds - error_rounds if error_rounds > 0
-    error_rounds : Optional[int]
-        Maximum hamming distance a barcode can be from it's target in the codebook and still be uniquely identified
-        (i.e. number of error correction rounds in each the experiment)
+    filterRounds : int
+        Number of rounds that a barcode must be identified in to pass filters (higher = more
+        stringent filtering), default = #rounds - 1 or #rounds - error_rounds if error_rounds > 0
+    error_rounds : int
+        Maximum hamming distance a barcode can be from its target in the codebook and still be
+        uniquely identified (i.e. 
number of error correction rounds in each the experiment) """ def __init__( self, codebook: Codebook, - filter_rounds: Optional[int]=None, - search_radius: Optional[float]=3, - error_rounds: Optional[int]=0): + filter_rounds: int=None, + search_radius: float=3, + error_rounds: int=0): self.codebook = codebook self.filterRounds = filter_rounds self.searchRadius = search_radius self.errorRounds = error_rounds - def run(self, spots: SpotFindingResults, n_processes: Optional[int]=1, *args) -> DecodedIntensityTable: + def run(self, + spots: SpotFindingResults, + n_processes: int=1, + *args) -> DecodedIntensityTable: """ - Decode spots by finding the set of nonoverlapping barcodes that have the minimum spatial variance within each barcode + Decode spots by finding the set of nonoverlapping barcodes that have the minimum spatial + variance within each barcode Parameters ---------- spots: SpotFindingResults A Dict of tile indices and their corresponding measured spots + n_processes: int + Number of threads to run decoder in parallel with + Returns ------- DecodedIntensityTable : @@ -83,88 +95,95 @@ def run(self, spots: SpotFindingResults, n_processes: Optional[int]=1, *args) -> """ - # Rename n_processes (trying to stay consistent between starFISH's _ variables and my camel case ones) + # Rename n_processes (trying to stay consistent between starFISH's _ variables and my + # camel case ones) numJobs = n_processes - # If using an search radius exactly equal to a possible distance between two pixels (ex: 1), some - # distances will be calculated as slightly less than their exact distance (either due to rounding or - # precision errors) so search radius needs to be slightly increased to ensure this doesn't happen + # If using an search radius exactly equal to a possible distance between two pixels + # (ex: 1), some distances will be calculated as slightly less than their exact distance + # (either due to rounding or precision errors) so search radius needs to be slightly + # increased to ensure this doesn't happen self.searchRadius += 0.001 # Initialize ray for multi_processing ray.init(num_cpus=numJobs) - - # Create dictionary where keys are round labels and the values are pandas dataframes containing information on - # the spots found in that round + + # Create dictionary where keys are round labels and the values are pandas dataframes + # containing information on the spots found in that round spotTables = _merge_spots_by_round(spots) - - # If user did not specify the filterRounds variable (it will have default value None), change it to either one less - # than the number of rounds if roundOmitNum is 0 or the number of rounds minus the roundOmitNum if roundOmitNum > 0 - if self.filterRounds == None: - if self.roundOmitNum == 0: + + # If user did not specify the filterRounds variable (it will have default value None), + # change it to either one less than the number of rounds if errorRounds is 0 or the + # number of rounds minus the errorRounds if errorRounds > 0 + if self.filterRounds is None: + if self.errorRounds == 0: self.filterRounds = len(spotTables) - 1 else: self.filterRounds = len(spotTables) - self.errorRounds - # Create dictionary of neighbors (within the search radius) in other rounds for each spot neighborDict = findNeighbors(spotTables, self.searchRadius) - - # Create dictionaries with mapping from spot id (row index) in spotTables to channel number and one with spot - # coordinates for fast access + + # Create dictionaries with mapping from spot id (row index) in spotTables to 
channel + # number and one with spot coordinates for fast access channelDict = {} spotCoords = {} for r in [*spotTables]: channelDict[r] = spotTables[r]['c'].to_dict() - spotCoords[r] = spotTables[r][['z','y','x']].T.to_dict() - + spotCoords[r] = spotTables[r][['z', 'y', 'x']].T.to_dict() + # Set list of round omission numbers to loop through - roundOmits = range(self.errorRounds+1) - + roundOmits = range(self.errorRounds + 1) + # Decode for each round omission number, store results in allCodes table allCodes = pd.DataFrame() for currentRoundOmitNum in roundOmits: - # Chooses best barcode for all spots in each round sequentially (possible barcode space can become quite large which - # can increase memory needs so I do it this way so we only need to store all potential barcodes that originate from - # one round at a time) + # Chooses best barcode for all spots in each round sequentially (possible barcode + # space can become quite large which can increase memory needs so I do it this way so + # we only need to store all potential barcodes that originate from one round at a + # time) decodedTables = {} for r in range(len(spotTables)): roundData = deepcopy(spotTables[r]) - - # Create dictionary of dataframes (based on spotTables data) that contains additional columns for each spot - # containing all the possible barcodes that could be constructed from the neighbors of that spot - roundData = buildBarcodes(roundData, neighborDict, currentRoundOmitNum, channelDict, r, numJobs) - - # Match possible barcodes to codebook and add new columns with info about barcodes that had a codebook match + + # Create dictionary of dataframes (based on spotTables data) that contains + # additional columns for each spot containing all the possible barcodes that + # could be constructed from the neighbors of that spot + roundData = buildBarcodes(roundData, neighborDict, currentRoundOmitNum, + channelDict, r, numJobs) + + # Match possible barcodes to codebook and add new columns with info about barcodes + # that had a codebook match roundData = decoder(roundData, self.codebook, currentRoundOmitNum, r, numJobs) - # Choose most likely barcode for each spot in each round by find the possible decodable barcode with the least - # spatial variance between the spots that made up the barcode + # Choose most likely barcode for each spot in each round by find the possible + # decodable barcode with the least spatial variance between the spots that made up + # the barcode roundData = distanceFilter(roundData, spotCoords, r, numJobs) - + # Assign to DecodedTables dictionary decodedTables[r] = roundData - # Turn spot table dictionary into single table, filter barcodes by round frequency, add additional information, - # and choose between barcodes that have overlapping spots + # Turn spot table dictionary into single table, filter barcodes by round frequency, add + # additional information, and choose between barcodes that have overlapping spots finalCodes = cleanup(decodedTables, spotCoords, self.filterRounds) - - # If this is not the last round omission number to run, remove spots that have just been found to be in - # passing barcodes from neighborDict so they are not used for the next round omission number + + # If this is not the last round omission number to run, remove spots that have just + # been found to be in passing barcodes from neighborDict so they are not used for the + # next round omission number if currentRoundOmitNum != roundOmits[-1]: neighborDict = removeUsedSpots(finalCodes, neighborDict) - + # Append found 
codes to allCodes table allCodes = allCodes.append(finalCodes).reset_index(drop=True) - + # Shutdown ray ray.shutdown() - # Create and fill in intensity table - channels=spots.ch_labels - rounds=spots.round_labels + channels = spots.ch_labels + rounds = spots.round_labels # create empty IntensityTable filled with np.nan data = np.full((len(allCodes), len(channels), len(rounds)), fill_value=np.nan) @@ -180,7 +199,7 @@ def run(self, spots: SpotFindingResults, n_processes: Optional[int]=1, *args) -> Axes.ROUND.value: (Axes.ROUND.value, rounds), Axes.CH.value: (Axes.CH.value, channels) } - intensity_table = IntensityTable(data=data, dims=dims, coords=coords) + int_table = IntensityTable(data=data, dims=dims, coords=coords) # Fill in data values table_codes = [] @@ -190,20 +209,20 @@ def run(self, spots: SpotFindingResults, n_processes: Optional[int]=1, *args) -> # If a round is not used, row will be all zeros code.append(np.asarray([0 if j != ch else 1 for j in range(len(channels))])) table_codes.append(np.asarray(code).T) - intensity_table.values = np.asarray(table_codes) - intensity_table = transfer_physical_coords_to_intensity_table(intensity_table=intensity_table, spots=spots) - intensities = intensity_table.transpose('features', 'r', 'c') + int_table.values = np.asarray(table_codes) + int_table = transfer_physical_coords_to_intensity_table(intensity_table=int_table, + spots=spots) + intensities = int_table.transpose('features', 'r', 'c') # Validate results are correct shape self.codebook._validate_decode_intensity_input_matches_codebook_shape(intensities) # Create DecodedIntensityTable - result=DecodedIntensityTable.from_intensity_table( + result = DecodedIntensityTable.from_intensity_table( intensities, targets=(Features.AXIS, allCodes['best_targets'].astype('U')), distances=(Features.AXIS, allCodes["best_distances"]), passes_threshold=(Features.AXIS, np.full(len(allCodes), True)), rounds_used=(Features.AXIS, allCodes['rounds_used'])) - return result diff --git a/starfish/core/spots/DecodeSpots/check_all_funcs.py b/starfish/core/spots/DecodeSpots/check_all_funcs.py index 2485a26e3..d3b14c7b8 100644 --- a/starfish/core/spots/DecodeSpots/check_all_funcs.py +++ b/starfish/core/spots/DecodeSpots/check_all_funcs.py @@ -1,37 +1,29 @@ -# General modules -import time # TODO: delete for final version -import pickle # TODO: delete for final version from collections import Counter from scipy.spatial import cKDTree from copy import deepcopy -from itertools import product, chain, permutations, combinations -import math +from itertools import product, chain, permutations from collections import defaultdict -from typing import Any, Hashable, Mapping, Tuple import ray import numpy as np import pandas as pd -import xarray as xr import warnings -warnings.filterwarnings('ignore') - -# starFISH stuff -from starfish.types import Axes, Coordinates, CoordinateValue, Features +from starfish.types import Axes from starfish.core.codebook.codebook import Codebook - +warnings.filterwarnings('ignore') def findNeighbors(spotTables: dict, searchRadius: float) -> dict: - + ''' - Function that takes spatial information from the spot tables from each round and creates a dictionary that contains - all the neighbors for each spot in other rounds that are within the search radius. - + Function that takes spatial information from the spot tables from each round and creates a + dictionary that contains all the neighbors for each spot in other rounds that are within the + search radius. 
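The neighbor search described above is built on scipy's cKDTree. A self-contained sketch of the core lookup, with toy (z, y, x) coordinates and an arbitrarily chosen radius:

    import numpy as np
    from scipy.spatial import cKDTree

    round1 = np.array([[0., 10., 10.], [0., 50., 50.]])  # spots in one round
    round2 = np.array([[0., 11., 10.], [0., 80., 80.]])  # spots in another round
    tree = cKDTree(round1)

    # for each round2 spot, the indices of round1 spots within the search radius
    neighbors = tree.query_ball_point(round2, r=2.0)
    print(neighbors)  # first round2 spot matches round1 spot 0; second has no match

Querying one tree with the other round's full coordinate array is what lets the neighbor dictionary be filled in symmetrically for each pair of rounds.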
+ Parameters ---------- spotTables : dict - Dictionary with round labels as keys and pandas dataframes containing spot information for its key round - as values (result of _merge_spots_by_round function) - + Dictionary with round labels as keys and pandas dataframes containing spot information + for its key round as values (result of _merge_spots_by_round function) + searchRadius : float Distance that spots can be from each other and still form a barcode @@ -41,7 +33,7 @@ def findNeighbors(spotTables: dict, searchRadius: float) -> dict: {round: { spotID in round: { neighborRound: - [list of spotIDs in neighborRound within searchRadius of spotID in round] + [list of spotIDs in neighborRound within searchRadius of spotID in round] } } } @@ -52,22 +44,22 @@ def findNeighbors(spotTables: dict, searchRadius: float) -> dict: for r in spotTables: neighborDict[r] = {i: defaultdict(list, {r: [i]}) for i in range(len(spotTables[r]))} - # For each pairing of rounds, find all mutual neighbors within the search radius for each spot and assigns them - # in the neighborDict dictionary - # Number assigned each spot in neighborDict is the index of it's original location in spotTables - # and is used to track each spot uniquely throughout - for i,r1 in enumerate(range((len(spotTables)))): + # For each pairing of rounds, find all mutual neighbors within the search radius for each spot + # and assigns them in the neighborDict dictionary + # Number assigned each spot in neighborDict is the index of it's original location in + # spotTables and is used to track each spot uniquely throughout + for i, r1 in enumerate(range((len(spotTables)))): tree = cKDTree(spotTables[r1][['z', 'y', 'x']]) - for r2 in list(range((len(spotTables))))[i+1:]: + for r2 in list(range((len(spotTables))))[i + 1:]: allNeighbors = tree.query_ball_point(spotTables[r2][['z', 'y', 'x']], searchRadius) - for j,neighbors in enumerate(allNeighbors): + for j, neighbors in enumerate(allNeighbors): if neighbors != []: for neighbor in neighbors: neighborDict[r1][neighbor][r2].append(j) neighborDict[r2][j][r1].append(neighbor) return neighborDict - + def buildBarcodes(roundData: pd.DataFrame, neighborDict: dict, @@ -75,39 +67,41 @@ def buildBarcodes(roundData: pd.DataFrame, channelDict: dict, currentRound: int, numJobs: int) -> pd.DataFrame: - + ''' - Function that adds to the current rounds spot table all the possible barcodes that could be formed using the neighbors - of each spot, spots without enough neighbors to form a barcode are dropped. - + Function that adds to the current rounds spot table all the possible barcodes that could be + formed using the neighbors of each spot, spots without enough neighbors to form a barcode + # are dropped. + Parameters ---------- roundData : dict Spot data table for the current round - + neighborDict : dict - Dictionary that contains all the neighbors for each spot in other rounds that are within the search radius - + Dictionary that contains all the neighbors for each spot in other rounds that are + within the search radius + roundOmitNum : int - Maximum hamming distance a barcode can be from it's target in the codebook and still be uniquely identified - (i.e. number of error correction rounds in each the experiment + Maximum hamming distance a barcode can be from it's target in the codebook and still + be uniquely identified (i.e. 
number of error correction rounds in each the experiment channelDict : dict Dictionary with mappings between spot IDs and their channel labels currentRound : int Current round to build barcodes for (same round that roundData is from) - + numJobs : int Number of CPU threads to use in parallel - + Returns ------- pd.DataFrame : Copy of roundData with additional columns which list all possible barcodes that could be made from each spot's neighbors - + ''' - + @ray.remote def barcodeBuildFunc(data: pd.DataFrame, channelDict: dict, @@ -116,43 +110,45 @@ def barcodeBuildFunc(data: pd.DataFrame, roundNum: int) -> tuple: ''' Subfunction to buildBarcodes that allows it to run in parallel chunks using ray - + Parameters ---------- data : pd.DataFrame Spot table for the current round - + channelDict : dict Dictionary mapping spot IDs to their channels labels - + rang : tuple Range of indices to build barcodes for in the current data object - + roundOmitNum : int - Maximum hamming distance a barcode can be from it's target in the codebook and still be uniquely - identified (i.e. number of error correction rounds in each the experiment) + Maximum hamming distance a barcode can be from it's target in the codebook and + still be uniquely identified (i.e. number of error correction rounds in each the + experiment) roundNum : int Current round - + Returns ------- - tuple : First element is a list of the possible spot codes while the second element is a list of the - possible barcodes + tuple : First element is a list of the possible spot codes while the second element is + a list of the possible barcodes ''' # Build barcodes from neighbors - # spotCodes are the ordered spot IDs of the spots making up each barcode while barcodes are the corresponding - # channel labels, need spotCodes so each barcode can have a unique identifier + # spotCodes are the ordered spot IDs of the spots making up each barcode while barcodes are + # the corresponding channel labels, need spotCodes so each barcode can have a unique + # identifier allSpotCodes = [] allBarcodes = [] allNeighbors = list(data['neighbors'])[rang[0]: rang[1]] for i in range(len(allNeighbors)): neighbors = deepcopy(allNeighbors[i]) - neighborLists = [] + neighborLists = [] for rnd in range(roundNum): - # Adds a -1 to each round of the neighbors dictionary (allows barcodes with dropped rounds to - # be created) + # Adds a -1 to each round of the neighbors dictionary (allows barcodes with dropped + # rounds to be created) if roundOmitNum > 0: neighbors[rnd].append(-1) neighborLists.append(neighbors[rnd]) @@ -176,23 +172,20 @@ def barcodeBuildFunc(data: pd.DataFrame, return (allSpotCodes, allBarcodes) - - # Only keep spots that have enough neighbors to form a barcode (determined by the total number of rounds and the - # number of rounds that can be omitted from each code) + # Only keep spots that have enough neighbors to form a barcode (determined by the total number + # of rounds and the number of rounds that can be omitted from each code) passingSpots = {} roundNum = len(neighborDict) for key in neighborDict[currentRound]: - if len(neighborDict[currentRound][key]) >= roundNum-roundOmitNum: + if len(neighborDict[currentRound][key]) >= roundNum - roundOmitNum: passingSpots[key] = neighborDict[currentRound][key] passed = list(passingSpots.keys()) roundData = roundData.iloc[passed] roundData['neighbors'] = [passingSpots[i] for i in roundData.index] roundData = roundData.reset_index(drop=True) - - # Find all possible barcodes for the spots in each round by 
splitting each round's spots into numJob chunks and - # constructing each chunks barcodes in parallel - + # Find all possible barcodes for the spots in each round by splitting each round's spots into + # numJob chunks and constructing each chunks barcodes in parallel # Save the current round's data table and the channelDict to ray memory dataID = ray.put(roundData) @@ -200,106 +193,114 @@ def barcodeBuildFunc(data: pd.DataFrame, # Calculates index ranges to chunk data by ranges = [0] - for i in range(1, numJobs+1): - ranges.append(int((len(roundData)/numJobs)*i)) + for i in range(1, numJobs + 1): + ranges.append(int((len(roundData) / numJobs) * i)) # Run in parallel - results = [barcodeBuildFunc.remote(dataID, channelDictID, (ranges[i], ranges[i+1]), roundOmitNum, roundNum) for i in range(len(ranges[:-1]))] + results = [barcodeBuildFunc.remote(dataID, channelDictID, (ranges[i], ranges[i + 1]), + roundOmitNum, roundNum) + for i in range(len(ranges[:-1]))] rayResults = ray.get(results) - # Add possible barcodes and spot codes (same order) to spot table (must chain results rom different jobs - # together) + # Add possible barcodes and spot codes (same order) to spot table (must chain results from + # different jobs together) roundData['spot_codes'] = list(chain(*[job[0] for job in rayResults])) roundData['barcodes'] = list(chain(*[job[1] for job in rayResults])) - - return roundData + return roundData def decoder(roundData: pd.DataFrame, codebook: Codebook, roundOmitNum: int, currentRound: int, numJobs: int) -> pd.DataFrane: - + ''' - Function that takes spots tables with possible barcodes added and matches each to the codebook to identify any - matches. Matches are added to the spot tables and spots without any matches are dropped - + Function that takes spots tables with possible barcodes added and matches each to the codebook + to identify any matches. 
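The round-selection logic that generateRoundPermutations implements below can be reproduced in a few lines; round_masks is a stand-in name for this sketch:

    from itertools import permutations

    def round_masks(n_rounds: int, n_omit: int) -> list:
        # every distinct ordering of n_omit Falses among (n_rounds - n_omit) Trues
        if n_omit == 0:
            return [tuple([True] * n_rounds)]
        return sorted(set(permutations([False] * n_omit + [True] * (n_rounds - n_omit))))

    print(round_masks(4, 1))
    # [(False, True, True, True), (True, False, True, True), ...] one mask per dropped round

Each mask selects the subset of rounds used to slice the codebook, so decoding with one error-correction round amounts to matching against each of these reduced codebooks in turn.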
Matches are added to the spot tables and spots without any matches are + dropped + Parameters ---------- roundData : pd.DataFrane - Modified spot table containing all possible barcodes that can be made from each spot for the current round - + Modified spot table containing all possible barcodes that can be made from each spot + for the current round + codebook : Codebook starFISH Codebook object containg the barcode information for the experiment - + roundOmitNum : int Number of rounds that can be dropped from each barcode currentRound : int Current round being for which spots are being decoded - + numJobs : int Number of CPU threads to use in parallel - + Returns ------- - pd.DataFrane : Modified spot table with added columns with information on decodable barcodes + pd.DataFrane : Modified spot table with added columns with information on decodable + barcodes ''' def generateRoundPermutations(size: int, roundOmitNum: int) -> list: ''' - Creates list of lists of logicals detailing the rounds to be used for decoding based on the current roundOmitNum - + Creates list of lists of logicals detailing the rounds to be used for decoding based on the + current roundOmitNum + Parameters ---------- size : int Number of rounds in experiment - + roundOmitNum: int Number of rounds that can be dropped from each barcode - + Returns ------- - list : list of lists of logicals detailing the rounds to be used for decoding based on the current roundOmitNum + list : list of lists of logicals detailing the rounds to be used for decoding based on + the current roundOmitNum ''' if roundOmitNum == 0: - return [tuple([True]*size)] + return [tuple([True] * size)] else: - return sorted(set(list(permutations([False]*roundOmitNum + [True]*(size-roundOmitNum))))) - + return sorted(set(list(permutations([*([False] * roundOmitNum), + *([True] * (size - roundOmitNum))])))) @ray.remote def decodeFunc(data: pd.DataFrame, roundPermutations: list, permutationCodes: dict, rnd: int) -> tuple: - + ''' Subfunction for decoder that allows it to run in parallel chunks using ray - + Parameters ---------- data : pd.DataFrame Spot table for the current round - + roundPermutations : list - List of logicals from generateRoundPermutations that details the rounds to use in decoding - + List of logicals from generateRoundPermutations that details the rounds to use in + decoding + permutationCodes : dict Dictionary containing barcode information for each roundPermutation - + rnd : int Current round being decoded - + Returns ------- - tuple : First element is a list of all decoded targets, second element is a list of all decoded barcodes, - third element is a list of all decoded spot codes, and the fourth element is a list of rounds - that were omitted for each decoded barcode + tuple : First element is a list of all decoded targets, second element is a list of all + decoded barcodes,third element is a list of all decoded spot codes, and the + fourth element is a list of rounds that were omitted for each decoded barcode ''' - # Goes through all possible decodings of each spot (ensures each spot is only looked up once) + # Goes through all possible decodings of each spot (ensures each spot is only looked up + # once) allTargets = [] allDecodedBarcodes = [] allDecodedSpotCodes = [] @@ -313,37 +314,45 @@ def decodeFunc(data: pd.DataFrame, roundOmit = [] fullBarcodes = allBarcodes[i] fullSpotCodes = allSpotCodes[i] - + for currentRounds in roundPermutations: - # Set omittedRound to the round being dropped, if no round is dropped omittedRound 
becomes -1 + # Set omittedRound to the round being dropped, if no round is dropped omittedRound + # becomes -1 if 0 in currentRounds: omittedRound = np.argwhere([not cr for cr in currentRounds])[0][0] else: omittedRound = -1 - # Only try to decode barcodes for this spot if the current round is not the omitted round + # Only try to decode barcodes for this spot if the current round is not the omitted + # round if rnd != omittedRound: - # Modify spot codes and barcodes so that they match the current set of rounds being used for decoding + # Modify spot codes and barcodes so that they match the current set of rounds + # being used for decoding if omittedRound != -1: - spotCodes = [code for code in np.asarray([np.asarray(spotCode)[list(currentRounds)] for spotCode in fullSpotCodes]) if -1 not in code] - barcodes = [code for code in np.asarray([np.asarray(barcode)[list(currentRounds)] for barcode in fullBarcodes]) if -1 not in code] + spotCodes = [code for code in + np.asarray([np.asarray(spotCode)[list(currentRounds)] + for spotCode in fullSpotCodes]) if -1 not in code] + barcodes = [code for code in + np.asarray([np.asarray(barcode)[list(currentRounds)] + for barcode in fullBarcodes]) if -1 not in code] else: spotCodes = fullSpotCodes barcodes = fullBarcodes # If all barcodes omit a round other than omittedRound, barcodes will be empty if len(barcodes) > 0: # Tries to find a match to each possible barcode from the spot - for j,barcode in enumerate(barcodes): + for j, barcode in enumerate(barcodes): try: - # Try to assign target by using barcode as key in permutationsCodes dictionary for - # current set of rounds. If there is no barcode match, it will error and go to the except - # and if it succeeds it will add the data to the other lists for this barcode + # Try to assign target by using barcode as key in permutationsCodes + # dictionary for current set of rounds. 
If there is no barcode + # match, it will error and go to the except and if it succeeds it + # will add the data to the other lists for this barcode targets.append(permutationCodes[currentRounds][tuple(barcode)]) decodedBarcodes.append(barcode) decodedSpotCodes.append(list(spotCodes[j])) roundOmit.append(omittedRound) - except: + except Exception: pass allTargets.append(targets) allDecodedBarcodes.append(decodedBarcodes) @@ -352,18 +361,18 @@ def decodeFunc(data: pd.DataFrame, return (allTargets, allDecodedBarcodes, allDecodedSpotCodes, allRoundOmit) - # Create list of logical arrays corresponding to the round sets being used to decode - roundPermutations = generateRoundPermutations(codebook.sizes[Axes.ROUND], roundOmitNum) + roundPermutations = generateRoundPermutations(codebook.sizes[Axes.ROUND], roundOmitNum) - # Create dictionary where the keys are the different round sets that can be used for decoding and the values - # are the modified codebooks corresponding to the rounds used + # Create dictionary where the keys are the different round sets that can be used for decoding + # and the values are the modified codebooks corresponding to the rounds used permCodeDict = {} for currentRounds in roundPermutations: codes = codebook.argmax(Axes.CH.value) - currentCodes = codes.sel(r=list(currentRounds)) + currentCodes = codes.sel(r=list(currentRounds)) currentCodes.values = np.ascontiguousarray(currentCodes.values) - permCodeDict[currentRounds] = dict(zip([tuple(code) for code in currentCodes.data], currentCodes['target'].data)) + permCodeDict[currentRounds] = dict(zip([tuple(code) for code in currentCodes.data], + currentCodes['target'].data)) # Goes through each round in filtered_prsr and tries to decode each spot's barcodes @@ -372,14 +381,15 @@ def decodeFunc(data: pd.DataFrame, # Calculates index ranges to chunk data by and creates list of chunked data to loop through ranges = [0] - for i in range(1, numJobs+1): - ranges.append(int((len(roundData)/numJobs)*i)) + for i in range(1, numJobs + 1): + ranges.append(int((len(roundData) / numJobs) * i)) chunkedData = [] for i in range(len(ranges[:-1])): - chunkedData.append(deepcopy(roundData[ranges[i]:ranges[i+1]])) + chunkedData.append(deepcopy(roundData[ranges[i]:ranges[i + 1]])) # Run in parallel - results = [decodeFunc.remote(chunkedData[i], roundPermutations, permutationCodesID, currentRound) for i in range(len(ranges[:-1]))] + results = [decodeFunc.remote(chunkedData[i], roundPermutations, permutationCodesID, + currentRound) for i in range(len(ranges[:-1]))] rayResults = ray.get(results) # Update table @@ -391,11 +401,11 @@ def decodeFunc(data: pd.DataFrame, # Drop barcodes and spot_codes column (saves memory) roundData = roundData.drop(['neighbors', 'spot_codes', 'barcodes'], axis=1) - - # Remove rows that have no decoded barcodes + # Remove rows that have no decoded barcodes roundData = roundData[roundData['targets'].astype(bool)].reset_index(drop=True) - # Add -1 spacer back into partial barcodes/spot codes so we can easily tell which round each spot ID is from + # Add -1 spacer back into partial barcodes/spot codes so we can easily tell which round each + # spot ID is from if roundOmitNum > 0: allBarcodes = [] allSpotCodes = [] @@ -406,71 +416,75 @@ def decodeFunc(data: pd.DataFrame, barcodes = [list(code) for code in dataBarcodes[i]] spotCodes = [list(code) for code in dataSpotCodes[i]] omittedRounds = dataOmittedRounds[i] - barcodes = [barcodes[j][:omittedRounds[j]] + [-1] + barcodes[j][omittedRounds[j]:] for j in 
range(len(barcodes))] - spotCodes = [spotCodes[j][:omittedRounds[j]] + [-1] + spotCodes[j][omittedRounds[j]:] for j in range(len(barcodes))] + barcodes = [barcodes[j][:omittedRounds[j]] + [-1] + barcodes[j][omittedRounds[j]:] + for j in range(len(barcodes))] + spotCodes = [spotCodes[j][:omittedRounds[j]] + [-1] + spotCodes[j][omittedRounds[j]:] + for j in range(len(barcodes))] allBarcodes.append(barcodes) allSpotCodes.append(spotCodes) roundData['decoded_barcodes'] = allBarcodes roundData['decoded_spot_codes'] = allSpotCodes - - return roundData + return roundData def distanceFilter(roundData: pd.DataFrame, spotCoords: dict, currentRound: int, numJobs: int) -> pd.DataFrame: ''' - Function that chooses between the best barcode for each spot from the set of decodable barcodes. Does this by - choosing the barcode with the least spatial variance among the spots that make it up. If there is a tie, the spot - is dropped as ambiguous - + Function that chooses between the best barcode for each spot from the set of decodable barcodes. + Does this by choosing the barcode with the least spatial variance among the spots that make it + up. If there is a tie, the spot is dropped as ambiguous. + Parameters ---------- roundData : pd.DataFrame - Modified spot table containing info on decodable barcodes for the spots in the current round + Modified spot table containing info on decodable barcodes for the spots in the current + round spotCoords : dict Dictionary containing spatial coordinates of spots in each round indexed by their IDs - + currentRound : int Current round number to calculate distances for - + numJobs : int Number of CPU threads to use in parallel - + Returns ------- - pd.DataFrame : Modified spot table with added columns to with info on the "best" barcode found for each spot + pd.DataFrame : Modified spot table with added columns to with info on the "best" barcode + found for each spot ''' - + @ray.remote def distanceFunc(subSpotCodes: list, spotCoords: dict) -> list: ''' Subfunction for distanceFilter to allow it to run in parallel using ray - + Parameters ---------- subSpotCodes : list - Chunk of full list of spot codes for the current round to calculate the spatial variance for - + Chunk of full list of spot codes for the current round to calculate the spatial + variance for + spotCoords : dict - Dictionary containing spatial locations for spots by their IDs in the original spotTables object - - + Dictionary containing spatial locations for spots by their IDs in the original + spotTables object + Returns ------- list: list of spatial variances for the current chunk of spot codes - + ''' - + # Calculate spatial variances for current chunk of spot codes allDistances = [] - for spotCodes in subSpotCodes: + for spotCodes in subSpotCodes: distances = [] - for s,spotCode in enumerate(spotCodes): + for s, spotCode in enumerate(spotCodes): coords = [] - for j,spot in enumerate(spotCode): + for j, spot in enumerate(spotCode): if spot != -1: # Extract spot coordinates from spotCoords z = spotCoords[j][spot]['z'] @@ -479,11 +493,10 @@ def distanceFunc(subSpotCodes: list, spotCoords: dict) -> list: coords.append([z, y, x]) coords = np.asarray(coords) # Distance is calculate as the sum of variances of the coordinates along each axis - distances.append(sum(np.var(coords, axis = 0))) + distances.append(sum(np.var(coords, axis=0))) allDistances.append(distances) return allDistances - # Calculate the spatial variance for each decodable barcode for each spot in each round allSpotCodes = 
roundData['decoded_spot_codes'] @@ -493,17 +506,18 @@ def distanceFunc(subSpotCodes: list, spotCoords: dict) -> list: # Calculates index ranges to chunk data by ranges = [0] for i in range(1, numJobs): - ranges.append(int((len(roundData)/numJobs)*i)) + ranges.append(int((len(roundData) / numJobs) * i)) ranges.append(len(roundData)) - chunkedSpotCodes = [allSpotCodes[ranges[i]:ranges[i+1]] for i in range(len(ranges[:-1]))] + chunkedSpotCodes = [allSpotCodes[ranges[i]:ranges[i + 1]] for i in range(len(ranges[:-1]))] # Run in parallel using ray - results = [distanceFunc.remote(subSpotCodes, spotCoordsID) for subSpotCodes in chunkedSpotCodes] + results = [distanceFunc.remote(subSpotCodes, spotCoordsID) for subSpotCodes + in chunkedSpotCodes] rayResults = ray.get(results) # Add distances to decodedTables as new column roundData['distance'] = list(chain(*[job for job in rayResults])) - + # Pick minimum distance barcode(s) for each spot bestSpotCodes = [] bestBarcodes = [] @@ -528,7 +542,7 @@ def distanceFunc(subSpotCodes: list, spotCoords: dict) -> list: else: minDist = 100 minCount = 0 - for d,distance in enumerate(distances): + for d, distance in enumerate(distances): if distance < minDist: minDist = distance minCount = 1 @@ -548,7 +562,8 @@ def distanceFunc(subSpotCodes: list, spotCoords: dict) -> list: roundData['best_distances'] = bestDistances # Drop old columns - roundData = roundData.drop(['targets', 'decoded_barcodes', 'decoded_spot_codes', 'omitted_round'], axis=1) + roundData = roundData.drop(['targets', 'decoded_barcodes', 'decoded_spot_codes', + 'omitted_round'], axis=1) # Only keep barcodes with only one minimum distance keep = [] @@ -557,35 +572,36 @@ def distanceFunc(subSpotCodes: list, spotCoords: dict) -> list: if len(barcodes[i]) == 1: keep.append(i) roundData = roundData.iloc[keep] - + return roundData - def cleanup(bestPerSpotTables: dict, spotCoords: dict, filterRounds: int) -> pd.DataFrame: - + ''' - Function that combines all "best" codes for each spot in each round into a single table, filters them by their - frequency (with a user-defined threshold), chooses between overlapping codes (using the same distance function - as used earlier), and finally adds some additional information to the final set of barcodes - + Function that combines all "best" codes for each spot in each round into a single table, + filters them by their frequency (with a user-defined threshold), chooses between overlapping + codes (using the same distance function as used earlier), and finally adds some additional + information to the final set of barcodes + Parameters ---------- bestPerSpotTables : dict - Spot tables dictionary containing columns with information on the "best" barcode found for each spot - + Spot tables dictionary containing columns with information on the "best" barcode found + for each spot + spotCoords : dict Dictionary containing spatial locations of spots - + filterRounds : int - Number of rounds that a barcode must be identified in to pass filters (higher = more stringent filtering), - default = 1 - #rounds or 1 - roundOmitNum if roundOmitNum > 0 - + Number of rounds that a barcode must be identified in to pass filters (higher = more + stringent filtering), default = 1 - #rounds or 1 - roundOmitNum if roundOmitNum > 0 + Returns ------- pd.DataFrame : Dataframe containing final set of codes that have passed all filters - + ''' # Create merged spot results dataframe containing the passing barcodes found in all the rounds @@ -606,18 +622,18 @@ def cleanup(bestPerSpotTables: 
dict, # Only use codes that were found in >= filterRounds rounds spotCodes = mergedCodes['best_spot_codes'] - counts = defaultdict(int) + counts = defaultdict(int) # type: dict for code in spotCodes: counts[code] += 1 passing = list(set(code for code in counts if counts[code] >= filterRounds)) finalCodes = mergedCodes[mergedCodes['best_spot_codes'].isin(passing)].reset_index(drop=True) - finalCodes = finalCodes.iloc[finalCodes['best_spot_codes'].drop_duplicates().index].reset_index(drop=True) - + finalCodes = finalCodes.iloc[finalCodes['best_spot_codes'].drop_duplicates().index] + finalCodes = finalCodes.reset_index(drop=True) + # Choose between overlapping spot codes based on which has the smaller spatial variance for r in range(roundNum): roundSpots = [code[r] for code in finalCodes['best_spot_codes'] if code[r] != -1] dupSpots = set([spot for spot in roundSpots if Counter(roundSpots)[spot] > 1]) - nonDupSpots = [spot for spot in roundSpots if spot not in dupSpots] drop = [] for spot in dupSpots: locs = np.where(np.asarray(roundSpots) == spot)[0] @@ -627,61 +643,64 @@ def cleanup(bestPerSpotTables: dict, drop.extend([ind for ind in minInd]) else: drop.extend([locs[i] for i in range(len(locs)) if i != minInd]) - finalCodes = finalCodes.iloc[[i for i in range(len(finalCodes)) if i not in drop]].reset_index(drop=True) + finalCodes = finalCodes.iloc[[i for i in range(len(finalCodes)) if i not in drop]] + finalCodes = finalCodes.reset_index(drop=True) - # Add spot coordinates, barcode center coordinates, and number of rounds used for each barcode to table + # Add spot coordinates, barcode center coordinates, and number of rounds used for each barcode + # to table allCoords = [] centers = [] - distance = [] roundsUsed = [] for i in range(len(finalCodes)): coords = [] spotCode = finalCodes.iloc[i]['best_spot_codes'] - roundsUsed.append(roundNum-Counter(spotCode)[-1]) + roundsUsed.append(roundNum - Counter(spotCode)[-1]) for r in range(roundNum): if spotCode[r] != -1: z = spotCoords[r][spotCode[r]]['z'] y = spotCoords[r][spotCode[r]]['y'] x = spotCoords[r][spotCode[r]]['x'] - coords.append((x,y,z)) - else: - coords.append(-1) + coords.append((x, y, z)) allCoords.append(coords) - coords = np.asarray([coord for coord in coords if coord != -1]) + coords = np.asarray([coord for coord in coords]) center = np.asarray(coords).mean(axis=0) centers.append(center) finalCodes['coords'] = allCoords finalCodes['center'] = centers finalCodes['rounds_used'] = roundsUsed - - return finalCodes + return finalCodes def removeUsedSpots(finalCodes: pd.DataFrame, neighborDict: dict) -> dict: ''' - Remove spots found to be in barcodes for the current round omission number so they are not used for the next - + Remove spots found to be in barcodes for the current round omission number so they are not used + for the next + Parameters ---------- finalCodes : pd.DataFrame Dataframe containing final set of codes that have passed all filters - + neighborDict : dict - Dictionary that contains all the neighbors for each spot in other rounds that are within the search radius - + Dictionary that contains all the neighbors for each spot in other rounds that are + within the search radius + Returns ------- - dict : Modified version of neighborDict with spots that have been used in the current round omission removed + dict : Modified version of neighborDict with spots that have been used in the current round + omission removed ''' # Remove used spots roundNum = len(neighborDict) for r in range(roundNum): - usedSpots = 
list(set([passed[r] for passed in finalCodes['best_spot_codes'] if passed[r] != -1])) + usedSpots = list(set([passed[r] for passed in finalCodes['best_spot_codes'] + if passed[r] != -1])) for spot in usedSpots: for key in neighborDict[r][spot]: for neighbor in neighborDict[r][spot][key]: - neighborDict[key][neighbor][r] = [i for i in neighborDict[key][neighbor][r] if i != spot] + neighborDict[key][neighbor][r] = [i for i in neighborDict[key][neighbor][r] + if i != spot] del neighborDict[r][spot] # Remove empty lists @@ -690,5 +709,5 @@ def removeUsedSpots(finalCodes: pd.DataFrame, neighborDict: dict) -> dict: for key in [*neighborDict[r][spot]]: if neighborDict[r][spot][key] == []: del neighborDict[r][spot][key] - - return neighborDict \ No newline at end of file + + return neighborDict From 548e4e57fc9ab8439c82046e04eec22a68110d81 Mon Sep 17 00:00:00 2001 From: nickeener Date: Sun, 17 Oct 2021 14:10:09 -0700 Subject: [PATCH 05/30] added tests for checkAll decoder --- .../core/spots/DecodeSpots/check_all_funcs.py | 2 +- .../spots/DecodeSpots/test/test_check_all.py | 167 ++++++++++++++++++ 2 files changed, 168 insertions(+), 1 deletion(-) create mode 100644 starfish/core/spots/DecodeSpots/test/test_check_all.py diff --git a/starfish/core/spots/DecodeSpots/check_all_funcs.py b/starfish/core/spots/DecodeSpots/check_all_funcs.py index d3b14c7b8..882520c5f 100644 --- a/starfish/core/spots/DecodeSpots/check_all_funcs.py +++ b/starfish/core/spots/DecodeSpots/check_all_funcs.py @@ -213,7 +213,7 @@ def decoder(roundData: pd.DataFrame, codebook: Codebook, roundOmitNum: int, currentRound: int, - numJobs: int) -> pd.DataFrane: + numJobs: int) -> pd.DataFrame: ''' Function that takes spots tables with possible barcodes added and matches each to the codebook diff --git a/starfish/core/spots/DecodeSpots/test/test_check_all.py b/starfish/core/spots/DecodeSpots/test/test_check_all.py new file mode 100644 index 000000000..31cab19b3 --- /dev/null +++ b/starfish/core/spots/DecodeSpots/test/test_check_all.py @@ -0,0 +1,167 @@ +import numpy as np +import random +from scipy.ndimage.filters import gaussian_filter + +from starfish import ImageStack +from starfish.core.spots.DecodeSpots.check_all_decoder import CheckAll +from starfish.core.codebook.codebook import Codebook +from starfish.core.spots.FindSpots import BlobDetector + +def syntheticSeqfish(x, y, z, codebook, nSpots, jitter, error): + nRound = codebook.shape[1] + nChannel = codebook.shape[2] + img = np.zeros((nRound, nChannel, z, y, x), dtype=np.float32) + + intCodes = np.argmax(codebook.data, axis=2) + + targets = [] + for _ in range(nSpots): + randx = random.choice(range(5, x - 5)) + randy = random.choice(range(5, y - 5)) + randz = random.choice(range(2, z - 2)) + randCode = random.choice(range(len(codebook))) + targets.append((randCode, (randx, randy, randz))) + if jitter > 0: + randx += random.choice(range(jitter + 1)) * random.choice([1, -1]) + randy += random.choice(range(jitter + 1)) * random.choice([1, -1]) + if error: + skip = random.choice(range(nRound)) + else: + skip = 100 + for r, ch in enumerate(intCodes[randCode]): + if r != skip: + img[r, ch, randz, randy, randx] = 10 + + gaussian_filter(img, (0, 0, 0.5, 1.5, 1.5), output=img) + + return ImageStack.from_numpy(img / img.max()), targets + + +def seqfishCodebook(nRound, nChannel, nCodes): + + def barcodeConv(lis, chs): + barcode = np.zeros((len(lis), chs)) + for i in range(len(lis)): + barcode[i][lis[i]] = 1 + return barcode + + def incrBarcode(lis, chs): + currInd = len(lis) - 1 + 
lis[currInd] += 1 + while lis[currInd] == chs: + lis[currInd] = 0 + currInd -= 1 + lis[currInd] += 1 + return lis + + allCombo = np.zeros((nChannel ** nRound, nRound, nChannel)) + + barcode = [0] * nRound + for i in range(np.shape(allCombo)[0]): + allCombo[i] = barcodeConv(barcode, nChannel) + barcode = incrBarcode(barcode, nChannel) + + hammingDistance = 1 + blanks = [] + i = 0 + while i < len(allCombo): + blanks.append(allCombo[i]) + j = i + 1 + while j < len(allCombo): + if np.count_nonzero(~(allCombo[i] == allCombo[j])) / 2 <= hammingDistance: + allCombo = allCombo[[k for k in range(len(allCombo)) if k != j]] + else: + j += 1 + i += 1 + + data = np.asarray(blanks)[random.sample(range(len(blanks)), nCodes)] + + return Codebook.from_numpy(code_names=range(len(data)), n_round=nRound, + n_channel=nChannel, data=data) + +def testExactMatches(): + + codebook = seqfishCodebook(5, 3, 20) + + img, trueTargets = syntheticSeqfish(100, 100, 20, codebook, 20, 0, False) + + bd = BlobDetector(min_sigma=1, max_sigma=4, num_sigma=30, threshold=.1, exclude_border=False) + spots = bd.run(image_stack=img) + assert spots.count_total_spots() == 5 * 20, 'Spot detector did not find all spots' + + decoder = CheckAll(codebook=codebook, search_radius=1, error_rounds=0) + hits = decoder.run(spots=spots, n_processes=4) + + testTargets = [] + for i in range(len(hits)): + testTargets.append((int(hits[i]['target'].data), + (int(hits[i]['x'].data), int(hits[i]['y'].data), + int(hits[i]['z'].data)))) + + matches = 0 + for true in trueTargets: + for test in testTargets: + if true[0] == test[0]: + if test[1][0] + 1 >= true[1][0] >= test[1][0] - 1 and \ + test[1][1] + 1 >= true[1][1] >= test[1][1] - 1: + matches += 1 + + assert matches == len(trueTargets) + +def testJitteredMatches(): + + codebook = seqfishCodebook(5, 3, 20) + + img, trueTargets = syntheticSeqfish(100, 100, 20, codebook, 20, 2, False) + + bd = BlobDetector(min_sigma=1, max_sigma=4, num_sigma=30, threshold=.1, exclude_border=False) + spots = bd.run(image_stack=img) + assert spots.count_total_spots() == 5 * 20, 'Spot detector did not find all spots' + + decoder = CheckAll(codebook=codebook, search_radius=3, error_rounds=0) + hits = decoder.run(spots=spots, n_processes=4) + + testTargets = [] + for i in range(len(hits)): + testTargets.append((int(hits[i]['target'].data), + (int(hits[i]['x'].data), int(hits[i]['y'].data), + int(hits[i]['z'].data)))) + + matches = 0 + for true in trueTargets: + for test in testTargets: + if true[0] == test[0]: + if test[1][0] + 3 >= true[1][0] >= test[1][0] - 3 and \ + test[1][1] + 3 >= true[1][1] >= test[1][1] - 3: + matches += 1 + + assert matches == len(trueTargets) + +def testErrorCorrection(): + + codebook = seqfishCodebook(5, 3, 20) + + img, trueTargets = syntheticSeqfish(100, 100, 20, codebook, 20, 0, True) + + bd = BlobDetector(min_sigma=1, max_sigma=4, num_sigma=30, threshold=.1, exclude_border=False) + spots = bd.run(image_stack=img) + assert spots.count_total_spots() == 4 * 20, 'Spot detector did not find all spots' + + decoder = CheckAll(codebook=codebook, search_radius=1, error_rounds=1) + hits = decoder.run(spots=spots, n_processes=4) + + testTargets = [] + for i in range(len(hits)): + testTargets.append((int(str(hits[i]['target'].data).split('.')[0]), + (int(hits[i]['x'].data), int(hits[i]['y'].data), + int(hits[i]['z'].data)))) + + matches = 0 + for true in trueTargets: + for test in testTargets: + if true[0] == test[0]: + if test[1][0] + 1 >= true[1][0] >= test[1][0] - 1 and \ + test[1][1] + 1 >= true[1][1] >= 
test[1][1] - 1: + matches += 1 + + assert matches == len(trueTargets) From c04f5039bcdcef23a94dc6f3d55b763381e6c1f5 Mon Sep 17 00:00:00 2001 From: nickeener Date: Mon, 18 Oct 2021 12:32:20 -0700 Subject: [PATCH 06/30] Fixed imports order --- starfish/core/spots/DecodeSpots/__init__.py | 2 +- .../core/spots/DecodeSpots/check_all_decoder.py | 16 ++++++++-------- .../core/spots/DecodeSpots/check_all_funcs.py | 17 ++++++++++------- 3 files changed, 19 insertions(+), 16 deletions(-) diff --git a/starfish/core/spots/DecodeSpots/__init__.py b/starfish/core/spots/DecodeSpots/__init__.py index 71ed89c2e..5660f1339 100644 --- a/starfish/core/spots/DecodeSpots/__init__.py +++ b/starfish/core/spots/DecodeSpots/__init__.py @@ -1,8 +1,8 @@ from ._base import DecodeSpotsAlgorithm +from .check_all_decoder import CheckAll from .metric_decoder import MetricDistance from .per_round_max_channel_decoder import PerRoundMaxChannel from .simple_lookup_decoder import SimpleLookupDecoder -from .check_all_decoder import CheckAll # autodoc's automodule directive only captures the modules explicitly listed in __all__. __all__ = list(set( diff --git a/starfish/core/spots/DecodeSpots/check_all_decoder.py b/starfish/core/spots/DecodeSpots/check_all_decoder.py index 40f2522c1..83e1df4c4 100644 --- a/starfish/core/spots/DecodeSpots/check_all_decoder.py +++ b/starfish/core/spots/DecodeSpots/check_all_decoder.py @@ -1,20 +1,20 @@ -from typing import Mapping, Hashable, Tuple, Any -import ray -import pandas as pd -import numpy as np from copy import deepcopy +from typing import Any, Hashable, Mapping, Tuple + +import numpy as np +import pandas as pd +import ray + from starfish.core.codebook.codebook import Codebook from starfish.core.intensity_table.decoded_intensity_table import DecodedIntensityTable +from starfish.core.intensity_table.intensity_table import IntensityTable from starfish.core.intensity_table.intensity_table_coordinates import \ transfer_physical_coords_to_intensity_table -from starfish.core.intensity_table.intensity_table import IntensityTable from starfish.core.types import SpotFindingResults from starfish.types import Axes, Features from ._base import DecodeSpotsAlgorithm - - -from .check_all_funcs import findNeighbors, buildBarcodes, decoder, distanceFilter, cleanup, \ +from .check_all_funcs import buildBarcodes, cleanup, decoder, distanceFilter, findNeighbors, \ removeUsedSpots from .util import _merge_spots_by_round diff --git a/starfish/core/spots/DecodeSpots/check_all_funcs.py b/starfish/core/spots/DecodeSpots/check_all_funcs.py index 882520c5f..fa76fcdcf 100644 --- a/starfish/core/spots/DecodeSpots/check_all_funcs.py +++ b/starfish/core/spots/DecodeSpots/check_all_funcs.py @@ -1,14 +1,17 @@ -from collections import Counter -from scipy.spatial import cKDTree +import warnings +from collections import Counter, defaultdict from copy import deepcopy -from itertools import product, chain, permutations -from collections import defaultdict -import ray +from itertools import chain, permutations, product + + import numpy as np import pandas as pd -import warnings -from starfish.types import Axes +import ray +from scipy.spatial import cKDTree + from starfish.core.codebook.codebook import Codebook +from starfish.types import Axes + warnings.filterwarnings('ignore') def findNeighbors(spotTables: dict, searchRadius: float) -> dict: From 15f5fa0a98ab9effbb17c7f265ac64a8c64e2139 Mon Sep 17 00:00:00 2001 From: nickeener Date: Tue, 19 Oct 2021 13:05:11 -0700 Subject: [PATCH 07/30] include test updates --- 
starfish/core/spots/DecodeSpots/test/test_check_all.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/starfish/core/spots/DecodeSpots/test/test_check_all.py b/starfish/core/spots/DecodeSpots/test/test_check_all.py index 31cab19b3..36b267e2c 100644 --- a/starfish/core/spots/DecodeSpots/test/test_check_all.py +++ b/starfish/core/spots/DecodeSpots/test/test_check_all.py @@ -1,10 +1,11 @@ -import numpy as np import random + +import numpy as np from scipy.ndimage.filters import gaussian_filter from starfish import ImageStack -from starfish.core.spots.DecodeSpots.check_all_decoder import CheckAll from starfish.core.codebook.codebook import Codebook +from starfish.core.spots.DecodeSpots.check_all_decoder import CheckAll from starfish.core.spots.FindSpots import BlobDetector def syntheticSeqfish(x, y, z, codebook, nSpots, jitter, error): From bfd07154dc4d120cf438f85e905e0e9f045613b3 Mon Sep 17 00:00:00 2001 From: nickeener Date: Tue, 19 Oct 2021 15:03:34 -0700 Subject: [PATCH 08/30] annotation fixes and speed improvements --- .../decoded_intensity_table.py | 3 + .../spots/DecodeSpots/check_all_decoder.py | 32 +-- .../core/spots/DecodeSpots/check_all_funcs.py | 230 +++++++----------- 3 files changed, 100 insertions(+), 165 deletions(-) diff --git a/starfish/core/intensity_table/decoded_intensity_table.py b/starfish/core/intensity_table/decoded_intensity_table.py index 73ee10caa..0dc98f951 100644 --- a/starfish/core/intensity_table/decoded_intensity_table.py +++ b/starfish/core/intensity_table/decoded_intensity_table.py @@ -61,8 +61,10 @@ def from_intensity_table( distances: Optional[Tuple[str, np.ndarray]] = None, passes_threshold: Optional[Tuple[str, np.ndarray]] = None, rounds_used: Optional[Tuple[str, np.ndarray]] = None): + """ Assign target values to intensities. 
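+
+        For example (a hypothetical sketch, not from the starfish docs; ``targets``
+        here is assumed to be an array of gene names, one per feature)::
+
+            DecodedIntensityTable.from_intensity_table(
+                intensities,
+                targets=(Features.AXIS, targets),
+                rounds_used=(Features.AXIS, np.full(len(targets), 4)))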
+ Parameters ---------- intensities : IntensityTable @@ -77,6 +79,7 @@ def from_intensity_table( rounds_used: Optional[Tuple[str, np.ndarray]] Corresponding array of integers indicated the number of rounds this decoded intensity was found in + Returns ------- DecodedIntensityTable diff --git a/starfish/core/spots/DecodeSpots/check_all_decoder.py b/starfish/core/spots/DecodeSpots/check_all_decoder.py index 83e1df4c4..5bfded221 100644 --- a/starfish/core/spots/DecodeSpots/check_all_decoder.py +++ b/starfish/core/spots/DecodeSpots/check_all_decoder.py @@ -5,7 +5,6 @@ import pandas as pd import ray - from starfish.core.codebook.codebook import Codebook from starfish.core.intensity_table.decoded_intensity_table import DecodedIntensityTable from starfish.core.intensity_table.intensity_table import IntensityTable @@ -14,7 +13,7 @@ from starfish.core.types import SpotFindingResults from starfish.types import Axes, Features from ._base import DecodeSpotsAlgorithm -from .check_all_funcs import buildBarcodes, cleanup, decoder, distanceFilter, findNeighbors, \ +from .check_all_funcs import buildBarcodes, cleanup, createRefDicts, decoder, distanceFilter, \ removeUsedSpots from .util import _merge_spots_by_round @@ -121,17 +120,6 @@ def run(self, else: self.filterRounds = len(spotTables) - self.errorRounds - # Create dictionary of neighbors (within the search radius) in other rounds for each spot - neighborDict = findNeighbors(spotTables, self.searchRadius) - - # Create dictionaries with mapping from spot id (row index) in spotTables to channel - # number and one with spot coordinates for fast access - channelDict = {} - spotCoords = {} - for r in [*spotTables]: - channelDict[r] = spotTables[r]['c'].to_dict() - spotCoords[r] = spotTables[r][['z', 'y', 'x']].T.to_dict() - # Set list of round omission numbers to loop through roundOmits = range(self.errorRounds + 1) @@ -139,6 +127,9 @@ def run(self, allCodes = pd.DataFrame() for currentRoundOmitNum in roundOmits: + # Create necessary reference dictionaries + neighborDict, channelDict, spotCoords = createRefDicts(spotTables, self.searchRadius) + # Chooses best barcode for all spots in each round sequentially (possible barcode # space can become quite large which can increase memory needs so I do it this way so # we only need to store all potential barcodes that originate from one round at a @@ -170,10 +161,10 @@ def run(self, finalCodes = cleanup(decodedTables, spotCoords, self.filterRounds) # If this is not the last round omission number to run, remove spots that have just - # been found to be in passing barcodes from neighborDict so they are not used for the + # been found to be in passing barcodes from spotTables so they are not used for the # next round omission number if currentRoundOmitNum != roundOmits[-1]: - neighborDict = removeUsedSpots(finalCodes, neighborDict) + spotTables = removeUsedSpots(finalCodes, spotTables) # Append found codes to allCodes table allCodes = allCodes.append(finalCodes).reset_index(drop=True) @@ -186,8 +177,8 @@ def run(self, rounds = spots.round_labels # create empty IntensityTable filled with np.nan - data = np.full((len(allCodes), len(channels), len(rounds)), fill_value=np.nan) - dims = (Features.AXIS, Axes.CH.value, Axes.ROUND.value) + data = np.full((len(allCodes), len(rounds), len(channels)), fill_value=np.nan) + dims = (Features.AXIS, Axes.ROUND.value, Axes.CH.value) centers = allCodes['center'] coords: Mapping[Hashable, Tuple[str, Any]] = { Features.SPOT_RADIUS: (Features.AXIS, np.full(len(allCodes), 1)), @@ -208,18 
+199,17 @@ def run(self, for ch in allCodes.loc[i, 'best_barcodes']: # If a round is not used, row will be all zeros code.append(np.asarray([0 if j != ch else 1 for j in range(len(channels))])) - table_codes.append(np.asarray(code).T) + table_codes.append(np.asarray(code)) int_table.values = np.asarray(table_codes) int_table = transfer_physical_coords_to_intensity_table(intensity_table=int_table, spots=spots) - intensities = int_table.transpose('features', 'r', 'c') # Validate results are correct shape - self.codebook._validate_decode_intensity_input_matches_codebook_shape(intensities) + self.codebook._validate_decode_intensity_input_matches_codebook_shape(int_table) # Create DecodedIntensityTable result = DecodedIntensityTable.from_intensity_table( - intensities, + int_table, targets=(Features.AXIS, allCodes['best_targets'].astype('U')), distances=(Features.AXIS, allCodes["best_distances"]), passes_threshold=(Features.AXIS, np.full(len(allCodes), True)), diff --git a/starfish/core/spots/DecodeSpots/check_all_funcs.py b/starfish/core/spots/DecodeSpots/check_all_funcs.py index fa76fcdcf..c6eec5f43 100644 --- a/starfish/core/spots/DecodeSpots/check_all_funcs.py +++ b/starfish/core/spots/DecodeSpots/check_all_funcs.py @@ -3,7 +3,6 @@ from copy import deepcopy from itertools import chain, permutations, product - import numpy as np import pandas as pd import ray @@ -14,8 +13,42 @@ warnings.filterwarnings('ignore') -def findNeighbors(spotTables: dict, searchRadius: float) -> dict: +def createRefDicts(spotTables: dict, searchRadius: float) -> tuple: + ''' + Creates reference dictionary that have mappings between the each spot's round and ID and their + neighbors, channel label, and spatial coordinates. + Parameters + ---------- + spotTables : dict + Dictionary with round labels as keys and pandas dataframes containing spot information + for its key round as values (result of _merge_spots_by_round function) + searchRadius : float + Distance that spots can be from each other and still form a barcode + + Returns + ------- + tuple : First object is the neighbors dictionary, second is the channel dictionary, and the + third object is the spatial coordinate dictionary + ''' + + # Create dictionary of neighbors (within the search radius) in other rounds for each spot + neighborDict = findNeighbors(spotTables, searchRadius) + + # Create dictionaries with mapping from spot id (row index) in spotTables to channel + # number and one with spot coordinates for fast access + channelDict = {} + spotCoords = {} + for r in [*spotTables]: + channelDict[r] = spotTables[r]['c'].to_dict() + spotCoords[r] = spotTables[r][['z', 'y', 'x']].T.to_dict() + for key in [*spotCoords[r]]: + spotCoords[r][key] = tuple([item[1] for item in sorted(spotCoords[r][key].items(), + key=lambda x: x[0])]) + + return neighborDict, channelDict, spotCoords + +def findNeighbors(spotTables: dict, searchRadius: float) -> dict: ''' Function that takes spatial information from the spot tables from each round and creates a dictionary that contains all the neighbors for each spot in other rounds that are within the @@ -70,7 +103,6 @@ def buildBarcodes(roundData: pd.DataFrame, channelDict: dict, currentRound: int, numJobs: int) -> pd.DataFrame: - ''' Function that adds to the current rounds spot table all the possible barcodes that could be formed using the neighbors of each spot, spots without enough neighbors to form a barcode @@ -158,7 +190,8 @@ def barcodeBuildFunc(data: pd.DataFrame, # Creates all possible spot code combinations from 
neighbors codes = list(product(*neighborLists)) # Only save the ones with the correct number of dropped rounds - spotCodes = [code for code in codes if Counter(code)[-1] == roundOmitNum] + counters = [Counter(code) for code in codes] # type: list + spotCodes = [code for j, code in enumerate(codes) if counters[j][-1] == roundOmitNum] # Create barcodes from spot codes using the mapping from spot ID to channel barcodes = [] for spotCode in spotCodes: @@ -217,7 +250,6 @@ def decoder(roundData: pd.DataFrame, roundOmitNum: int, currentRound: int, numJobs: int) -> pd.DataFrame: - ''' Function that takes spots tables with possible barcodes added and matches each to the codebook to identify any matches. Matches are added to the spot tables and spots without any matches are @@ -276,7 +308,6 @@ def decodeFunc(data: pd.DataFrame, roundPermutations: list, permutationCodes: dict, rnd: int) -> tuple: - ''' Subfunction for decoder that allows it to run in parallel chunks using ray @@ -307,77 +338,45 @@ def decodeFunc(data: pd.DataFrame, allTargets = [] allDecodedBarcodes = [] allDecodedSpotCodes = [] - allRoundOmit = [] allBarcodes = list(data['barcodes']) allSpotCodes = list(data['spot_codes']) for i in range(len(allBarcodes)): targets = [] decodedBarcodes = [] decodedSpotCodes = [] - roundOmit = [] - fullBarcodes = allBarcodes[i] - fullSpotCodes = allSpotCodes[i] - - for currentRounds in roundPermutations: - - # Set omittedRound to the round being dropped, if no round is dropped omittedRound - # becomes -1 - if 0 in currentRounds: - omittedRound = np.argwhere([not cr for cr in currentRounds])[0][0] - else: - omittedRound = -1 - - # Only try to decode barcodes for this spot if the current round is not the omitted - # round - if rnd != omittedRound: - # Modify spot codes and barcodes so that they match the current set of rounds - # being used for decoding - if omittedRound != -1: - spotCodes = [code for code in - np.asarray([np.asarray(spotCode)[list(currentRounds)] - for spotCode in fullSpotCodes]) if -1 not in code] - barcodes = [code for code in - np.asarray([np.asarray(barcode)[list(currentRounds)] - for barcode in fullBarcodes]) if -1 not in code] - else: - spotCodes = fullSpotCodes - barcodes = fullBarcodes - # If all barcodes omit a round other than omittedRound, barcodes will be empty - if len(barcodes) > 0: - # Tries to find a match to each possible barcode from the spot - for j, barcode in enumerate(barcodes): - try: - # Try to assign target by using barcode as key in permutationsCodes - # dictionary for current set of rounds. If there is no barcode - # match, it will error and go to the except and if it succeeds it - # will add the data to the other lists for this barcode - targets.append(permutationCodes[currentRounds][tuple(barcode)]) - decodedBarcodes.append(barcode) - decodedSpotCodes.append(list(spotCodes[j])) - roundOmit.append(omittedRound) - except Exception: - pass + for j, barcode in enumerate(allBarcodes[i]): + if barcode[rnd] != -1: + # Try to assign target by using barcode as key in permutationsCodes dictionary + # for current set of rounds. 
If there is no barcode match, it will error and go + # to the except and if it succeeds it will add the data to the other lists for + # this barcode + try: + targets.append(permutationCodes[tuple(barcode)]) + decodedBarcodes.append(barcode) + decodedSpotCodes.append(list(allSpotCodes[i][j])) + except Exception: + pass allTargets.append(targets) allDecodedBarcodes.append(decodedBarcodes) allDecodedSpotCodes.append(decodedSpotCodes) - allRoundOmit.append(roundOmit) - return (allTargets, allDecodedBarcodes, allDecodedSpotCodes, allRoundOmit) + return (allTargets, allDecodedBarcodes, allDecodedSpotCodes) # Create list of logical arrays corresponding to the round sets being used to decode roundPermutations = generateRoundPermutations(codebook.sizes[Axes.ROUND], roundOmitNum) - # Create dictionary where the keys are the different round sets that can be used for decoding - # and the values are the modified codebooks corresponding to the rounds used + # Create dictionary where the keys are all the possible barocodes (where dropped rounds + # are set to -1) for the current roundOmitNum. Provides fast mapping from barcode to + # target mRNA and having all the different dropped rounds together eliminates the need + # to loop through them. permCodeDict = {} for currentRounds in roundPermutations: - codes = codebook.argmax(Axes.CH.value) - currentCodes = codes.sel(r=list(currentRounds)) - currentCodes.values = np.ascontiguousarray(currentCodes.values) - permCodeDict[currentRounds] = dict(zip([tuple(code) for code in currentCodes.data], - currentCodes['target'].data)) - - # Goes through each round in filtered_prsr and tries to decode each spot's barcodes + codes = codebook.data.argmax(axis=2) + if roundOmitNum > 0: + omittedRounds = np.argwhere(~np.asarray(currentRounds)) + codes[:, omittedRounds] = -1 + roundDict = dict(zip([tuple(code) for code in codes], codebook['target'].data)) + permCodeDict.update(roundDict) # Put data table and permutations codes dictionary in ray storage permutationCodesID = ray.put(permCodeDict) @@ -399,7 +398,6 @@ def decodeFunc(data: pd.DataFrame, roundData['targets'] = list(chain(*[job[0] for job in rayResults])) roundData['decoded_barcodes'] = list(chain(*[job[1] for job in rayResults])) roundData['decoded_spot_codes'] = list(chain(*[job[2] for job in rayResults])) - roundData['omitted_round'] = list(chain(*[job[3] for job in rayResults])) # Drop barcodes and spot_codes column (saves memory) roundData = roundData.drop(['neighbors', 'spot_codes', 'barcodes'], axis=1) @@ -407,27 +405,6 @@ def decodeFunc(data: pd.DataFrame, # Remove rows that have no decoded barcodes roundData = roundData[roundData['targets'].astype(bool)].reset_index(drop=True) - # Add -1 spacer back into partial barcodes/spot codes so we can easily tell which round each - # spot ID is from - if roundOmitNum > 0: - allBarcodes = [] - allSpotCodes = [] - dataBarcodes = roundData['decoded_barcodes'] - dataSpotCodes = roundData['decoded_spot_codes'] - dataOmittedRounds = roundData['omitted_round'] - for i in range(len(roundData)): - barcodes = [list(code) for code in dataBarcodes[i]] - spotCodes = [list(code) for code in dataSpotCodes[i]] - omittedRounds = dataOmittedRounds[i] - barcodes = [barcodes[j][:omittedRounds[j]] + [-1] + barcodes[j][omittedRounds[j]:] - for j in range(len(barcodes))] - spotCodes = [spotCodes[j][:omittedRounds[j]] + [-1] + spotCodes[j][omittedRounds[j]:] - for j in range(len(barcodes))] - allBarcodes.append(barcodes) - allSpotCodes.append(spotCodes) - roundData['decoded_barcodes'] = 
allBarcodes - roundData['decoded_spot_codes'] = allSpotCodes - return roundData def distanceFilter(roundData: pd.DataFrame, @@ -486,15 +463,8 @@ def distanceFunc(subSpotCodes: list, spotCoords: dict) -> list: for spotCodes in subSpotCodes: distances = [] for s, spotCode in enumerate(spotCodes): - coords = [] - for j, spot in enumerate(spotCode): - if spot != -1: - # Extract spot coordinates from spotCoords - z = spotCoords[j][spot]['z'] - y = spotCoords[j][spot]['y'] - x = spotCoords[j][spot]['x'] - coords.append([z, y, x]) - coords = np.asarray(coords) + coords = np.asarray([spotCoords[j][spot] for j, spot in enumerate(spotCode) + if spot != -1]) # Distance is calculate as the sum of variances of the coordinates along each axis distances.append(sum(np.var(coords, axis=0))) allDistances.append(distances) @@ -541,23 +511,13 @@ def distanceFunc(subSpotCodes: list, spotCoords: dict) -> list: bestBarcodes.append(barcodes) bestTargets.append(targets) bestDistances.append(distances) - # Otherwise find the minimum, and if there are multiple minimums + # Otherwise find the minimum(s) else: - minDist = 100 - minCount = 0 - for d, distance in enumerate(distances): - if distance < minDist: - minDist = distance - minCount = 1 - minInds = [] - minInds.append(d) - elif distance == minDist: - minCount += 1 - minInds.append(d) - bestSpotCodes.append([spotCodes[i] for i in range(len(spotCodes)) if i in minInds]) - bestBarcodes.append([barcodes[i] for i in range(len(barcodes)) if i in minInds]) - bestTargets.append([targets[i] for i in range(len(targets)) if i in minInds]) - bestDistances.append([distances[i] for i in range(len(distances)) if i in minInds]) + mins = np.argwhere(distances == min(distances)) + bestSpotCodes.append([spotCodes[m[0]] for m in mins]) + bestBarcodes.append([barcodes[m[0]] for m in mins]) + bestTargets.append([targets[m[0]] for m in mins]) + bestDistances.append([distances[m[0]] for m in mins]) # Create new columns with minimum distance barcode information roundData['best_spot_codes'] = bestSpotCodes roundData['best_barcodes'] = bestBarcodes @@ -565,8 +525,8 @@ def distanceFunc(subSpotCodes: list, spotCoords: dict) -> list: roundData['best_distances'] = bestDistances # Drop old columns - roundData = roundData.drop(['targets', 'decoded_barcodes', 'decoded_spot_codes', - 'omitted_round'], axis=1) + roundData = roundData.drop(['targets', 'decoded_barcodes', 'decoded_spot_codes'], + axis=1) # Only keep barcodes with only one minimum distance keep = [] @@ -581,7 +541,6 @@ def distanceFunc(subSpotCodes: list, spotCoords: dict) -> list: def cleanup(bestPerSpotTables: dict, spotCoords: dict, filterRounds: int) -> pd.DataFrame: - ''' Function that combines all "best" codes for each spot in each round into a single table, filters them by their frequency (with a user-defined threshold), chooses between overlapping @@ -655,15 +614,10 @@ def cleanup(bestPerSpotTables: dict, centers = [] roundsUsed = [] for i in range(len(finalCodes)): - coords = [] spotCode = finalCodes.iloc[i]['best_spot_codes'] - roundsUsed.append(roundNum - Counter(spotCode)[-1]) - for r in range(roundNum): - if spotCode[r] != -1: - z = spotCoords[r][spotCode[r]]['z'] - y = spotCoords[r][spotCode[r]]['y'] - x = spotCoords[r][spotCode[r]]['x'] - coords.append((x, y, z)) + counter = Counter(spotCode) # type: Counter + roundsUsed.append(roundNum - counter[-1]) + coords = np.asarray([spotCoords[j][spot] for j, spot in enumerate(spotCode) if spot != -1]) allCoords.append(coords) coords = np.asarray([coord for coord in coords]) 
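+        # ('center', computed next, is simply the mean of the barcode's component
+        # spot (x, y, z) coordinates)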
center = np.asarray(coords).mean(axis=0) @@ -674,43 +628,31 @@ def cleanup(bestPerSpotTables: dict, return finalCodes -def removeUsedSpots(finalCodes: pd.DataFrame, neighborDict: dict) -> dict: +def removeUsedSpots(finalCodes: pd.DataFrame, spotTables: dict) -> dict: ''' - Remove spots found to be in barcodes for the current round omission number so they are not used - for the next + Remove spots found to be in barcodes for the current round omission number from the spotTables + so they are not used for the next round omission number Parameters ---------- finalCodes : pd.DataFrame Dataframe containing final set of codes that have passed all filters - neighborDict : dict - Dictionary that contains all the neighbors for each spot in other rounds that are - within the search radius + spotTables : dict + Dictionary of original data tables extracted from SpotFindingResults objects by the + _merge_spots_by_round() function Returns ------- - dict : Modified version of neighborDict with spots that have been used in the current round + dict : Modified version of spotTables with spots that have been used in the current round omission removed ''' # Remove used spots - roundNum = len(neighborDict) - for r in range(roundNum): - usedSpots = list(set([passed[r] for passed in finalCodes['best_spot_codes'] - if passed[r] != -1])) - for spot in usedSpots: - for key in neighborDict[r][spot]: - for neighbor in neighborDict[r][spot][key]: - neighborDict[key][neighbor][r] = [i for i in neighborDict[key][neighbor][r] - if i != spot] - del neighborDict[r][spot] - - # Remove empty lists - for r in range(roundNum): - for spot in neighborDict[r]: - for key in [*neighborDict[r][spot]]: - if neighborDict[r][spot][key] == []: - del neighborDict[r][spot][key] + for r in range(len(spotTables)): + usedSpots = set([passed[r] for passed in finalCodes['best_spot_codes'] + if passed[r] != -1]) + spotTables[r] = spotTables[r].iloc[[i for i in range(len(spotTables[r])) if i + not in usedSpots]].reset_index(drop=True) - return neighborDict + return spotTables From 102d5546d8b1a03bbe0fcbf78d657c8403ddda32 Mon Sep 17 00:00:00 2001 From: nickeener Date: Tue, 19 Oct 2021 16:07:42 -0700 Subject: [PATCH 09/30] added ray to requirements --- REQUIREMENTS.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/REQUIREMENTS.txt b/REQUIREMENTS.txt index b1c700a84..66c517fc2 100644 --- a/REQUIREMENTS.txt +++ b/REQUIREMENTS.txt @@ -10,6 +10,7 @@ jsonschema matplotlib numpy != 1.13.0, >= 1.20.0 pandas >= 0.23.4 +ray==1.7.0 read_roi regional semantic_version From 900c6e2be71b23842ea7130306e2e72669be0c01 Mon Sep 17 00:00:00 2001 From: nickeener Date: Wed, 27 Oct 2021 20:48:57 -0700 Subject: [PATCH 10/30] More speed and memory improvements --- .../spots/DecodeSpots/check_all_decoder.py | 27 +-- .../core/spots/DecodeSpots/check_all_funcs.py | 212 ++++++++++-------- 2 files changed, 130 insertions(+), 109 deletions(-) diff --git a/starfish/core/spots/DecodeSpots/check_all_decoder.py b/starfish/core/spots/DecodeSpots/check_all_decoder.py index 5bfded221..905d6905e 100644 --- a/starfish/core/spots/DecodeSpots/check_all_decoder.py +++ b/starfish/core/spots/DecodeSpots/check_all_decoder.py @@ -36,14 +36,11 @@ class CheckAll(DecodeSpotsAlgorithm): the sum of variances for each of the spatial coordinates of the spots that make up each barcode and choosing the minimum distance barcode (if there is a tie, they are all dropped as ambiguous). Each spot is assigned a "best" barcode in this way. - 5. 
Only keep barcodes/targets that were found as "best" in a certain number of the rounds - (determined by filter_rounds parameter) - 6. If a specific spot is used in more than one of the remaining barcodes, the barcode with the - higher spatial variance between it's spots is dropped (ensures each spot is only used once) + 5. Only keep barcodes/targets that were found as "best" in each of the rounds they have spots in (End here if number of error_rounds = 0) - 7. Remove all spots used in decoded targets that passed the previous filtering steps from the + 6. Remove all spots used in decoded targets that passed the previous filtering steps from the original set of spots - 8. Rerun steps 2-5 for barcodes that use less than the full set of rounds for codebook + 7. Rerun steps 2-5 for barcodes that use less than the full set of rounds for codebook matching (how many rounds can be dropped determined by error_rounds parameter) Parameters @@ -52,9 +49,6 @@ class CheckAll(DecodeSpotsAlgorithm): Contains codes to decode IntensityTable search_radius : float Number of pixels over which to search for spots in other rounds and channels. - filterRounds : int - Number of rounds that a barcode must be identified in to pass filters (higher = more - stringent filtering), default = #rounds - 1 or #rounds - error_rounds if error_rounds > 0 error_rounds : int Maximum hamming distance a barcode can be from it's target in the codebook and still be uniquely identified (i.e. number of error correction rounds in each the experiment) @@ -63,11 +57,9 @@ class CheckAll(DecodeSpotsAlgorithm): def __init__( self, codebook: Codebook, - filter_rounds: int=None, search_radius: float=3, error_rounds: int=0): self.codebook = codebook - self.filterRounds = filter_rounds self.searchRadius = search_radius self.errorRounds = error_rounds @@ -111,14 +103,9 @@ def run(self, # containing information on the spots found in that round spotTables = _merge_spots_by_round(spots) - # If user did not specify the filterRounds variable (it will have default value None), - # change it to either one less than the number of rounds if errorRounds is 0 or the - # number of rounds minus the errorRounds if errorRounds > 0 - if self.filterRounds is None: - if self.errorRounds == 0: - self.filterRounds = len(spotTables) - 1 - else: - self.filterRounds = len(spotTables) - self.errorRounds + # Add one to channels labels (prevents collisions between hashes of barcodes later) + for r in spots.round_labels: + spotTables[r]['c'] += 1 # Set list of round omission numbers to loop through roundOmits = range(self.errorRounds + 1) @@ -158,7 +145,7 @@ def run(self, # Turn spot table dictionary into single table, filter barcodes by round frequency, add # additional information, and choose between barcodes that have overlapping spots - finalCodes = cleanup(decodedTables, spotCoords, self.filterRounds) + finalCodes = cleanup(decodedTables, spotCoords, channelDict, currentRoundOmitNum) # If this is not the last round omission number to run, remove spots that have just # been found to be in passing barcodes from spotTables so they are not used for the diff --git a/starfish/core/spots/DecodeSpots/check_all_funcs.py b/starfish/core/spots/DecodeSpots/check_all_funcs.py index c6eec5f43..077cc5e69 100644 --- a/starfish/core/spots/DecodeSpots/check_all_funcs.py +++ b/starfish/core/spots/DecodeSpots/check_all_funcs.py @@ -1,7 +1,8 @@ import warnings from collections import Counter, defaultdict from copy import deepcopy -from itertools import chain, permutations, product +from 
itertools import chain, islice, permutations, product
+from functools import partial
 
 import numpy as np
 import pandas as pd
 from scipy.spatial import cKDTree
 
 from starfish.core.codebook.codebook import Codebook
 from starfish.types import Axes
 
 warnings.filterwarnings('ignore')
 
 def createRefDicts(spotTables: dict, searchRadius: float) -> tuple:
     '''
     Creates reference dictionaries that map each spot's round and ID to their
-    neighbors, channel label, and spatial coordinates.
+    neighbors, channel label, and spatial coordinates. Spot IDs correspond to their 1-based index
+    location in the spotTables dataframes.
 
     Parameters
     ----------
     spotTables : dict
         Dictionary with round labels as keys and pandas dataframes containing spot information
         for its key round as values (result of _merge_spots_by_round function)
+
     searchRadius : float
         Distance that spots can be from each other and still form a barcode
 
@@ -40,7 +43,9 @@ def createRefDicts(spotTables: dict, searchRadius: float) -> tuple:
     channelDict = {}
     spotCoords = {}
     for r in [*spotTables]:
+        spotTables[r].index += 1
         channelDict[r] = spotTables[r]['c'].to_dict()
+        channelDict[r][0] = 0
         spotCoords[r] = spotTables[r][['z', 'y', 'x']].T.to_dict()
         for key in [*spotCoords[r]]:
             spotCoords[r][key] = tuple([item[1] for item in sorted(spotCoords[r][key].items(),
@@ -78,7 +83,8 @@ def findNeighbors(spotTables: dict, searchRadius: float) -> dict:
     # Create empty neighbor dictionary
     neighborDict = {}
     for r in spotTables:
-        neighborDict[r] = {i: defaultdict(list, {r: [i]}) for i in range(len(spotTables[r]))}
+        neighborDict[r] = {i: defaultdict(list, {r: [i]}) for i in
+                           range(1, len(spotTables[r]) + 1)}
 
     # For each pairing of rounds, find all mutual neighbors within the search radius for each spot
     # and assigns them in the neighborDict dictionary
@@ -91,11 +97,61 @@ def findNeighbors(spotTables: dict, searchRadius: float) -> dict:
             for j, neighbors in enumerate(allNeighbors):
                 if neighbors != []:
                     for neighbor in neighbors:
-                        neighborDict[r1][neighbor][r2].append(j)
-                        neighborDict[r2][j][r1].append(neighbor)
+                        neighborDict[r1][neighbor + 1][r2].append(j + 1)
+                        neighborDict[r2][j + 1][r1].append(neighbor + 1)
 
     return neighborDict
 
+def encodeSpots(spotCodes: list) -> list:
+    '''
+    For compressing spot ID codes into single integers. Saves memory. The number of digits in
+    each ID is counted and these integer lengths are concatenated into a string in the same
+    order as the IDs they correspond to. The IDs themselves are then converted to strings and
+    concatenated to this, also maintaining order.
+
+    Parameters
+    ----------
+    spotCodes : list
+        List of spot codes (each a tuple of integers with length equal to the number of rounds)
+
+
+    Returns
+    -------
+    list: List of compressed spot codes, one int per code
+    '''
+
+    strs = [list(map(str, code)) for code in spotCodes]
+    compressed = [int(''.join(map(str, map(len, intStr))) + ''.join(intStr)) for intStr in strs]
+
+    return compressed
+
+def decodeSpots(compressed: list, roundNum: int) -> list:
+    '''
+    Reconverts compressed spot codes back into their roundNum length tuples of integers with
+    the same order and IDs as their original source. The first roundNum values in the compressed
+    code will each correspond to the string length of each spot ID integer (as long as no round
+    has 10 billion or more spots). These are used to determine how to split the rest of the string
+    to retrieve the original values in the correct order. 
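+
+    A small sketch of the round-trip (hypothetical spot IDs, three rounds)::
+
+        encodeSpots([(12, 5, 103)])    # -> [213125103]
+        decodeSpots([213125103], 3)    # -> [(12, 5, 103)]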
+ + Parameters + ---------- + compressed : list + List of integer values corresponding to compressed spot codes + + roundNum : int + The number of rounds in the experiment + + Returns + ------- + list: List of recovered spot codes in their original tuple form + + ''' + strs = [str(intStr) for intStr in compressed] + idxs, nums = list(zip(*[(map(int, s[:roundNum]), [iter(s[roundNum:])] * roundNum) + for s in strs])) + decompressed = [tuple(int(''.join(islice(n, i))) for i, n in zip(idxs[j], nums[j])) + for j in range(len(idxs))] + return decompressed def buildBarcodes(roundData: pd.DataFrame, neighborDict: dict, @@ -175,6 +231,7 @@ def barcodeBuildFunc(data: pd.DataFrame, # spotCodes are the ordered spot IDs of the spots making up each barcode while barcodes are # the corresponding channel labels, need spotCodes so each barcode can have a unique # identifier + # A 0 value in a barcode/spot code corresponds to a dropped round allSpotCodes = [] allBarcodes = [] allNeighbors = list(data['neighbors'])[rang[0]: rang[1]] @@ -182,29 +239,29 @@ def barcodeBuildFunc(data: pd.DataFrame, neighbors = deepcopy(allNeighbors[i]) neighborLists = [] for rnd in range(roundNum): - # Adds a -1 to each round of the neighbors dictionary (allows barcodes with dropped + # Adds a 0 to each round of the neighbors dictionary (allows barcodes with dropped # rounds to be created) if roundOmitNum > 0: - neighbors[rnd].append(-1) + neighbors[rnd].append(0) neighborLists.append(neighbors[rnd]) # Creates all possible spot code combinations from neighbors codes = list(product(*neighborLists)) # Only save the ones with the correct number of dropped rounds - counters = [Counter(code) for code in codes] # type: list - spotCodes = [code for j, code in enumerate(codes) if counters[j][-1] == roundOmitNum] + counters = [Counter(code) for code in codes] + spotCodes = [code for j, code in enumerate(codes) if counters[j][0] == roundOmitNum] + # Only save those that don't have a dropped round in the current round + spotCodes = [code for code in spotCodes if code[currentRound] != 0] # Create barcodes from spot codes using the mapping from spot ID to channel barcodes = [] for spotCode in spotCodes: - barcode = [] - for spotInd in range(len(spotCode)): - if spotCode[spotInd] == -1: - barcode.append(-1) - else: - barcode.append(channelDict[spotInd][spotCode[spotInd]]) - barcodes.append(tuple(barcode)) + barcode = [channelDict[spotInd][spotCode[spotInd]] for spotInd + in range(len(spotCode))] + # Barcodes are hashed to save memory + barcodes.append(hash(tuple(barcode))) allBarcodes.append(barcodes) - allSpotCodes.append(spotCodes) + # Spot codes are compressed to save memory + allSpotCodes.append(encodeSpots(spotCodes)) return (allSpotCodes, allBarcodes) @@ -216,7 +273,7 @@ def barcodeBuildFunc(data: pd.DataFrame, if len(neighborDict[currentRound][key]) >= roundNum - roundOmitNum: passingSpots[key] = neighborDict[currentRound][key] passed = list(passingSpots.keys()) - roundData = roundData.iloc[passed] + roundData = roundData.iloc[np.asarray(passed) - 1] roundData['neighbors'] = [passingSpots[i] for i in roundData.index] roundData = roundData.reset_index(drop=True) @@ -238,6 +295,9 @@ def barcodeBuildFunc(data: pd.DataFrame, for i in range(len(ranges[:-1]))] rayResults = ray.get(results) + # Drop neighbors column (saves memory) + roundData = roundData.drop(['neighbors'], axis=1) + # Add possible barcodes and spot codes (same order) to spot table (must chain results from # different jobs together) roundData['spot_codes'] = 
list(chain(*[job[0] for job in rayResults])) @@ -305,7 +365,6 @@ def generateRoundPermutations(size: int, roundOmitNum: int) -> list: @ray.remote def decodeFunc(data: pd.DataFrame, - roundPermutations: list, permutationCodes: dict, rnd: int) -> tuple: ''' @@ -316,10 +375,6 @@ def decodeFunc(data: pd.DataFrame, data : pd.DataFrame Spot table for the current round - roundPermutations : list - List of logicals from generateRoundPermutations that details the rounds to use in - decoding - permutationCodes : dict Dictionary containing barcode information for each roundPermutation @@ -336,46 +391,44 @@ def decodeFunc(data: pd.DataFrame, # Goes through all possible decodings of each spot (ensures each spot is only looked up # once) allTargets = [] - allDecodedBarcodes = [] allDecodedSpotCodes = [] allBarcodes = list(data['barcodes']) allSpotCodes = list(data['spot_codes']) for i in range(len(allBarcodes)): targets = [] - decodedBarcodes = [] decodedSpotCodes = [] for j, barcode in enumerate(allBarcodes[i]): - if barcode[rnd] != -1: + try: # Try to assign target by using barcode as key in permutationsCodes dictionary # for current set of rounds. If there is no barcode match, it will error and go - # to the except and if it succeeds it will add the data to the other lists for - # this barcode - try: - targets.append(permutationCodes[tuple(barcode)]) - decodedBarcodes.append(barcode) - decodedSpotCodes.append(list(allSpotCodes[i][j])) - except Exception: - pass + # to the except and if it succeeds it will add the corresponding spot code to + # the decodedSpotCodes list + targets.append(permutationCodes[barcode]) + decodedSpotCodes.append(allSpotCodes[i][j]) + except Exception: + pass allTargets.append(targets) - allDecodedBarcodes.append(decodedBarcodes) allDecodedSpotCodes.append(decodedSpotCodes) - return (allTargets, allDecodedBarcodes, allDecodedSpotCodes) + return (allTargets, allDecodedSpotCodes) # Create list of logical arrays corresponding to the round sets being used to decode roundPermutations = generateRoundPermutations(codebook.sizes[Axes.ROUND], roundOmitNum) - # Create dictionary where the keys are all the possible barocodes (where dropped rounds - # are set to -1) for the current roundOmitNum. Provides fast mapping from barcode to - # target mRNA and having all the different dropped rounds together eliminates the need - # to loop through them. 
+ # Create dictionary where the keys are the different round sets that can be used for decoding + # and the values are the modified codebooks corresponding to the rounds used permCodeDict = {} for currentRounds in roundPermutations: - codes = codebook.data.argmax(axis=2) + codes = codebook.argmax(Axes.CH.value) if roundOmitNum > 0: omittedRounds = np.argwhere(~np.asarray(currentRounds)) - codes[:, omittedRounds] = -1 - roundDict = dict(zip([tuple(code) for code in codes], codebook['target'].data)) + # Makes entire column that is being omitted -1, which become 0 after 1 is added + # so they match up with the barcodes made earlier + codes.data[:, omittedRounds] = -1 + # Makes codes 1-based which prevents collisions when hashing + codes.data += 1 + # Barcodes are hashed as before + roundDict = dict(zip([hash(tuple(code)) for code in codes.data], codes['target'].data)) permCodeDict.update(roundDict) # Put data table and permutations codes dictionary in ray storage @@ -390,21 +443,24 @@ def decodeFunc(data: pd.DataFrame, chunkedData.append(deepcopy(roundData[ranges[i]:ranges[i + 1]])) # Run in parallel - results = [decodeFunc.remote(chunkedData[i], roundPermutations, permutationCodesID, - currentRound) for i in range(len(ranges[:-1]))] + results = [decodeFunc.remote(chunkedData[i], permutationCodesID, currentRound) + for i in range(len(ranges[:-1]))] rayResults = ray.get(results) # Update table roundData['targets'] = list(chain(*[job[0] for job in rayResults])) - roundData['decoded_barcodes'] = list(chain(*[job[1] for job in rayResults])) - roundData['decoded_spot_codes'] = list(chain(*[job[2] for job in rayResults])) + roundData['decoded_spot_codes'] = list(chain(*[job[1] for job in rayResults])) # Drop barcodes and spot_codes column (saves memory) - roundData = roundData.drop(['neighbors', 'spot_codes', 'barcodes'], axis=1) + roundData = roundData.drop(['spot_codes', 'barcodes'], axis=1) # Remove rows that have no decoded barcodes roundData = roundData[roundData['targets'].astype(bool)].reset_index(drop=True) + # Convert spot codes back to tuples + roundData['decoded_spot_codes'] = list(map(partial(decodeSpots, roundNum=len(codebook.r)), + roundData['decoded_spot_codes'])) + return roundData def distanceFilter(roundData: pd.DataFrame, @@ -464,7 +520,7 @@ def distanceFunc(subSpotCodes: list, spotCoords: dict) -> list: distances = [] for s, spotCode in enumerate(spotCodes): coords = np.asarray([spotCoords[j][spot] for j, spot in enumerate(spotCode) - if spot != -1]) + if spot != 0]) # Distance is calculate as the sum of variances of the coordinates along each axis distances.append(sum(np.var(coords, axis=0))) allDistances.append(distances) @@ -493,54 +549,45 @@ def distanceFunc(subSpotCodes: list, spotCoords: dict) -> list: # Pick minimum distance barcode(s) for each spot bestSpotCodes = [] - bestBarcodes = [] bestTargets = [] bestDistances = [] dataSpotCodes = list(roundData['decoded_spot_codes']) - dataBarcodes = list(roundData['decoded_barcodes']) dataDistances = list(roundData['distance']) dataTargets = list(roundData['targets']) for i in range(len(roundData)): spotCodes = dataSpotCodes[i] - barcodes = dataBarcodes[i] distances = dataDistances[i] targets = dataTargets[i] # If only one barcode to choose from, that one is picked as best if len(distances) == 1: bestSpotCodes.append(spotCodes) - bestBarcodes.append(barcodes) bestTargets.append(targets) bestDistances.append(distances) # Otherwise find the minimum(s) else: mins = np.argwhere(distances == min(distances)) 
bestSpotCodes.append([spotCodes[m[0]] for m in mins]) - bestBarcodes.append([barcodes[m[0]] for m in mins]) bestTargets.append([targets[m[0]] for m in mins]) bestDistances.append([distances[m[0]] for m in mins]) # Create new columns with minimum distance barcode information roundData['best_spot_codes'] = bestSpotCodes - roundData['best_barcodes'] = bestBarcodes roundData['best_targets'] = bestTargets roundData['best_distances'] = bestDistances # Drop old columns - roundData = roundData.drop(['targets', 'decoded_barcodes', 'decoded_spot_codes'], - axis=1) + roundData = roundData.drop(['targets', 'decoded_spot_codes'], axis=1) # Only keep barcodes with only one minimum distance - keep = [] - barcodes = roundData['best_barcodes'] - for i in range(len(roundData)): - if len(barcodes[i]) == 1: - keep.append(i) + targets = roundData['best_targets'] + keep = [i for i in range(len(roundData)) if len(targets[i]) == 1] roundData = roundData.iloc[keep] return roundData def cleanup(bestPerSpotTables: dict, spotCoords: dict, - filterRounds: int) -> pd.DataFrame: + channelDict: dict, + roundOmitNum: int) -> pd.DataFrame: ''' Function that combines all "best" codes for each spot in each round into a single table, filters them by their frequency (with a user-defined threshold), chooses between overlapping @@ -556,6 +603,9 @@ def cleanup(bestPerSpotTables: dict, spotCoords : dict Dictionary containing spatial locations of spots + channelDict : dict + Dictionary with mapping between spot IDs and the channel labels + filterRounds : int Number of rounds that a barcode must be identified in to pass filters (higher = more stringent filtering), default = 1 - #rounds or 1 - roundOmitNum if roundOmitNum > 0 @@ -570,58 +620,42 @@ def cleanup(bestPerSpotTables: dict, mergedCodes = pd.DataFrame() roundNum = len(bestPerSpotTables) for r in range(roundNum): - barcodes = bestPerSpotTables[r]['best_barcodes'] spotCodes = bestPerSpotTables[r]['best_spot_codes'] targets = bestPerSpotTables[r]['best_targets'] distances = bestPerSpotTables[r]['best_distances'] # Turn each barcode and spot code into a tuple so they can be used as dictionary keys - bestPerSpotTables[r]['best_barcodes'] = [tuple(barcode[0]) for barcode in barcodes] bestPerSpotTables[r]['best_spot_codes'] = [tuple(spotCode[0]) for spotCode in spotCodes] bestPerSpotTables[r]['best_targets'] = [target[0] for target in targets] bestPerSpotTables[r]['best_distances'] = [distance[0] for distance in distances] mergedCodes = mergedCodes.append(bestPerSpotTables[r]) mergedCodes = mergedCodes.reset_index(drop=True) - # Only use codes that were found in >= filterRounds rounds + # Only use codes that were found as best for each of its spots spotCodes = mergedCodes['best_spot_codes'] counts = defaultdict(int) # type: dict for code in spotCodes: counts[code] += 1 - passing = list(set(code for code in counts if counts[code] >= filterRounds)) + passing = list(set(code for code in counts if counts[code] == len(spotCoords) - roundOmitNum)) finalCodes = mergedCodes[mergedCodes['best_spot_codes'].isin(passing)].reset_index(drop=True) finalCodes = finalCodes.iloc[finalCodes['best_spot_codes'].drop_duplicates().index] finalCodes = finalCodes.reset_index(drop=True) - # Choose between overlapping spot codes based on which has the smaller spatial variance - for r in range(roundNum): - roundSpots = [code[r] for code in finalCodes['best_spot_codes'] if code[r] != -1] - dupSpots = set([spot for spot in roundSpots if Counter(roundSpots)[spot] > 1]) - drop = [] - for spot in dupSpots: - locs 
= np.where(np.asarray(roundSpots) == spot)[0] - distances = [finalCodes.loc[loc, 'best_distances'] for loc in locs] - minInd = np.where(distances == min(distances))[0] - if len(minInd) > 1: - drop.extend([ind for ind in minInd]) - else: - drop.extend([locs[i] for i in range(len(locs)) if i != minInd]) - finalCodes = finalCodes.iloc[[i for i in range(len(finalCodes)) if i not in drop]] - finalCodes = finalCodes.reset_index(drop=True) - - # Add spot coordinates, barcode center coordinates, and number of rounds used for each barcode - # to table + # Add barcode lables, spot coordinates, barcode center coordinates, and number of rounds used + # for each barcode to table + barcodes = [] allCoords = [] centers = [] roundsUsed = [] for i in range(len(finalCodes)): spotCode = finalCodes.iloc[i]['best_spot_codes'] - counter = Counter(spotCode) # type: Counter - roundsUsed.append(roundNum - counter[-1]) - coords = np.asarray([spotCoords[j][spot] for j, spot in enumerate(spotCode) if spot != -1]) + barcodes.append([channelDict[j][spot] for j, spot in enumerate(spotCode)]) + roundsUsed.append(roundNum - Counter(spotCode)[0]) + coords = np.asarray([spotCoords[j][spot] for j, spot in enumerate(spotCode) if spot != 0]) allCoords.append(coords) coords = np.asarray([coord for coord in coords]) center = np.asarray(coords).mean(axis=0) centers.append(center) + finalCodes['best_barcodes'] = barcodes finalCodes['coords'] = allCoords finalCodes['center'] = centers finalCodes['rounds_used'] = roundsUsed @@ -651,7 +685,7 @@ def removeUsedSpots(finalCodes: pd.DataFrame, spotTables: dict) -> dict: # Remove used spots for r in range(len(spotTables)): usedSpots = set([passed[r] for passed in finalCodes['best_spot_codes'] - if passed[r] != -1]) + if passed[r] != 0]) spotTables[r] = spotTables[r].iloc[[i for i in range(len(spotTables[r])) if i not in usedSpots]].reset_index(drop=True) From fe9733e710fc3284284fc2ce105be2e20d104e74 Mon Sep 17 00:00:00 2001 From: nickeener Date: Wed, 27 Oct 2021 23:01:07 -0700 Subject: [PATCH 11/30] Replaced ray w/ multiprocessing --- REQUIREMENTS.txt | 1 - .../spots/DecodeSpots/check_all_decoder.py | 7 - .../core/spots/DecodeSpots/check_all_funcs.py | 340 ++++++++---------- 3 files changed, 159 insertions(+), 189 deletions(-) diff --git a/REQUIREMENTS.txt b/REQUIREMENTS.txt index 66c517fc2..b1c700a84 100644 --- a/REQUIREMENTS.txt +++ b/REQUIREMENTS.txt @@ -10,7 +10,6 @@ jsonschema matplotlib numpy != 1.13.0, >= 1.20.0 pandas >= 0.23.4 -ray==1.7.0 read_roi regional semantic_version diff --git a/starfish/core/spots/DecodeSpots/check_all_decoder.py b/starfish/core/spots/DecodeSpots/check_all_decoder.py index 905d6905e..56dda91e7 100644 --- a/starfish/core/spots/DecodeSpots/check_all_decoder.py +++ b/starfish/core/spots/DecodeSpots/check_all_decoder.py @@ -3,7 +3,6 @@ import numpy as np import pandas as pd -import ray from starfish.core.codebook.codebook import Codebook from starfish.core.intensity_table.decoded_intensity_table import DecodedIntensityTable @@ -96,9 +95,6 @@ def run(self, # increased to ensure this doesn't happen self.searchRadius += 0.001 - # Initialize ray for multi_processing - ray.init(num_cpus=numJobs) - # Create dictionary where keys are round labels and the values are pandas dataframes # containing information on the spots found in that round spotTables = _merge_spots_by_round(spots) @@ -156,9 +152,6 @@ def run(self, # Append found codes to allCodes table allCodes = allCodes.append(finalCodes).reset_index(drop=True) - # Shutdown ray - ray.shutdown() - # Create and fill 
in intensity table channels = spots.ch_labels rounds = spots.round_labels diff --git a/starfish/core/spots/DecodeSpots/check_all_funcs.py b/starfish/core/spots/DecodeSpots/check_all_funcs.py index 077cc5e69..630be57b4 100644 --- a/starfish/core/spots/DecodeSpots/check_all_funcs.py +++ b/starfish/core/spots/DecodeSpots/check_all_funcs.py @@ -3,10 +3,10 @@ from copy import deepcopy from itertools import chain, islice, permutations, product from functools import partial +from multiprocessing import Pool import numpy as np import pandas as pd -import ray from scipy.spatial import cKDTree from starfish.core.codebook.codebook import Codebook @@ -153,6 +153,71 @@ def decodeSpots(compressed: list, roundNum: int) -> list: for j in range(len(idxs))] return decompressed +def barcodeBuildFunc(allNeighbors: list, + channelDict: dict, + roundOmitNum: int, + currentRound: int, + roundNum: int) -> tuple: + ''' + Subfunction to buildBarcodes that allows it to run in parallel chunks + + Parameters + ---------- + allNeighbors : list + List of neighbor from which to build barcodes from + + channelDict : dict + Dictionary mapping spot IDs to their channels labels + + rang : tuple + Range of indices to build barcodes for in the current data object + + roundOmitNum : int + Maximum hamming distance a barcode can be from it's target in the codebook and + still be uniquely identified (i.e. number of error correction rounds in each + the experiment) + + roundNum : int + Current round + + Returns + ------- + tuple : First element is a list of the possible spot codes while the second element is + a list of the possible barcodes + ''' + + # Build barcodes from neighbors + # spotCodes are the ordered spot IDs of the spots making up each barcode while barcodes are + # the corresponding channel labels, need spotCodes so each barcode can have a unique + # identifier + allSpotCodes = [] + allBarcodes = [] + for i in range(len(allNeighbors)): + neighbors = deepcopy(allNeighbors[i]) + neighborLists = [] + for rnd in range(roundNum): + # Adds a 0 to each round of the neighbors dictionary (allows barcodes with dropped + # rounds to be created) + if roundOmitNum > 0: + neighbors[rnd].append(0) + neighborLists.append(neighbors[rnd]) + # Creates all possible spot code combinations from neighbors + codes = list(product(*neighborLists)) + # Only save the ones with the correct number of dropped rounds + counters = [Counter(code) for code in codes] + spotCodes = [code for j, code in enumerate(codes) if counters[j][0] == roundOmitNum] + spotCodes = [code for code in spotCodes if code[currentRound] != 0] + # Create barcodes from spot codes using the mapping from spot ID to channel + barcodes = [] + for spotCode in spotCodes: + barcode = [channelDict[spotInd][spotCode[spotInd]] for spotInd in range(len(spotCode))] + barcodes.append(hash(tuple(barcode))) + + allBarcodes.append(barcodes) + allSpotCodes.append(encodeSpots(spotCodes)) + + return (allSpotCodes, allBarcodes) + def buildBarcodes(roundData: pd.DataFrame, neighborDict: dict, roundOmitNum: int, @@ -193,78 +258,6 @@ def buildBarcodes(roundData: pd.DataFrame, ''' - @ray.remote - def barcodeBuildFunc(data: pd.DataFrame, - channelDict: dict, - rang: tuple, - roundOmitNum: int, - roundNum: int) -> tuple: - ''' - Subfunction to buildBarcodes that allows it to run in parallel chunks using ray - - Parameters - ---------- - data : pd.DataFrame - Spot table for the current round - - channelDict : dict - Dictionary mapping spot IDs to their channels labels - - rang : tuple - Range of indices 
to build barcodes for in the current data object - - roundOmitNum : int - Maximum hamming distance a barcode can be from it's target in the codebook and - still be uniquely identified (i.e. number of error correction rounds in each the - experiment) - - roundNum : int - Current round - - Returns - ------- - tuple : First element is a list of the possible spot codes while the second element is - a list of the possible barcodes - ''' - - # Build barcodes from neighbors - # spotCodes are the ordered spot IDs of the spots making up each barcode while barcodes are - # the corresponding channel labels, need spotCodes so each barcode can have a unique - # identifier - # A 0 value in a barcode/spot code corresponds to a dropped round - allSpotCodes = [] - allBarcodes = [] - allNeighbors = list(data['neighbors'])[rang[0]: rang[1]] - for i in range(len(allNeighbors)): - neighbors = deepcopy(allNeighbors[i]) - neighborLists = [] - for rnd in range(roundNum): - # Adds a 0 to each round of the neighbors dictionary (allows barcodes with dropped - # rounds to be created) - if roundOmitNum > 0: - neighbors[rnd].append(0) - neighborLists.append(neighbors[rnd]) - # Creates all possible spot code combinations from neighbors - codes = list(product(*neighborLists)) - # Only save the ones with the correct number of dropped rounds - counters = [Counter(code) for code in codes] - spotCodes = [code for j, code in enumerate(codes) if counters[j][0] == roundOmitNum] - # Only save those that don't have a dropped round in the current round - spotCodes = [code for code in spotCodes if code[currentRound] != 0] - # Create barcodes from spot codes using the mapping from spot ID to channel - barcodes = [] - for spotCode in spotCodes: - barcode = [channelDict[spotInd][spotCode[spotInd]] for spotInd - in range(len(spotCode))] - # Barcodes are hashed to save memory - barcodes.append(hash(tuple(barcode))) - - allBarcodes.append(barcodes) - # Spot codes are compressed to save memory - allSpotCodes.append(encodeSpots(spotCodes)) - - return (allSpotCodes, allBarcodes) - # Only keep spots that have enough neighbors to form a barcode (determined by the total number # of rounds and the number of rounds that can be omitted from each code) passingSpots = {} @@ -280,31 +273,71 @@ def barcodeBuildFunc(data: pd.DataFrame, # Find all possible barcodes for the spots in each round by splitting each round's spots into # numJob chunks and constructing each chunks barcodes in parallel - # Save the current round's data table and the channelDict to ray memory - dataID = ray.put(roundData) - channelDictID = ray.put(channelDict) - # Calculates index ranges to chunk data by ranges = [0] for i in range(1, numJobs + 1): ranges.append(int((len(roundData) / numJobs) * i)) + chunkedNeighbors = [] + for i in range(len(ranges[:-1])): + chunkedNeighbors.append(list(roundData['neighbors'][ranges[i]:ranges[i + 1]])) # Run in parallel - results = [barcodeBuildFunc.remote(dataID, channelDictID, (ranges[i], ranges[i + 1]), - roundOmitNum, roundNum) - for i in range(len(ranges[:-1]))] - rayResults = ray.get(results) + with Pool(processes=numJobs) as pool: + part = partial(barcodeBuildFunc, channelDict=channelDict, roundOmitNum=roundOmitNum, + roundNum=roundNum, currentRound=currentRound) + results = pool.map(part, [chunkedNeighbors[i] for i in range(len(ranges[:-1]))]) # Drop neighbors column (saves memory) roundData = roundData.drop(['neighbors'], axis=1) # Add possible barcodes and spot codes (same order) to spot table (must chain results from # different jobs 
together) - roundData['spot_codes'] = list(chain(*[job[0] for job in rayResults])) - roundData['barcodes'] = list(chain(*[job[1] for job in rayResults])) + roundData['spot_codes'] = list(chain(*[job[0] for job in results])) + roundData['barcodes'] = list(chain(*[job[1] for job in results])) return roundData +def decodeFunc(codes: pd.DataFrame, permutationCodes: dict) -> tuple: + ''' + Subfunction for decoder that allows it to run in parallel chunks using ray + + Parameters + ---------- + codes : pd.DataFrame + Two column with columns called 'barcodes' and 'spot_codes' + + permutationCodes : dict + Dictionary containing barcode information for each roundPermutation + + Returns + ------- + tuple : First element is a list of all decoded targets, second element is a list of all + decoded barcodes,third element is a list of all decoded spot codes, and the + fourth element is a list of rounds that were omitted for each decoded barcode + ''' + + # Goes through all possible decodings of each spot (ensures each spot is only looked up once) + allTargets = [] + allDecodedSpotCodes = [] + allBarcodes = list(codes['barcodes']) + allSpotCodes = list(codes['spot_codes']) + for i in range(len(allBarcodes)): + targets = [] + decodedSpotCodes = [] + for j, barcode in enumerate(allBarcodes[i]): + try: + # Try to assign target by using barcode as key in permutationsCodes dictionary for + # current set of rounds. If there is no barcode match, it will error and go to the + # except and if it succeeds it will add the data to the other lists for this barcode + targets.append(permutationCodes[barcode]) + decodedSpotCodes.append(allSpotCodes[i][j]) + except Exception: + pass + allTargets.append(targets) + allDecodedSpotCodes.append(decodedSpotCodes) + + return (allTargets, allDecodedSpotCodes) + def decoder(roundData: pd.DataFrame, codebook: Codebook, roundOmitNum: int, @@ -363,55 +396,6 @@ def generateRoundPermutations(size: int, roundOmitNum: int) -> list: return sorted(set(list(permutations([*([False] * roundOmitNum), *([True] * (size - roundOmitNum))])))) - @ray.remote - def decodeFunc(data: pd.DataFrame, - permutationCodes: dict, - rnd: int) -> tuple: - ''' - Subfunction for decoder that allows it to run in parallel chunks using ray - - Parameters - ---------- - data : pd.DataFrame - Spot table for the current round - - permutationCodes : dict - Dictionary containing barcode information for each roundPermutation - - rnd : int - Current round being decoded - - Returns - ------- - tuple : First element is a list of all decoded targets, second element is a list of all - decoded barcodes,third element is a list of all decoded spot codes, and the - fourth element is a list of rounds that were omitted for each decoded barcode - ''' - - # Goes through all possible decodings of each spot (ensures each spot is only looked up - # once) - allTargets = [] - allDecodedSpotCodes = [] - allBarcodes = list(data['barcodes']) - allSpotCodes = list(data['spot_codes']) - for i in range(len(allBarcodes)): - targets = [] - decodedSpotCodes = [] - for j, barcode in enumerate(allBarcodes[i]): - try: - # Try to assign target by using barcode as key in permutationsCodes dictionary - # for current set of rounds. 
If there is no barcode match, it will error and go - # to the except and if it succeeds it will add the corresponding spot code to - # the decodedSpotCodes list - targets.append(permutationCodes[barcode]) - decodedSpotCodes.append(allSpotCodes[i][j]) - except Exception: - pass - allTargets.append(targets) - allDecodedSpotCodes.append(decodedSpotCodes) - - return (allTargets, allDecodedSpotCodes) - # Create list of logical arrays corresponding to the round sets being used to decode roundPermutations = generateRoundPermutations(codebook.sizes[Axes.ROUND], roundOmitNum) @@ -431,9 +415,6 @@ def decodeFunc(data: pd.DataFrame, roundDict = dict(zip([hash(tuple(code)) for code in codes.data], codes['target'].data)) permCodeDict.update(roundDict) - # Put data table and permutations codes dictionary in ray storage - permutationCodesID = ray.put(permCodeDict) - # Calculates index ranges to chunk data by and creates list of chunked data to loop through ranges = [0] for i in range(1, numJobs + 1): @@ -443,13 +424,14 @@ def decodeFunc(data: pd.DataFrame, chunkedData.append(deepcopy(roundData[ranges[i]:ranges[i + 1]])) # Run in parallel - results = [decodeFunc.remote(chunkedData[i], permutationCodesID, currentRound) - for i in range(len(ranges[:-1]))] - rayResults = ray.get(results) + with Pool(processes=numJobs) as pool: + part = partial(decodeFunc, permutationCodes=permCodeDict) + results = pool.map(part, [chunkedData[i][['barcodes', 'spot_codes']] + for i in range(len(chunkedData))]) # Update table - roundData['targets'] = list(chain(*[job[0] for job in rayResults])) - roundData['decoded_spot_codes'] = list(chain(*[job[1] for job in rayResults])) + roundData['targets'] = list(chain(*[job[0] for job in results])) + roundData['decoded_spot_codes'] = list(chain(*[job[1] for job in results])) # Drop barcodes and spot_codes column (saves memory) roundData = roundData.drop(['spot_codes', 'barcodes'], axis=1) @@ -463,6 +445,38 @@ def decodeFunc(data: pd.DataFrame, return roundData +def distanceFunc(subSpotCodes: list, spotCoords: dict) -> list: + ''' + Subfunction for distanceFilter to allow it to run in parallel using ray + + Parameters + ---------- + subSpotCodes : list + Chunk of full list of spot codes for the current round to calculate the spatial + variance for + + spotCoords : dict + Dictionary containing spatial locations for spots by their IDs in the original + spotTables object + + Returns + ------- + list: list of spatial variances for the current chunk of spot codes + + ''' + + # Calculate spatial variances for current chunk of spot codes + allDistances = [] + for spotCodes in subSpotCodes: + distances = [] + for s, spotCode in enumerate(spotCodes): + coords = np.asarray([spotCoords[j][spot] for j, spot in enumerate(spotCode) + if spot != 0]) + # Distance is calculate as the sum of variances of the coordinates along each axis + distances.append(sum(np.var(coords, axis=0))) + allDistances.append(distances) + return allDistances + def distanceFilter(roundData: pd.DataFrame, spotCoords: dict, currentRound: int, @@ -493,45 +507,9 @@ def distanceFilter(roundData: pd.DataFrame, found for each spot ''' - @ray.remote - def distanceFunc(subSpotCodes: list, spotCoords: dict) -> list: - ''' - Subfunction for distanceFilter to allow it to run in parallel using ray - - Parameters - ---------- - subSpotCodes : list - Chunk of full list of spot codes for the current round to calculate the spatial - variance for - - spotCoords : dict - Dictionary containing spatial locations for spots by their IDs in the 
original - spotTables object - - Returns - ------- - list: list of spatial variances for the current chunk of spot codes - - ''' - - # Calculate spatial variances for current chunk of spot codes - allDistances = [] - for spotCodes in subSpotCodes: - distances = [] - for s, spotCode in enumerate(spotCodes): - coords = np.asarray([spotCoords[j][spot] for j, spot in enumerate(spotCode) - if spot != 0]) - # Distance is calculate as the sum of variances of the coordinates along each axis - distances.append(sum(np.var(coords, axis=0))) - allDistances.append(distances) - return allDistances - # Calculate the spatial variance for each decodable barcode for each spot in each round allSpotCodes = roundData['decoded_spot_codes'] - # Put spotCoords dictionary into ray memory - spotCoordsID = ray.put(spotCoords) - # Calculates index ranges to chunk data by ranges = [0] for i in range(1, numJobs): @@ -539,13 +517,13 @@ def distanceFunc(subSpotCodes: list, spotCoords: dict) -> list: ranges.append(len(roundData)) chunkedSpotCodes = [allSpotCodes[ranges[i]:ranges[i + 1]] for i in range(len(ranges[:-1]))] - # Run in parallel using ray - results = [distanceFunc.remote(subSpotCodes, spotCoordsID) for subSpotCodes - in chunkedSpotCodes] - rayResults = ray.get(results) + # Run in parallel + with Pool(processes=numJobs) as pool: + part = partial(distanceFunc, spotCoords=spotCoords) + results = pool.map(part, [list(subSpotCodes) for subSpotCodes in chunkedSpotCodes]) # Add distances to decodedTables as new column - roundData['distance'] = list(chain(*[job for job in rayResults])) + roundData['distance'] = list(chain(*[job for job in results])) # Pick minimum distance barcode(s) for each spot bestSpotCodes = [] From 002d25dc898012ff20b0b32530a8f8167894f466 Mon Sep 17 00:00:00 2001 From: nickeener Date: Wed, 27 Oct 2021 23:02:58 -0700 Subject: [PATCH 12/30] Fix import order --- starfish/core/spots/DecodeSpots/check_all_funcs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/starfish/core/spots/DecodeSpots/check_all_funcs.py b/starfish/core/spots/DecodeSpots/check_all_funcs.py index 630be57b4..b2bf087ef 100644 --- a/starfish/core/spots/DecodeSpots/check_all_funcs.py +++ b/starfish/core/spots/DecodeSpots/check_all_funcs.py @@ -1,8 +1,8 @@ import warnings from collections import Counter, defaultdict from copy import deepcopy -from itertools import chain, islice, permutations, product from functools import partial +from itertools import chain, islice, permutations, product from multiprocessing import Pool import numpy as np From ef5cf5edc6a0d6e0c2e8c8e834450e3a72901b22 Mon Sep 17 00:00:00 2001 From: nickeener Date: Wed, 27 Oct 2021 23:37:13 -0700 Subject: [PATCH 13/30] few more fixes --- .../core/spots/DecodeSpots/check_all_funcs.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/starfish/core/spots/DecodeSpots/check_all_funcs.py b/starfish/core/spots/DecodeSpots/check_all_funcs.py index b2bf087ef..b09098761 100644 --- a/starfish/core/spots/DecodeSpots/check_all_funcs.py +++ b/starfish/core/spots/DecodeSpots/check_all_funcs.py @@ -1,3 +1,4 @@ +import typing import warnings from collections import Counter, defaultdict from copy import deepcopy @@ -5,6 +6,7 @@ from itertools import chain, islice, permutations, product from multiprocessing import Pool + import numpy as np import pandas as pd from scipy.spatial import cKDTree @@ -204,7 +206,7 @@ def barcodeBuildFunc(allNeighbors: list, # Creates all possible spot code combinations from neighbors codes = 
list(product(*neighborLists)) # Only save the ones with the correct number of dropped rounds - counters = [Counter(code) for code in codes] + counters = [Counter(code) for code in codes] # type: typing.List[Counter] spotCodes = [code for j, code in enumerate(codes) if counters[j][0] == roundOmitNum] spotCodes = [code for code in spotCodes if code[currentRound] != 0] # Create barcodes from spot codes using the mapping from spot ID to channel @@ -402,17 +404,18 @@ def generateRoundPermutations(size: int, roundOmitNum: int) -> list: # Create dictionary where the keys are the different round sets that can be used for decoding # and the values are the modified codebooks corresponding to the rounds used permCodeDict = {} + targets = codebook['target'].data for currentRounds in roundPermutations: - codes = codebook.argmax(Axes.CH.value) + codes = codebook.data.argmax(axis=2) if roundOmitNum > 0: omittedRounds = np.argwhere(~np.asarray(currentRounds)) # Makes entire column that is being omitted -1, which become 0 after 1 is added # so they match up with the barcodes made earlier - codes.data[:, omittedRounds] = -1 + codes[:, omittedRounds] = -1 # Makes codes 1-based which prevents collisions when hashing - codes.data += 1 + codes += 1 # Barcodes are hashed as before - roundDict = dict(zip([hash(tuple(code)) for code in codes.data], codes['target'].data)) + roundDict = dict(zip([hash(tuple(code)) for code in codes], targets)) permCodeDict.update(roundDict) # Calculates index ranges to chunk data by and creates list of chunked data to loop through @@ -627,7 +630,8 @@ def cleanup(bestPerSpotTables: dict, for i in range(len(finalCodes)): spotCode = finalCodes.iloc[i]['best_spot_codes'] barcodes.append([channelDict[j][spot] for j, spot in enumerate(spotCode)]) - roundsUsed.append(roundNum - Counter(spotCode)[0]) + counter = Counter(spotCode) # type: Counter + roundsUsed.append(roundNum - counter[0]) coords = np.asarray([spotCoords[j][spot] for j, spot in enumerate(spotCode) if spot != 0]) allCoords.append(coords) coords = np.asarray([coord for coord in coords]) From 68a3c5fc93751f10f6c50f1336a0c30d32872a57 Mon Sep 17 00:00:00 2001 From: nickeener Date: Thu, 28 Oct 2021 00:31:24 -0700 Subject: [PATCH 14/30] Changed test to account for randomness --- .../spots/DecodeSpots/test/test_check_all.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/starfish/core/spots/DecodeSpots/test/test_check_all.py b/starfish/core/spots/DecodeSpots/test/test_check_all.py index 36b267e2c..8d5ba0cb0 100644 --- a/starfish/core/spots/DecodeSpots/test/test_check_all.py +++ b/starfish/core/spots/DecodeSpots/test/test_check_all.py @@ -84,11 +84,11 @@ def testExactMatches(): codebook = seqfishCodebook(5, 3, 20) - img, trueTargets = syntheticSeqfish(100, 100, 20, codebook, 20, 0, False) + img, trueTargets = syntheticSeqfish(100, 100, 20, codebook, 5, 0, False) bd = BlobDetector(min_sigma=1, max_sigma=4, num_sigma=30, threshold=.1, exclude_border=False) spots = bd.run(image_stack=img) - assert spots.count_total_spots() == 5 * 20, 'Spot detector did not find all spots' + assert spots.count_total_spots() == 5 * 5, 'Spot detector did not find all spots' decoder = CheckAll(codebook=codebook, search_radius=1, error_rounds=0) hits = decoder.run(spots=spots, n_processes=4) @@ -113,11 +113,11 @@ def testJitteredMatches(): codebook = seqfishCodebook(5, 3, 20) - img, trueTargets = syntheticSeqfish(100, 100, 20, codebook, 20, 2, False) + img, trueTargets = syntheticSeqfish(100, 100, 20, codebook, 5, 2, False) 
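+    # (5 targets across 5 rounds gives the 5 * 5 = 25 spots asserted below, and the 2 px
+    # jitter stays within the search_radius=3 passed to CheckAll later in this test)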
bd = BlobDetector(min_sigma=1, max_sigma=4, num_sigma=30, threshold=.1, exclude_border=False) spots = bd.run(image_stack=img) - assert spots.count_total_spots() == 5 * 20, 'Spot detector did not find all spots' + assert spots.count_total_spots() == 5 * 5, 'Spot detector did not find all spots' decoder = CheckAll(codebook=codebook, search_radius=3, error_rounds=0) hits = decoder.run(spots=spots, n_processes=4) @@ -142,11 +142,11 @@ def testErrorCorrection(): codebook = seqfishCodebook(5, 3, 20) - img, trueTargets = syntheticSeqfish(100, 100, 20, codebook, 20, 0, True) + img, trueTargets = syntheticSeqfish(100, 100, 20, codebook, 5, 0, True) bd = BlobDetector(min_sigma=1, max_sigma=4, num_sigma=30, threshold=.1, exclude_border=False) spots = bd.run(image_stack=img) - assert spots.count_total_spots() == 4 * 20, 'Spot detector did not find all spots' + assert spots.count_total_spots() == 4 * 5, 'Spot detector did not find all spots' decoder = CheckAll(codebook=codebook, search_radius=1, error_rounds=1) hits = decoder.run(spots=spots, n_processes=4) @@ -166,3 +166,7 @@ def testErrorCorrection(): matches += 1 assert matches == len(trueTargets) + +testExactMatches() +testJitteredMatches() +testErrorCorrection() \ No newline at end of file From 3a10c185c8cae759e5917159496923c7698ddde5 Mon Sep 17 00:00:00 2001 From: nickeener Date: Thu, 28 Oct 2021 00:35:20 -0700 Subject: [PATCH 15/30] fix test --- starfish/core/spots/DecodeSpots/test/test_check_all.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/starfish/core/spots/DecodeSpots/test/test_check_all.py b/starfish/core/spots/DecodeSpots/test/test_check_all.py index 8d5ba0cb0..47c9dceb1 100644 --- a/starfish/core/spots/DecodeSpots/test/test_check_all.py +++ b/starfish/core/spots/DecodeSpots/test/test_check_all.py @@ -166,7 +166,3 @@ def testErrorCorrection(): matches += 1 assert matches == len(trueTargets) - -testExactMatches() -testJitteredMatches() -testErrorCorrection() \ No newline at end of file From 7f74fb9143661929b6614b6f46b3521faa24cdaf Mon Sep 17 00:00:00 2001 From: nickeener Date: Sat, 6 Nov 2021 20:25:08 -0700 Subject: [PATCH 16/30] Added new filter step --- .../spots/DecodeSpots/check_all_decoder.py | 38 ++++--- .../core/spots/DecodeSpots/check_all_funcs.py | 99 ++++++++++++++++--- 2 files changed, 106 insertions(+), 31 deletions(-) diff --git a/starfish/core/spots/DecodeSpots/check_all_decoder.py b/starfish/core/spots/DecodeSpots/check_all_decoder.py index 56dda91e7..1c52b7bea 100644 --- a/starfish/core/spots/DecodeSpots/check_all_decoder.py +++ b/starfish/core/spots/DecodeSpots/check_all_decoder.py @@ -16,7 +16,6 @@ removeUsedSpots from .util import _merge_spots_by_round - class CheckAll(DecodeSpotsAlgorithm): """ Decode spots by generating all possible combinations of spots to form barcodes given a radius @@ -35,11 +34,13 @@ class CheckAll(DecodeSpotsAlgorithm): the sum of variances for each of the spatial coordinates of the spots that make up each barcode and choosing the minimum distance barcode (if there is a tie, they are all dropped as ambiguous). Each spot is assigned a "best" barcode in this way. - 5. Only keep barcodes/targets that were found as "best" in each of the rounds they have spots in - (End here if number of error_rounds = 0) - 6. Remove all spots used in decoded targets that passed the previous filtering steps from the + 5. Only keep barcodes/targets that were found as "best" using at least 2 of the spots that make + each up + 6. 
Find maximum independent set (approximation) of the spot combinations so no two barcodes use
+    the same spot
+    7. Remove all spots used in decoded targets that passed the previous filtering steps from the
     original set of spots
-    7. Rerun steps 2-5 for barcodes that use less than the full set of rounds for codebook
+    8. Rerun steps 2-5 for barcodes that use less than the full set of rounds for codebook
        matching (how many rounds can be dropped determined by error_rounds parameter)
 
     Parameters
     ----------
@@ -120,6 +121,8 @@ def run(self,
         decodedTables = {}
         for r in range(len(spotTables)):
             roundData = deepcopy(spotTables[r])
+            roundData = roundData.drop(['intensity', 'z', 'y', 'x', 'radius', 'c'], axis=1)
+            roundData.index += 1
 
             # Create dictionary of dataframes (based on spotTables data) that contains
             # additional columns for each spot containing all the possible barcodes that
@@ -139,18 +142,23 @@ def run(self,
             # Assign to DecodedTables dictionary
             decodedTables[r] = roundData
 
-        # Turn spot table dictionary into single table, filter barcodes by round frequency, add
-        # additional information, and choose between barcodes that have overlapping spots
-        finalCodes = cleanup(decodedTables, spotCoords, channelDict, currentRoundOmitNum)
+        # Only do the following if barcodes were found
+        totalSpots = sum([len(decodedTables[table]) for table in decodedTables])
+        if totalSpots:
+
+            # Turn spot table dictionary into single table, filter barcodes by round frequency,
+            # add additional information, and choose between barcodes that have overlapping
+            # spots
+            finalCodes = cleanup(decodedTables, spotCoords, channelDict)
 
-        # If this is not the last round omission number to run, remove spots that have just
-        # been found to be in passing barcodes from spotTables so they are not used for the
-        # next round omission number
-        if currentRoundOmitNum != roundOmits[-1]:
-            spotTables = removeUsedSpots(finalCodes, spotTables)
+            # If this is not the last round omission number to run, remove spots that have just
+            # been found to be in passing barcodes from spotTables so they are not used for the
+            # next round omission number
+            if currentRoundOmitNum != roundOmits[-1]:
+                spotTables = removeUsedSpots(finalCodes, spotTables)
 
-        # Append found codes to allCodes table
-        allCodes = allCodes.append(finalCodes).reset_index(drop=True)
+            # Append found codes to allCodes table
+            allCodes = allCodes.append(finalCodes).reset_index(drop=True)
 
     # Create and fill in intensity table
     channels = spots.ch_labels
diff --git a/starfish/core/spots/DecodeSpots/check_all_funcs.py b/starfish/core/spots/DecodeSpots/check_all_funcs.py
index b09098761..1fd567b89 100644
--- a/starfish/core/spots/DecodeSpots/check_all_funcs.py
+++ b/starfish/core/spots/DecodeSpots/check_all_funcs.py
@@ -52,6 +51,7 @@ def createRefDicts(spotTables: dict, searchRadius: float) -> tuple:
     for key in [*spotCoords[r]]:
         spotCoords[r][key] = tuple([item[1] for item in sorted(spotCoords[r][key].items(),
                                                                key=lambda x: x[0])])
+        spotTables[r].index -= 1
 
     return neighborDict, channelDict, spotCoords
 
@@ -171,16 +171,16 @@ def barcodeBuildFunc(allNeighbors: list,
     channelDict : dict
         Dictionary mapping spot IDs to their channel labels
 
-    rang : tuple
-        Range of indices to build barcodes for in the current data object
-
     roundOmitNum : int
         Maximum hamming distance a barcode can be from its target in the codebook and
         still be uniquely identified (i.e. number of error correction rounds in the
still be uniquely identified (i.e. number of error correction rounds in each the experiment) + currentRound : int + The round that the spots being used for reference points are found in + roundNum : int - Current round + Total number of round in experiment Returns ------- @@ -262,6 +262,7 @@ def buildBarcodes(roundData: pd.DataFrame, # Only keep spots that have enough neighbors to form a barcode (determined by the total number # of rounds and the number of rounds that can be omitted from each code) + passingSpots = {} roundNum = len(neighborDict) for key in neighborDict[currentRound]: @@ -567,8 +568,7 @@ def distanceFilter(roundData: pd.DataFrame, def cleanup(bestPerSpotTables: dict, spotCoords: dict, - channelDict: dict, - roundOmitNum: int) -> pd.DataFrame: + channelDict: dict) -> pd.DataFrame: ''' Function that combines all "best" codes for each spot in each round into a single table, filters them by their frequency (with a user-defined threshold), chooses between overlapping @@ -587,10 +587,6 @@ def cleanup(bestPerSpotTables: dict, channelDict : dict Dictionary with mapping between spot IDs and the channel labels - filterRounds : int - Number of rounds that a barcode must be identified in to pass filters (higher = more - stringent filtering), default = 1 - #rounds or 1 - roundOmitNum if roundOmitNum > 0 - Returns ------- pd.DataFrame : Dataframe containing final set of codes that have passed all filters @@ -611,15 +607,86 @@ def cleanup(bestPerSpotTables: dict, mergedCodes = mergedCodes.append(bestPerSpotTables[r]) mergedCodes = mergedCodes.reset_index(drop=True) - # Only use codes that were found as best for each of its spots + # Only pass codes that are chosen as best for at least 2 of the spots that make it up spotCodes = mergedCodes['best_spot_codes'] counts = defaultdict(int) # type: dict for code in spotCodes: counts[code] += 1 - passing = list(set(code for code in counts if counts[code] == len(spotCoords) - roundOmitNum)) - finalCodes = mergedCodes[mergedCodes['best_spot_codes'].isin(passing)].reset_index(drop=True) - finalCodes = finalCodes.iloc[finalCodes['best_spot_codes'].drop_duplicates().index] - finalCodes = finalCodes.reset_index(drop=True) + passing = list(set(code for code in counts if counts[code] > 1)) + passingCodes = mergedCodes[mergedCodes['best_spot_codes'].isin(passing)].reset_index(drop=True) + passingCodes = passingCodes.iloc[passingCodes['best_spot_codes'].drop_duplicates().index] + passingCodes = passingCodes.reset_index(drop=True) + + # Need to find maximum independent set of spot codes where each spot code is a node and there + # is an edge connecting two codes if they share at least one spot. Does this by eliminating + # nodes (spot codes) that have the most edges first and if there is tie for which has the most + # edges they are ordered in order of decreasing spatial variance of the spots that make it up + # (so codes are eliminated in order first of how many other codes they share a spots with and + # then spatial variance is used to break ties). 
Nodes are eliminated from the graph in this way + # until there are no more edges in the graph + + # First prepare list of counters of the spot IDs for each round + spotCodes = passingCodes['best_spot_codes'] + codeArray = np.asarray([np.asarray(code) for code in spotCodes]) + counters = [] # type: typing.List[Counter] + for r in range(roundNum): + counters.append(Counter(codeArray[:, r])) + counters[-1][0] = 0 + + # Then create collisonCounter dictionary which has the number of edges for each code and the + # collisions dictionary which holds a list of codes each code has an overlap with. Any code with + # no overlaps is added to keep to save later + collisionCounter = defaultdict(int) # type: dict + collisions = defaultdict(list) + keep = [] + for i, spotCode in enumerate(spotCodes): + collision = False + for r in range(roundNum): + if spotCode[r] != 0: + count = counters[r][spotCode[r]] - 1 + if count > 0: + collision = True + collisionCounter[spotCode] += count + collisions[spotCode].extend([spotCodes[ind[0]] for ind in + np.argwhere(codeArray[:, r] == spotCode[r]) + if ind[0] != i]) + if not collision: + keep.append(i) + + # spotDict dictionary has mapping for codes to their index location in spotCodes and + # codeDistance has mapping for codes to their spatial variance value + spotDict = {code: i for i, code in enumerate(spotCodes)} + codeDistance = passingCodes.set_index('best_spot_codes')['best_distances'].to_dict() + while len(collisions): + # Gets all the codes that have the highest value for number of edges, and then sorts them by + # their spatial variance values in decreasing order + maxValue = max(collisionCounter.values()) + maxCodes = [code for code in collisionCounter if collisionCounter[code] == maxValue] + distances = np.asarray([codeDistance[code] for code in maxCodes]) + sortOrder = [item[1] for item in sorted(zip(distances, range(len(distances))), + reverse=True)] + maxCodes = [tuple(code) for code in np.asarray(maxCodes)[sortOrder]] + + # For every maxCode, first check that it is still a maxCode (may change during this loop), + # if it is then modify all the nodes that have edge to it to have one less edge (if this + # causes that node to have no more edges then delete it from the graph and add it to the + # codes we keep), then delete the maxCode from the graph + for maxCode in maxCodes: + if collisionCounter[maxCode] == maxValue: + for code in collisions[maxCode]: + if collisionCounter[code] == 1: + del collisionCounter[code] + del collisions[code] + keep.append(spotDict[code]) + else: + collisionCounter[code] -= 1 + collisions[code] = [c for c in collisions[code] if c != maxCode] + + del collisionCounter[maxCode] + del collisions[maxCode] + + # Only choose codes that we found to not have any edges in the graph + finalCodes = passingCodes.loc[keep].reset_index(drop=True) # Add barcode lables, spot coordinates, barcode center coordinates, and number of rounds used # for each barcode to table From be3905254b5fc42a8414c76ea1cb99eea8eabe40 Mon Sep 17 00:00:00 2001 From: nickeener Date: Tue, 11 Jan 2022 14:19:07 -0800 Subject: [PATCH 17/30] Added input error checking and condition for empty results --- .../spots/DecodeSpots/check_all_decoder.py | 37 ++++++++++++++++++- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/starfish/core/spots/DecodeSpots/check_all_decoder.py b/starfish/core/spots/DecodeSpots/check_all_decoder.py index 1c52b7bea..380371ee8 100644 --- a/starfish/core/spots/DecodeSpots/check_all_decoder.py +++ 
b/starfish/core/spots/DecodeSpots/check_all_decoder.py
@@ -1,4 +1,5 @@
 from copy import deepcopy
+import sys
 from typing import Any, Hashable, Mapping, Tuple
 
 import numpy as np
@@ -59,9 +60,22 @@ def __init__(
             codebook: Codebook,
             search_radius: float=3,
             error_rounds: int=0):
+
+        # Error catching for input
+        if len(codebook) == 0:
+            sys.exit('Codebook is empty')
+        if not isinstance(search_radius, (int, float)):
+            sys.exit('search_radius must be a positive number or zero')
+        elif search_radius < 0:
+            sys.exit('search_radius must be a positive number or zero')
+        if not isinstance(error_rounds, int):
+            sys.exit('error_rounds must be a positive integer or zero')
+        elif error_rounds < 0:
+            sys.exit('error_rounds must be a positive integer or zero')
+
         self.codebook = codebook
         self.searchRadius = search_radius
-        self.errorRounds = error_rounds
+        self.errorRounds = int(error_rounds)
 
     def run(self,
             spots: SpotFindingResults,
@@ -87,8 +101,12 @@ def run(self,
         """
 
         # Rename n_processes (trying to stay consistent between starFISH's _ variables and my
-        # camel case ones)
+        # camel case ones) and check that it is a positive integer
        numJobs = n_processes
+        if not isinstance(numJobs, int):
+            sys.exit('n_processes must be a positive integer')
+        elif numJobs < 1:
+            sys.exit('n_processes must be a positive integer')
 
         # If using a search radius exactly equal to a possible distance between two pixels
         # (ex: 1), some distances will be calculated as slightly less than their exact distance
         # (either due to rounding or precision errors) so search radius needs to be slightly
         # increased to ensure this doesn't happen
         self.searchRadius += 0.001
 
+        # Check that there are spots in the SpotFindingResults object; if there are none, exit
+        # the program and print an error message
+        if not isinstance(spots, SpotFindingResults):
+            sys.exit('spots must be a SpotFindingResults object')
+        elif spots.count_total_spots() == 0:
+            sys.exit('No spots in SpotFindingResults object')
+
         # Create dictionary where keys are round labels and the values are pandas dataframes
         # containing information on the spots found in that round
         spotTables = _merge_spots_by_round(spots)
@@ -167,6 +192,14 @@ def run(self,
         # create empty IntensityTable filled with np.nan
         data = np.full((len(allCodes), len(rounds), len(channels)), fill_value=np.nan)
         dims = (Features.AXIS, Axes.ROUND.value, Axes.CH.value)
+
+        # If there are no decoded targets, return empty DecodedIntensityTable
+        if len(allCodes) == 0:
+            int_table = IntensityTable(data=data, dims=dims)
+            intensities = int_table.transpose('features', 'r', 'c')
+            print("No targets found")
+            return DecodedIntensityTable.from_intensity_table(intensities,
+                                                              targets=(Features.AXIS, []))
+
         centers = allCodes['center']
         coords: Mapping[Hashable, Tuple[str, Any]] = {
             Features.SPOT_RADIUS: (Features.AXIS, np.full(len(allCodes), 1)),

From 0a8ced92faabf4d1352d0d900e1daaae02da0297 Mon Sep 17 00:00:00 2001
From: nickeener
Date: Mon, 24 Jan 2022 22:58:18 -0800
Subject: [PATCH 18/30] Fixed channel bug

---
 starfish/core/spots/DecodeSpots/check_all_decoder.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/starfish/core/spots/DecodeSpots/check_all_decoder.py b/starfish/core/spots/DecodeSpots/check_all_decoder.py
index 380371ee8..91482c026 100644
--- a/starfish/core/spots/DecodeSpots/check_all_decoder.py
+++ b/starfish/core/spots/DecodeSpots/check_all_decoder.py
@@ -218,8 +218,9 @@ def run(self,
         for i in range(len(allCodes)):
             code = []
             for ch in allCodes.loc[i, 'best_barcodes']:
-                # If a round is not used, row will be all zeros
-
code.append(np.asarray([0 if j != ch else 1 for j in range(len(channels))])) + # If a round is not used, row will be all zeros (subtract one because we added + # one earlier) + code.append(np.asarray([0 if j != ch - 1 else 1 for j in range(len(channels))])) table_codes.append(np.asarray(code)) int_table.values = np.asarray(table_codes) int_table = transfer_physical_coords_to_intensity_table(intensity_table=int_table, From ea8a2e96458a51c5e2040a777b7a831985a262ca Mon Sep 17 00:00:00 2001 From: nickeener Date: Wed, 30 Mar 2022 12:01:29 -0700 Subject: [PATCH 19/30] March 30 2022 big update --- .../spots/DecodeSpots/check_all_decoder.py | 358 +++++++---- .../core/spots/DecodeSpots/check_all_funcs.py | 562 +++++++++++------- 2 files changed, 606 insertions(+), 314 deletions(-) diff --git a/starfish/core/spots/DecodeSpots/check_all_decoder.py b/starfish/core/spots/DecodeSpots/check_all_decoder.py index 91482c026..46159c143 100644 --- a/starfish/core/spots/DecodeSpots/check_all_decoder.py +++ b/starfish/core/spots/DecodeSpots/check_all_decoder.py @@ -1,9 +1,11 @@ -from copy import deepcopy import sys +from collections import Counter +from copy import deepcopy from typing import Any, Hashable, Mapping, Tuple import numpy as np import pandas as pd +import ray from starfish.core.codebook.codebook import Codebook from starfish.core.intensity_table.decoded_intensity_table import DecodedIntensityTable @@ -13,8 +15,8 @@ from starfish.core.types import SpotFindingResults from starfish.types import Axes, Features from ._base import DecodeSpotsAlgorithm -from .check_all_funcs import buildBarcodes, cleanup, createRefDicts, decoder, distanceFilter, \ - removeUsedSpots +from .check_all_funcs import buildBarcodes, cleanup, createNeighborDict, createRefDicts, decoder, \ + distanceFilter, findNeighbors, removeUsedSpots from .util import _merge_spots_by_round class CheckAll(DecodeSpotsAlgorithm): @@ -59,23 +61,26 @@ def __init__( self, codebook: Codebook, search_radius: float=3, - error_rounds: int=0): - - # Error catching for input - if len(codebook) == 0: - sys.exit('Codebook is empty') - if not isinstance(search_radius, int) or isinstance(search_radius, float): - sys.exit('search_radius must be a positive number or zero') - elif search_radius < 0: - sys.exit('search_radius must be a positive number or zero') - if not isistance(error_rounds, int): - sys.exit('error_rounds must be a positive integer or zero') - elif error_rounds < 0: - sys.exit('error_rounds must be a positive integer or zero') - + error_rounds: int=0, + mode='med', + physical_coords=False): self.codebook = codebook self.searchRadius = search_radius - self.errorRounds = int(error_rounds) + self.errorRounds = error_rounds + self.mode = mode + self.physicalCoords = physical_coords + + # Error checking for some inputs + + # Check that codebook is the right class and not empty + if not isinstance(self.codebook, Codebook) or len(codebook) == 0: + sys.exit('codebook is either not a Codebook object or is empty') + # Check that error_rounds is either 0 or 1 + if self.errorRounds not in [0, 1]: + exit('error_rounds can only take a value of 0 or 1') + # Return error if search radius is greater than 4.5 or negative + if self.searchRadius < 0 or self.searchRadius > 4.5: + sys.exit('search_radius must be positive w/ max value of 4.5') def run(self, spots: SpotFindingResults, @@ -101,111 +106,247 @@ def run(self, """ # Rename n_processes (trying to stay consistent between starFISH's _ variables and my - # camel case ones) and check that it is a positive 
integer
         numJobs = n_processes
-        if not isinstance(numJobs, int):
-            sys.exit('n_processes must be a positive integer')
-        elif numJobs < 1:
-            sys.exit('n_processes must be a positive integer')
-
-        # If using a search radius exactly equal to a possible distance between two pixels
-        # (ex: 1), some distances will be calculated as slightly less than their exact distance
-        # (either due to rounding or precision errors) so search radius needs to be slightly
-        # increased to ensure this doesn't happen
-        self.searchRadius += 0.001
-
-        # Check that there are spots in the SpotFindingResults object; if there are none, exit
-        # the program and print an error message
-        if not isinstance(spots, SpotFindingResults):
-            sys.exit('spots must be a SpotFindingResults object')
-        elif spots.count_total_spots() == 0:
-            sys.exit('No spots in SpotFindingResults object')
+        # Check that numJobs is a positive integer
+        if not isinstance(numJobs, int) or numJobs < 1:
+            sys.exit('n_processes must be a positive integer')
+
+        # Initialize ray for multiprocessing
+        ray.init(num_cpus=numJobs, ignore_reinit_error=True)
 
         # Create dictionary where keys are round labels and the values are pandas dataframes
         # containing information on the spots found in that round
         spotTables = _merge_spots_by_round(spots)
 
-        # Add one to channels labels (prevents collisions between hashes of barcodes later)
+        # Check that enough rounds have spots to make at least one barcode
+        spotsPerRound = [len(spotTables[r]) for r in range(len(spotTables))]
+        counter = Counter(spotsPerRound)
+        if counter[0] > self.errorRounds:
+            sys.exit('Not enough spots to form a barcode')
+
+        if self.physicalCoords:
+            physicalCoords = spots.physical_coord_ranges
+            if len(physicalCoords['z'].data) > 1:
+                zScale = physicalCoords['z'][1].data - physicalCoords['z'][0].data
+            else:
+                zScale = 1
+            yScale = physicalCoords['y'][1].data - physicalCoords['y'][0].data
+            xScale = physicalCoords['x'][1].data - physicalCoords['x'][0].data
+            if xScale <= 0 or yScale <= 0 or zScale <= 0:
+                sys.exit('invalid physical coords')
+
+        # Add one to channel labels (prevents collisions between hashes of barcodes later), adds
+        # unique spot_id column for each spot in each round, and scales the x, y, and z columns to
+        # the physical coordinates if specified
         for r in spots.round_labels:
             spotTables[r]['c'] += 1
+            spotTables[r]['spot_id'] = range(1, len(spotTables[r]) + 1)
+            if self.physicalCoords:
+                spotTables[r]['z'] = spotTables[r]['z'] * zScale
+                spotTables[r]['y'] = spotTables[r]['y'] * yScale
+                spotTables[r]['x'] = spotTables[r]['x'] * xScale
+
+        # Choose search radius set based on search_radius parameter and ability for spots to be
+        # neighbors across z slices. Each value in allSearchRadii represents an incremental
+        # increase in neighborhood size
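+        # (each radius in the sets below sits just above one of the possible Euclidean
+        # distances between integer pixel coordinates, i.e. 1, sqrt(2), sqrt(3), 2, sqrt(5),
+        # and so on, so each step admits exactly one additional shell of neighboring pixels)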
+        set1 = False
+        zs = set()
+        [zs.update(spotTables[r]['z']) for r in range(len(spotTables))]
+        if self.physicalCoords:
+            if zScale < self.searchRadius or len(zs) > 1:
+                set1 = True
+        else:
+            if len(zs) > 1:
+                set1 = True
+        if set1:
+            allSearchRadii = np.array([0, 1.05, 1.5, 1.8, 2.05, 2.3, 2.45, 2.85, 3.05, 3.2,
+                                       3.35, 3.5, 3.65, 3.75, 4.05, 4.15, 4.25, 4.4, 4.5])
+        else:
+            allSearchRadii = np.array([0, 1.05, 1.5, 2.05, 2.3, 2.85, 3.05, 3.2, 3.65, 4.05, 4.15,
+                                       4.25, 4.5])
+
+        maxRadii = allSearchRadii[(allSearchRadii - self.searchRadius) <= 0][-1]
+        radiusSet = allSearchRadii[allSearchRadii <= maxRadii]
+
+        # Calculate neighbors for each radius in the set
+        neighborsByRadius = {}
+        for searchRadius in radiusSet:
+            if self.physicalCoords:
+                searchRadius = round(searchRadius * xScale, 5)
+            neighborsByRadius[searchRadius] = findNeighbors(spotTables, searchRadius, numJobs)
+
+        # Create reference dictionaries for spot channels, coordinates, raw intensities, and
+        # normalized intensities. Each is a dict w/ keys equal to the round labels and each
+        # value is a dict with spot IDs in that round as keys and their corresponding value
+        # (channel label, spatial coords, etc)
+        channelDict, spotCoords, spotIntensities, spotQualDict = createRefDicts(spotTables, numJobs)
+
+        # Add spot quality (normalized spot intensity) to spotTables
+        for r in range(len(spotTables)):
+            spotTables[r]['spot_quals'] = [spotQualDict[r][spot] for spot in
+                                           spotTables[r]['spot_id']]
 
         # Set list of round omission numbers to loop through
         roundOmits = range(self.errorRounds + 1)
 
+        # Set parameters according to presets
+        if self.mode == 'high':
+            strictnesses = [50, -1]
+            seedNumbers = [len(spotTables) - 1, len(spotTables)]
+            minDist = 3
+            if self.errorRounds == 1:
+                strictnesses.append(1)
+                seedNumbers.append(len(spotTables) - 1)
+        elif self.mode == 'med':
+            strictnesses = [50, -5]
+            seedNumbers = [len(spotTables) - 1, len(spotTables)]
+            minDist = 3
+            if self.errorRounds == 1:
+                strictnesses.append(5)
+                seedNumbers.append(len(spotTables) - 1)
+        elif self.mode == 'low':
+            strictnesses = [50, -100]
+            seedNumbers = [len(spotTables) - 1, len(spotTables) - 1]
+            minDist = 100
+            if self.errorRounds == 1:
+                strictnesses.append(10)
+                seedNumbers.append(len(spotTables) - 1)
+        else:
+            sys.exit('Invalid mode choice ("high", "med", or "low")')
+
         # Decode for each round omission number, store results in allCodes table
         allCodes = pd.DataFrame()
-        for currentRoundOmitNum in roundOmits:
-
-            # Create necessary reference dictionaries
-            neighborDict, channelDict, spotCoords = createRefDicts(spotTables, self.searchRadius)
-
-            # Chooses best barcode for all spots in each round sequentially (possible barcode
-            # space can become quite large which can increase memory needs so I do it this way so
-            # we only need to store all potential barcodes that originate from one round at a
-            # time)
-            decodedTables = {}
-            for r in range(len(spotTables)):
-                roundData = deepcopy(spotTables[r])
-                roundData = roundData.drop(['intensity', 'z', 'y', 'x', 'radius', 'c'], axis=1)
-                roundData.index += 1
-
-                # Create dictionary of dataframes (based on spotTables data) that contains
-                # additional columns for each spot containing all the possible barcodes that
-                # could be constructed from the neighbors of that spot
-                roundData = buildBarcodes(roundData, neighborDict, currentRoundOmitNum,
-                                          channelDict, r, numJobs)
-
-                # Match possible barcodes to codebook and add new columns with info
about barcodes - # that had a codebook match - roundData = decoder(roundData, self.codebook, currentRoundOmitNum, r, numJobs) - - # Choose most likely barcode for each spot in each round by find the possible - # decodable barcode with the least spatial variance between the spots that made up - # the barcode - roundData = distanceFilter(roundData, spotCoords, r, numJobs) - - # Assign to DecodedTables dictionary - decodedTables[r] = roundData - - # Only do the following if barcodes were founds - totalSpots = sum([len(decodedTables[table]) for table in decodedTables]) - if totalSpots: - - # Turn spot table dictionary into single table, filter barcodes by round frequency, - # add additional information, and choose between barcodes that have overlapping - # spots - finalCodes = cleanup(decodedTables, spotCoords, channelDict) - - # If this is not the last round omission number to run, remove spots that have just - # been found to be in passing barcodes from spotTables so they are not used for the - # next round omission number - if currentRoundOmitNum != roundOmits[-1]: - spotTables = removeUsedSpots(finalCodes, spotTables) - - # Append found codes to allCodes table - allCodes = allCodes.append(finalCodes).reset_index(drop=True) + for s, strictness in enumerate(strictnesses): + seedNumber = seedNumbers[s] + for currentRoundOmitNum in roundOmits: + for intVal in range(50, -1, -50): + + spotsPerRound = [len(spotTables[r]) for r in range(len(spotTables))] + counter = Counter(spotsPerRound) + condition3 = True if counter[0] > currentRoundOmitNum else False + + if not condition3: + # Subset spots by intensity, start with top 50% then decode again with all + currentTables = {} + for r in range(len(spotTables)): + lowerBound = np.percentile(spotTables[r]['spot_quals'], intVal) + currentTables[r] = spotTables[r][spotTables[r]['spot_quals'] + >= lowerBound] + + # Decode each radius and remove spots found in each decoding before the next + for sr, searchRadius in enumerate(radiusSet): + if self.physicalCoords: + searchRadius = round(searchRadius * xScale, 5) + + # Only run partial codes for the final strictness and don't run full + # barcodes for the final strictness. Also don't run if there are not + # enough spots left. + condition1 = (currentRoundOmitNum == 1 and s != len(strictnesses) - 1) + condition2 = (len(roundOmits) > 1 and currentRoundOmitNum == 0 + and s == len(strictnesses) - 1) + + if condition1 or condition2 or condition3: + pass + else: + + # Creates neighbor dictionary for the current radius and current set of + # spots + neighborDict = createNeighborDict(currentTables, searchRadius, + neighborsByRadius) + + # Find best spot combination using each spot in each round as seed + decodedTables = {} + for r in range(len(spotTables)): + + # roundData will carry the possible barcode info for each spot in + # the current round being examined + roundData = deepcopy(currentTables[r]) + roundData = roundData.drop(['intensity', 'z', 'y', 'x', 'radius', + 'c', 'spot_quals'], axis=1) + + # From each spot's neighbors, create all possible combinations that + # would form a barocde with the correct number of rounds. 
Adds + # spot_codes column to roundData + roundData = buildBarcodes(roundData, neighborDict, + currentRoundOmitNum, channelDict, + strictness, r, numJobs) + + # When strictness is positive, distanceFilter is run first on all + # the potential barcodes to choose the one with the minimum score + # (based on spatial variance of the spots and their intensities) + # which are then matched to the codebook. Spots that have more + # possible barcodes to choose between than the current strictness + # number are dropped as ambiguous. If strictness is negative, all + # the possible barcodes are instead first matched to the codebook + # and then the lowest scoring decodable spot combination is chosen + # for each spot. Spots that have more decodable barcodes to choose + # from than the strictness value (absolute value) are dropped. + # Positive strictness method has lower false positive rate but + # finds fewer targets while the negative strictness method has + # higher false positive rates but finds more targets + if strictness > 0: + + # Choose most likely combination of spots for each seed spot + # using their spatial variance and normalized intensity values. + # Adds distance column to roundData + roundData = distanceFilter(roundData, spotCoords, spotQualDict, + r, currentRoundOmitNum, numJobs) + + # Match possible barcodes to codebook. Adds target column to + # roundData + roundData = decoder(roundData, self.codebook, channelDict, + strictness, currentRoundOmitNum, r, numJobs) + + else: + + # Match possible barcodes to codebook. Adds target column to + # roundData + roundData = decoder(roundData, self.codebook, channelDict, + strictness, currentRoundOmitNum, r, numJobs) + + # Choose most likely combination of spots for each seed spot + # using their spatial variance and normalized intensity values. 
+ # Adds distance column to roundData + roundData = distanceFilter(roundData, spotCoords, spotQualDict, + r, currentRoundOmitNum, numJobs) + + # Assign to DecodedTables dictionary + decodedTables[r] = roundData + + # Turn spot table dictionary into single table, filter barcodes by + # round frequency, add additional information, and choose between + # barcodes that have overlapping spots + finalCodes = cleanup(decodedTables, spotCoords, channelDict, + strictness, currentRoundOmitNum, seedNumber) + + # Remove spots that have just been found to be in passing barcodes from + # neighborDict so they are not used for the next decoding round and + # filter codes whose distance value is above the minimum + if len(finalCodes) > 0: + finalCodes = finalCodes[finalCodes['distance'] <= minDist] + spotTables = removeUsedSpots(finalCodes, spotTables) + currentTables = removeUsedSpots(finalCodes, currentTables) + + # Append found codes to allCodes table + allCodes = allCodes.append(finalCodes).reset_index(drop=True) + + # Shutdown ray + ray.shutdown() # Create and fill in intensity table channels = spots.ch_labels rounds = spots.round_labels # create empty IntensityTable filled with np.nan - data = np.full((len(allCodes), len(rounds), len(channels)), fill_value=np.nan) - dims = (Features.AXIS, Axes.ROUND.value, Axes.CH.value) - - # If there are no decoded targets, return empty DecodedIntensityTable - if len(allCodes) == 0: - int_table = IntensityTable(data=data, dims=dims) - intensities = int_table.transpose('features', 'r', 'c') - print("No targets found") - return DecodedIntensityTable.from_intensity_table(intensities, targets=(Features.AXIS, [])) - + data = np.full((len(allCodes), len(channels), len(rounds)), fill_value=np.nan) + dims = (Features.AXIS, Axes.CH.value, Axes.ROUND.value) centers = allCodes['center'] coords: Mapping[Hashable, Tuple[str, Any]] = { Features.SPOT_RADIUS: (Features.AXIS, np.full(len(allCodes), 1)), - Axes.ZPLANE.value: (Features.AXIS, np.asarray([round(c[2]) for c in centers])), + Axes.ZPLANE.value: (Features.AXIS, np.asarray([round(c[0]) for c in centers])), Axes.Y.value: (Features.AXIS, np.asarray([round(c[1]) for c in centers])), - Axes.X.value: (Features.AXIS, np.asarray([round(c[0]) for c in centers])), + Axes.X.value: (Features.AXIS, np.asarray([round(c[2]) for c in centers])), Features.SPOT_ID: (Features.AXIS, np.arange(len(allCodes))), Features.AXIS: (Features.AXIS, np.arange(len(allCodes))), Axes.ROUND.value: (Axes.ROUND.value, rounds), @@ -217,23 +358,24 @@ def run(self, table_codes = [] for i in range(len(allCodes)): code = [] - for ch in allCodes.loc[i, 'best_barcodes']: - # If a round is not used, row will be all zeros (subtract one because we added - # one earlier) - code.append(np.asarray([0 if j != ch - 1 else 1 for j in range(len(channels))])) - table_codes.append(np.asarray(code)) + # ints = allCodes.loc[i, 'intensities'] + for j, ch in enumerate(allCodes.loc[i, 'best_barcodes']): + # If a round is not used, row will be all zeros + code.append(np.asarray([0 if k != ch - 1 else 1 for k in range(len(channels))])) + table_codes.append(np.asarray(code).T) int_table.values = np.asarray(table_codes) int_table = transfer_physical_coords_to_intensity_table(intensity_table=int_table, spots=spots) + intensities = int_table.transpose('features', 'r', 'c') # Validate results are correct shape - self.codebook._validate_decode_intensity_input_matches_codebook_shape(int_table) + self.codebook._validate_decode_intensity_input_matches_codebook_shape(intensities) # Create 
DecodedIntensityTable result = DecodedIntensityTable.from_intensity_table( - int_table, - targets=(Features.AXIS, allCodes['best_targets'].astype('U')), - distances=(Features.AXIS, allCodes["best_distances"]), + intensities, + targets=(Features.AXIS, allCodes['targets'].astype('U')), + distances=(Features.AXIS, allCodes["distance"]), passes_threshold=(Features.AXIS, np.full(len(allCodes), True)), rounds_used=(Features.AXIS, allCodes['rounds_used'])) diff --git a/starfish/core/spots/DecodeSpots/check_all_funcs.py b/starfish/core/spots/DecodeSpots/check_all_funcs.py index 1fd567b89..bed2399b6 100644 --- a/starfish/core/spots/DecodeSpots/check_all_funcs.py +++ b/starfish/core/spots/DecodeSpots/check_all_funcs.py @@ -2,24 +2,26 @@ import warnings from collections import Counter, defaultdict from copy import deepcopy -from functools import partial from itertools import chain, islice, permutations, product -from multiprocessing import Pool import numpy as np import pandas as pd +import ray from scipy.spatial import cKDTree from starfish.core.codebook.codebook import Codebook +from starfish.core.types import SpotFindingResults from starfish.types import Axes warnings.filterwarnings('ignore') -def createRefDicts(spotTables: dict, searchRadius: float) -> tuple: +def findNeighbors(spotTables: dict, + searchRadius: float, + numJobs: int) -> dict: + ''' - Creates reference dictionary that have mappings between the each spot's round and ID and their - neighbors, channel label, and spatial coordinates. Spot IDs correspond to their 1-based index - location in the spotTables dataframes. + Using scipy's cKDTree method, finds all neighbors within the seach radius between the spots in + each pair of rounds and stores the indices in a dictionary for later access. Parameters ---------- @@ -32,34 +34,65 @@ def createRefDicts(spotTables: dict, searchRadius: float) -> tuple: Returns ------- - tuple : First object is the neighbors dictionary, second is the channel dictionary, and the - third object is the spatial coordinate dictionary + dict: a dictionary with the following structure: + {round: { + spotID in round: { + neighborRound: + [list of spotIDs in neighborRound within searchRadius of spotID in round] + } + } + } ''' - # Create dictionary of neighbors (within the search radius) in other rounds for each spot - neighborDict = findNeighbors(spotTables, searchRadius) + allNeighborDict = {} + for r1 in range((len(spotTables))): + tree = cKDTree(spotTables[r1][['z', 'y', 'x']]) + for r2 in list(range((len(spotTables))))[r1 + 1:]: + allNeighborDict[(r1, r2)] = tree.query_ball_point(spotTables[r2][['z', 'y', 'x']], + searchRadius, workers=numJobs) - # Create dictionaries with mapping from spot id (row index) in spotTables to channel - # number and one with spot coordinates for fast access - channelDict = {} - spotCoords = {} - for r in [*spotTables]: - spotTables[r].index += 1 - channelDict[r] = spotTables[r]['c'].to_dict() - channelDict[r][0] = 0 - spotCoords[r] = spotTables[r][['z', 'y', 'x']].T.to_dict() - for key in [*spotCoords[r]]: - spotCoords[r][key] = tuple([item[1] for item in sorted(spotCoords[r][key].items(), - key=lambda x: x[0])]) - spotTables[r].index -= 1 + return allNeighborDict - return neighborDict, channelDict, spotCoords +def createNeighborDict(spotTables: dict, + searchRadius: float, + neighborsByRadius: dict) -> dict: -def findNeighbors(spotTables: dict, searchRadius: float) -> dict: ''' - Function that takes spatial information from the spot tables from each round and creates a - dictionary 
that contains all the neighbors for each spot in other rounds that are within the
-    search radius.
+    Create dictionary of neighbors (within the search radius) in other rounds for each spot.
+    Dictionary has format:
+        neighborDict[roundNum][spotID] = {0 : neighbors in round 0, 1: neighbors in round 1, etc}
+    '''
+
+    # Create empty neighbor dictionary
+    neighborDict = {}
+    spotIDs = {}
+    for r in spotTables:
+        spotIDs[r] = {idd: 0 for idd in spotTables[r]['spot_id']}
+        neighborDict[r] = {i: defaultdict(list, {r: [i]}) for i in spotTables[r]['spot_id']}
+
+    # Add neighbors in neighborsByRadius[searchRadius] but check to make sure that spot is still
+    # available before adding it
+    for r1 in range(len(spotTables)):
+        for r2 in list(range((len(spotTables))))[r1 + 1:]:
+            for j, neighbors in enumerate(neighborsByRadius[searchRadius][(r1, r2)]):
+                try:
+                    spotIDs[r2][j + 1]
+                    for neighbor in neighbors:
+                        try:
+                            spotIDs[r1][neighbor + 1]
+                            neighborDict[r1][neighbor + 1][r2].append(j + 1)
+                            neighborDict[r2][j + 1][r1].append(neighbor + 1)
+                        except Exception:
+                            pass
+                except Exception:
+                    pass
+    return neighborDict
+
+def createRefDicts(spotTables: dict, numJobs: int) -> tuple:
+
+    '''
+    Create dictionaries with mapping from spot id (row index + 1) in spotTables to channel label,
+    spatial coordinates, raw intensity, and normalized intensity.
 
     Parameters
     ----------
@@ -72,37 +105,31 @@ def findNeighbors(spotTables: dict, searchRadius: float) -> dict:
 
     Returns
     -------
-    dict: a dictionary with the following structure:
-        {round: {
-            spotID in round: {
-                neighborRound:
-                    [list of spotIDs in neighborRound within searchRadius of spotID in round]
-                    }
-                }
-        }
+    tuple : First object is the channel label dictionary, second is the spatial coordinate
+        dictionary, third is the raw intensity dictionary, and the fourth is the normalized
+        intensity dictionary
     '''
 
-    # Create empty neighbor dictionary
-    neighborDict = {}
-    for r in spotTables:
-        neighborDict[r] = {i: defaultdict(list, {r: [i]}) for i in
-                           range(1, len(spotTables[r]) + 1)}
-
-    # For each pairing of rounds, find all mutual neighbors within the search radius for each spot
-    # and assigns them in the neighborDict dictionary
-    # Number assigned each spot in neighborDict is the index of it's original location in
-    # spotTables and is used to track each spot uniquely throughout
-    for i, r1 in enumerate(range((len(spotTables)))):
-        tree = cKDTree(spotTables[r1][['z', 'y', 'x']])
-        for r2 in list(range((len(spotTables))))[i + 1:]:
-            allNeighbors = tree.query_ball_point(spotTables[r2][['z', 'y', 'x']], searchRadius)
-            for j, neighbors in enumerate(allNeighbors):
-                if neighbors != []:
-                    for neighbor in neighbors:
-                        neighborDict[r1][neighbor + 1][r2].append(j + 1)
-                        neighborDict[r2][j + 1][r1].append(neighbor + 1)
+    # Create channel label and spatial coordinate dictionaries
+    channelDict = {}
+    spotCoords = {}
+    for r in [*spotTables]:
+        channelDict[r] = spotTables[r][['c', 'spot_id']].set_index('spot_id').to_dict()['c']
+        channelDict[r][0] = 0
+        tmpTable = spotTables[r][['z', 'y', 'x', 'spot_id']].set_index('spot_id')
+        spotCoords[r] = tmpTable.to_dict(orient='index')
+        for key in [*spotCoords[r]]:
+            spotCoords[r][key] = tuple(spotCoords[r][key].values())
 
-    return neighborDict
+    # Create raw intensity dictionary
+    spotIntensities = {r: spotTables[r][['intensity', 'spot_id']].set_index('spot_id').to_dict()
+                       ['intensity'] for r in [*spotTables]}
+    for r in [*spotTables]:
+        spotIntensities[r][0] = 0
+
+    # Create normalized intensity dictionary
+    spotQualDict = spotQuality(spotTables, spotCoords, spotIntensities, channelDict, numJobs)
+
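+    # All four returned dictionaries share the same layout, keyed first by round label and then
+    # by 1-based spot ID, so lookups take the form spotCoords[round][spotID]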
channelDict, numJobs) + + return channelDict, spotCoords, spotIntensities, spotQualDict def encodeSpots(spotCodes: list) -> list: ''' @@ -155,10 +182,97 @@ def decodeSpots(compressed: list, roundNum: int) -> list: for j in range(len(idxs))] return decompressed +@ray.remote +def spotQualityFunc(spots: SpotFindingResults, + spotCoords: dict, + spotIntensities: dict, + spotTables: dict, + channelDict: dict, + r: int) -> list: + + ''' + Helper function for spotQuality to run in parallel w/ ray + ''' + + # Find spots in the same neighborhood (same channel and z slice and less than 100 pixels away + # in either x or y direction) + neighborhood = 100 + quals = [] + for i, spot in enumerate(spots): + z, y, x = spotCoords[r][spot] + ch = channelDict[r][spot] + yMin = y - neighborhood if y - neighborhood >= 0 else 0 + yMax = y + neighborhood if y + neighborhood <= 2048 else 2048 + xMin = x - neighborhood if x - neighborhood >= 0 else 0 + xMax = x + neighborhood if x + neighborhood <= 2048 else 2048 + neighborInts = spotTables[r][(spotTables[r]['c'] == ch) + & (spotTables[r]['z'] == z) + & (spotTables[r]['y'] >= yMin) + & (spotTables[r]['y'] < yMax) + & (spotTables[r]['x'] >= xMin) + & (spotTables[r]['x'] < xMax)]['intensity'] + # If no neighbors drop requirement that they be within 100 pixels of each other + if len(neighborInts) == 1: + neighborInts = spotTables[r][(spotTables[r]['c'] == ch) + & (spotTables[r]['z'] == z)]['intensity'] + # If still no neighbors drop requirement that they be on the same z slice + if len(neighborInts) == 1: + neighborInts = spotTables[r][(spotTables[r]['c'] == ch)]['intensity'] + # Calculate the l2 norm of the neighbor's intensities and divide the spot's intensity by + # this value to get it's normalized intensity value + norm = np.linalg.norm(neighborInts) + quals.append(spotIntensities[r][spot] / norm) + + return quals + +def spotQuality(spotTables: dict, + spotCoords: dict, + spotIntensities: dict, + channelDict: dict, + numJobs: int) -> dict: + + ''' + Creates dictionary mapping each spot ID to their normalized intensity value. Calculated as the + spot intensity value divided by the l2 norm of the intensities of all the spots in the same + neighborhood. 
+ ''' + + # Place data dictionary into shared memory + spotCoordsID = ray.put(spotCoords) + spotIntsID = ray.put(spotIntensities) + spotTablesID = ray.put(spotTables) + channelDictID = ray.put(channelDict) + + # Calculate normalize spot intensities for each spot in each round + spotQuals = {} + for r in range(len(spotTables)): + roundSpots = spotTables[r]['spot_id'] + spotQuals[r] = {} + + # Calculates index ranges to chunk data by + ranges = [0] + for i in range(1, numJobs): + ranges.append(int((len(roundSpots) / numJobs) * i)) + ranges.append(len(roundSpots)) + chunkedSpots = [roundSpots[ranges[i]:ranges[i + 1]] for i in range(len(ranges[:-1]))] + + # Run in parallel + results = [spotQualityFunc.remote(subSpots, spotCoordsID, spotIntsID, spotTablesID, + channelDictID, r) + for subSpots in chunkedSpots] + rayResults = ray.get(results) + + # Extract results + for spot, qual in zip(roundSpots, list(chain(*rayResults))): + spotQuals[r][spot] = qual + + return spotQuals + +@ray.remote def barcodeBuildFunc(allNeighbors: list, channelDict: dict, - roundOmitNum: int, currentRound: int, + roundOmitNum: int, roundNum: int) -> tuple: ''' Subfunction to buildBarcodes that allows it to run in parallel chunks @@ -188,48 +302,39 @@ def barcodeBuildFunc(allNeighbors: list, a list of the possible barcodes ''' - # Build barcodes from neighbors # spotCodes are the ordered spot IDs of the spots making up each barcode while barcodes are # the corresponding channel labels, need spotCodes so each barcode can have a unique # identifier allSpotCodes = [] - allBarcodes = [] - for i in range(len(allNeighbors)): - neighbors = deepcopy(allNeighbors[i]) - neighborLists = [] - for rnd in range(roundNum): - # Adds a 0 to each round of the neighbors dictionary (allows barcodes with dropped - # rounds to be created) - if roundOmitNum > 0: - neighbors[rnd].append(0) - neighborLists.append(neighbors[rnd]) + for neighbors in allNeighbors: + neighborLists = [neighbors[rnd] for rnd in range(roundNum)] + # Adds a 0 to each round of the neighbors dictionary (allows barcodes with dropped + # rounds to be created) + if roundOmitNum > 0: + [neighbors[rnd].append(0) for rnd in range(roundNum)] # Creates all possible spot code combinations from neighbors codes = list(product(*neighborLists)) # Only save the ones with the correct number of dropped rounds - counters = [Counter(code) for code in codes] # type: typing.List[Counter] + counters = [Counter(code) for code in codes] spotCodes = [code for j, code in enumerate(codes) if counters[j][0] == roundOmitNum] spotCodes = [code for code in spotCodes if code[currentRound] != 0] - # Create barcodes from spot codes using the mapping from spot ID to channel - barcodes = [] - for spotCode in spotCodes: - barcode = [channelDict[spotInd][spotCode[spotInd]] for spotInd in range(len(spotCode))] - barcodes.append(hash(tuple(barcode))) - allBarcodes.append(barcodes) allSpotCodes.append(encodeSpots(spotCodes)) - return (allSpotCodes, allBarcodes) + return allSpotCodes def buildBarcodes(roundData: pd.DataFrame, neighborDict: dict, roundOmitNum: int, channelDict: dict, + strictness: int, currentRound: int, numJobs: int) -> pd.DataFrame: + ''' - Function that adds to the current rounds spot table all the possible barcodes that could be - formed using the neighbors of each spot, spots without enough neighbors to form a barcode - # are dropped. + Builds possible barcodes for each seed spot from its neighbors. 
First checks that each spot has + enough neighbors in each round to form a barcode and, depending on the strictness value, drops + spots who have too many possible barcodes to choose from Parameters ---------- @@ -261,46 +366,51 @@ def buildBarcodes(roundData: pd.DataFrame, ''' # Only keep spots that have enough neighbors to form a barcode (determined by the total number - # of rounds and the number of rounds that can be omitted from each code) - - passingSpots = {} + # of rounds and the number of rounds that can be omitted from each code) and if strictness is + # positive, drop spots that have more than the strictness value number of possible barcodes roundNum = len(neighborDict) - for key in neighborDict[currentRound]: - if len(neighborDict[currentRound][key]) >= roundNum - roundOmitNum: - passingSpots[key] = neighborDict[currentRound][key] - passed = list(passingSpots.keys()) - roundData = roundData.iloc[np.asarray(passed) - 1] - roundData['neighbors'] = [passingSpots[i] for i in roundData.index] - roundData = roundData.reset_index(drop=True) + if strictness > 0: + passed = [key for key in neighborDict[currentRound] if + len(neighborDict[currentRound][key]) >= roundNum - roundOmitNum + and np.prod([len(values) for values in + neighborDict[currentRound][key].values()]) <= strictness] + else: + passed = [key for key in neighborDict[currentRound] if + len(neighborDict[currentRound][key]) >= roundNum - roundOmitNum] + roundData = roundData[roundData['spot_id'].isin(passed)].reset_index(drop=True) + roundData['neighbors'] = [neighborDict[currentRound][p] for p in passed] # Find all possible barcodes for the spots in each round by splitting each round's spots into # numJob chunks and constructing each chunks barcodes in parallel + # Save the current round's data table and the channelDict to ray memory + channelDictID = ray.put(channelDict) + # Calculates index ranges to chunk data by ranges = [0] for i in range(1, numJobs + 1): ranges.append(int((len(roundData) / numJobs) * i)) - chunkedNeighbors = [] - for i in range(len(ranges[:-1])): - chunkedNeighbors.append(list(roundData['neighbors'][ranges[i]:ranges[i + 1]])) + chunkedNeighbors = [list(roundData['neighbors'])[ranges[i]: ranges[i + 1]] for i in + range(len(ranges[:-1]))] # Run in parallel - with Pool(processes=numJobs) as pool: - part = partial(barcodeBuildFunc, channelDict=channelDict, roundOmitNum=roundOmitNum, - roundNum=roundNum, currentRound=currentRound) - results = pool.map(part, [chunkedNeighbors[i] for i in range(len(ranges[:-1]))]) + results = [barcodeBuildFunc.remote(chunkedNeighbors[i], channelDictID, currentRound, + roundOmitNum, roundNum) + for i in range(len(chunkedNeighbors))] + rayResults = ray.get(results) - # Drop neighbors column (saves memory) - roundData = roundData.drop(['neighbors'], axis=1) + # Drop unneeded columns (saves memory) + roundData = roundData.drop(['neighbors', 'spot_id'], axis=1) - # Add possible barcodes and spot codes (same order) to spot table (must chain results from - # different jobs together) - roundData['spot_codes'] = list(chain(*[job[0] for job in results])) - roundData['barcodes'] = list(chain(*[job[1] for job in results])) + # Add possible spot codes to spot table (must chain results from different jobs together) + roundData['spot_codes'] = list(chain(*[job for job in rayResults])) return roundData -def decodeFunc(codes: pd.DataFrame, permutationCodes: dict) -> tuple: +@ray.remote +def decodeFunc(data: pd.DataFrame, + permutationCodes: dict, + strictness: int) -> tuple: ''' Subfunction 
for decoder that allows it to run in parallel chunks using ray @@ -319,11 +429,11 @@ def decodeFunc(codes: pd.DataFrame, permutationCodes: dict) -> tuple: fourth element is a list of rounds that were omitted for each decoded barcode ''' - # Goes through all possible decodings of each spot (ensures each spot is only looked up once) + # Checks if each barcode is in the permutationsCodes dict, if it isn't, there is no match allTargets = [] allDecodedSpotCodes = [] - allBarcodes = list(codes['barcodes']) - allSpotCodes = list(codes['spot_codes']) + allBarcodes = list(data['barcodes']) + allSpotCodes = list(data['spot_codes']) for i in range(len(allBarcodes)): targets = [] decodedSpotCodes = [] @@ -338,12 +448,13 @@ def decodeFunc(codes: pd.DataFrame, permutationCodes: dict) -> tuple: pass allTargets.append(targets) allDecodedSpotCodes.append(decodedSpotCodes) - return (allTargets, allDecodedSpotCodes) def decoder(roundData: pd.DataFrame, codebook: Codebook, - roundOmitNum: int, + channelDict: dict, + strictness: str, + currentRoundOmitNum: int, currentRound: int, numJobs: int) -> pd.DataFrame: ''' @@ -399,26 +510,36 @@ def generateRoundPermutations(size: int, roundOmitNum: int) -> list: return sorted(set(list(permutations([*([False] * roundOmitNum), *([True] * (size - roundOmitNum))])))) + # Add barcodes column by mapping spotIDs in spot_codes to channel labels using channelDict + if strictness > 0: + roundData['barcodes'] = [[hash(tuple([channelDict[j][spot] for j, spot in + enumerate(code)]))] for code in roundData['spot_codes']] + roundData['spot_codes'] = [[codes] for codes in roundData['spot_codes']] + else: + barcodes = [] + for codes in roundData['spot_codes']: + barcodes.append([hash(tuple([channelDict[j][spot] for j, spot in enumerate(code)])) + for code in decodeSpots(codes, len(channelDict))]) + roundData['barcodes'] = barcodes + # Create list of logical arrays corresponding to the round sets being used to decode - roundPermutations = generateRoundPermutations(codebook.sizes[Axes.ROUND], roundOmitNum) + roundPermutations = generateRoundPermutations(codebook.sizes[Axes.ROUND], currentRoundOmitNum) # Create dictionary where the keys are the different round sets that can be used for decoding # and the values are the modified codebooks corresponding to the rounds used permCodeDict = {} - targets = codebook['target'].data for currentRounds in roundPermutations: - codes = codebook.data.argmax(axis=2) - if roundOmitNum > 0: + codes = codebook.argmax(Axes.CH.value) + if currentRoundOmitNum > 0: omittedRounds = np.argwhere(~np.asarray(currentRounds)) - # Makes entire column that is being omitted -1, which become 0 after 1 is added - # so they match up with the barcodes made earlier - codes[:, omittedRounds] = -1 - # Makes codes 1-based which prevents collisions when hashing - codes += 1 - # Barcodes are hashed as before - roundDict = dict(zip([hash(tuple(code)) for code in codes], targets)) + codes.data[:, omittedRounds] = -1 + codes.data += 1 + roundDict = dict(zip([hash(tuple(code)) for code in codes.data], codes['target'].data)) permCodeDict.update(roundDict) + # Put data table and permutations codes dictionary in ray storage + permutationCodesID = ray.put(permCodeDict) + # Calculates index ranges to chunk data by and creates list of chunked data to loop through ranges = [0] for i in range(1, numJobs + 1): @@ -428,28 +549,31 @@ def generateRoundPermutations(size: int, roundOmitNum: int) -> list: chunkedData.append(deepcopy(roundData[ranges[i]:ranges[i + 1]])) # Run in parallel - with 
Pool(processes=numJobs) as pool: - part = partial(decodeFunc, permutationCodes=permCodeDict) - results = pool.map(part, [chunkedData[i][['barcodes', 'spot_codes']] - for i in range(len(chunkedData))]) + results = [decodeFunc.remote(chunkedData[i], permutationCodesID, strictness) + for i in range(len(ranges[:-1]))] + rayResults = ray.get(results) # Update table - roundData['targets'] = list(chain(*[job[0] for job in results])) - roundData['decoded_spot_codes'] = list(chain(*[job[1] for job in results])) + roundData['targets'] = list(chain(*[job[0] for job in rayResults])) + roundData['spot_codes'] = list(chain(*[job[1] for job in rayResults])) - # Drop barcodes and spot_codes column (saves memory) - roundData = roundData.drop(['spot_codes', 'barcodes'], axis=1) + roundData = roundData[[len(targets) > 0 for targets in + roundData['targets']]].reset_index(drop=True) - # Remove rows that have no decoded barcodes - roundData = roundData[roundData['targets'].astype(bool)].reset_index(drop=True) + if len(roundData) > 0: + if strictness < 0: + roundData = roundData[[len(targets) <= np.abs(strictness) for targets in + roundData['targets']]].reset_index(drop=True) - # Convert spot codes back to tuples - roundData['decoded_spot_codes'] = list(map(partial(decodeSpots, roundNum=len(codebook.r)), - roundData['decoded_spot_codes'])) + roundData = roundData.drop(['barcodes'], axis=1) return roundData -def distanceFunc(subSpotCodes: list, spotCoords: dict) -> list: +@ray.remote +def distanceFunc(subSpotCodes: list, + subTargets: list, + spotCoords: dict, + spotQualDict: dict) -> tuple: ''' Subfunction for distanceFilter to allow it to run in parallel using ray @@ -469,21 +593,37 @@ def distanceFunc(subSpotCodes: list, spotCoords: dict) -> list: ''' - # Calculate spatial variances for current chunk of spot codes - allDistances = [] - for spotCodes in subSpotCodes: - distances = [] - for s, spotCode in enumerate(spotCodes): - coords = np.asarray([spotCoords[j][spot] for j, spot in enumerate(spotCode) - if spot != 0]) - # Distance is calculate as the sum of variances of the coordinates along each axis - distances.append(sum(np.var(coords, axis=0))) - allDistances.append(distances) - return allDistances + # Find minimum scoring combination of spots from set of possible combinations + constant = 2 + bestSpotCodes = [] + bestDistances = [] + bestTargets = [] + for i, codes in enumerate(subSpotCodes): + quals = [sum([spotQualDict[r][spot] for r, spot in enumerate(code) if spot != 0]) + for code in codes] + quals = np.asarray([-np.log(1 / (1 + (len(spotCoords) - qual))) for qual in quals]) + subCoords = [[spotCoords[r][spot] for r, spot in enumerate(code) if spot != 0] + for code in codes] + spaVars = [sum(np.var(np.asarray(coords), axis=0)) for coords in subCoords] + spaVars = np.asarray([-np.log(1 / (1 + spaVar)) for spaVar in spaVars]) + combined = quals + (spaVars * constant) + minInds = np.where(combined == min(combined))[0] + if len(minInds) == 1: + bestSpotCodes.append(codes[minInds[0]]) + bestDistances.append(combined[minInds[0]]) + bestTargets.append(subTargets[i][minInds[0]]) + else: + bestSpotCodes.append(-1) + bestDistances.append(-1) + bestTargets.append(-1) + + return (bestSpotCodes, bestDistances, bestTargets) def distanceFilter(roundData: pd.DataFrame, spotCoords: dict, + spotQualDict: dict, currentRound: int, + currentRoundOmitNum: int, numJobs: int) -> pd.DataFrame: ''' Function that chooses between the best barcode for each spot from the set of decodable barcodes. 
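The selection implemented in distanceFunc can be condensed into a small scoring helper (a sketch with illustrative names; `num_rounds` stands in for `len(spotCoords)` and lower scores win):

    import numpy as np

    def barcode_score(coords, qual_sum, num_rounds, constant=2):
        # Spatial variance: sum of the per-axis variances of the spot coordinates
        spa_var = np.var(np.asarray(coords), axis=0).sum()
        qual_term = -np.log(1 / (1 + (num_rounds - qual_sum)))
        var_term = -np.log(1 / (1 + spa_var))
        # constant weights spatial variance above intensity, so intensity
        # mostly serves to break near-ties
        return qual_term + var_term * constant

    print(barcode_score([(0, 10, 10), (0, 11, 9), (0, 10, 11)], 2.1, 3))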
@@ -512,63 +652,56 @@ def distanceFilter(roundData: pd.DataFrame, ''' # Calculate the spatial variance for each decodable barcode for each spot in each round - allSpotCodes = roundData['decoded_spot_codes'] - - # Calculates index ranges to chunk data by + if len(roundData) == 0: + return roundData + + if 'targets' in roundData.columns: + checkTargets = True + else: + checkTargets = False + + # Extract spot codes and targets + allSpotCodes = [decodeSpots(codes, len(spotCoords)) for codes in roundData['spot_codes']] + if checkTargets: + allTargets = roundData['targets'].tolist() + else: + allTargets = [[0 for code in codes] for codes in roundData['spot_codes']] + + # Put reference dicts into shared memory + spotCoordsID = ray.put(spotCoords) + spotQualDictID = ray.put(spotQualDict) + + # Find ranges to chunk data by ranges = [0] for i in range(1, numJobs): ranges.append(int((len(roundData) / numJobs) * i)) ranges.append(len(roundData)) chunkedSpotCodes = [allSpotCodes[ranges[i]:ranges[i + 1]] for i in range(len(ranges[:-1]))] + chunkedTargets = [allTargets[ranges[i]:ranges[i + 1]] for i in range(len(ranges[:-1]))] # Run in parallel - with Pool(processes=numJobs) as pool: - part = partial(distanceFunc, spotCoords=spotCoords) - results = pool.map(part, [list(subSpotCodes) for subSpotCodes in chunkedSpotCodes]) + results = [distanceFunc.remote(subSpotCodes, subTargets, spotCoordsID, spotQualDictID) + for subSpotCodes, subTargets in zip(chunkedSpotCodes, chunkedTargets)] + rayResults = ray.get(results) - # Add distances to decodedTables as new column - roundData['distance'] = list(chain(*[job for job in results])) + # Add distances to decodedTables as new column and replace spot_codes and targets column with + # only the min scoring values + roundData['spot_codes'] = list(chain(*[job[0] for job in rayResults])) + roundData['distance'] = list(chain(*[job[1] for job in rayResults])) + if checkTargets: + roundData['targets'] = list(chain(*[job[2] for job in rayResults])) - # Pick minimum distance barcode(s) for each spot - bestSpotCodes = [] - bestTargets = [] - bestDistances = [] - dataSpotCodes = list(roundData['decoded_spot_codes']) - dataDistances = list(roundData['distance']) - dataTargets = list(roundData['targets']) - for i in range(len(roundData)): - spotCodes = dataSpotCodes[i] - distances = dataDistances[i] - targets = dataTargets[i] - # If only one barcode to choose from, that one is picked as best - if len(distances) == 1: - bestSpotCodes.append(spotCodes) - bestTargets.append(targets) - bestDistances.append(distances) - # Otherwise find the minimum(s) - else: - mins = np.argwhere(distances == min(distances)) - bestSpotCodes.append([spotCodes[m[0]] for m in mins]) - bestTargets.append([targets[m[0]] for m in mins]) - bestDistances.append([distances[m[0]] for m in mins]) - # Create new columns with minimum distance barcode information - roundData['best_spot_codes'] = bestSpotCodes - roundData['best_targets'] = bestTargets - roundData['best_distances'] = bestDistances - - # Drop old columns - roundData = roundData.drop(['targets', 'decoded_spot_codes'], axis=1) - - # Only keep barcodes with only one minimum distance - targets = roundData['best_targets'] - keep = [i for i in range(len(roundData)) if len(targets[i]) == 1] - roundData = roundData.iloc[keep] + # Remove spots who had a tie between possible spot combinations + roundData = roundData[roundData['spot_codes'] != -1] return roundData def cleanup(bestPerSpotTables: dict, spotCoords: dict, - channelDict: dict) -> pd.DataFrame: + 
channelDict: dict, + strictness: int, + currentRoundOmitNum: int, + seedNumber: int) -> pd.DataFrame: ''' Function that combines all "best" codes for each spot in each round into a single table, filters them by their frequency (with a user-defined threshold), chooses between overlapping @@ -597,26 +730,36 @@ def cleanup(bestPerSpotTables: dict, mergedCodes = pd.DataFrame() roundNum = len(bestPerSpotTables) for r in range(roundNum): - spotCodes = bestPerSpotTables[r]['best_spot_codes'] - targets = bestPerSpotTables[r]['best_targets'] - distances = bestPerSpotTables[r]['best_distances'] - # Turn each barcode and spot code into a tuple so they can be used as dictionary keys - bestPerSpotTables[r]['best_spot_codes'] = [tuple(spotCode[0]) for spotCode in spotCodes] - bestPerSpotTables[r]['best_targets'] = [target[0] for target in targets] - bestPerSpotTables[r]['best_distances'] = [distance[0] for distance in distances] - mergedCodes = mergedCodes.append(bestPerSpotTables[r]) + if len(bestPerSpotTables[r]) != 0: + if strictness > 0: + spotCodes = bestPerSpotTables[r]['spot_codes'] + targets = bestPerSpotTables[r]['targets'] + # Turn each barcode and spot code into a tuple so they can be used as dictionary + # keys + bestPerSpotTables[r]['spot_codes'] = [tuple(spotCode[0]) for spotCode in spotCodes] + bestPerSpotTables[r]['targets'] = [target[0] for target in targets] + mergedCodes = mergedCodes.append(bestPerSpotTables[r]) mergedCodes = mergedCodes.reset_index(drop=True) + # If no codes return empty dataframe + if len(mergedCodes) == 0: + return pd.DataFrame() + # Only pass codes that are chosen as best for at least 2 of the spots that make it up - spotCodes = mergedCodes['best_spot_codes'] + spotCodes = mergedCodes['spot_codes'] counts = defaultdict(int) # type: dict for code in spotCodes: counts[code] += 1 - passing = list(set(code for code in counts if counts[code] > 1)) - passingCodes = mergedCodes[mergedCodes['best_spot_codes'].isin(passing)].reset_index(drop=True) - passingCodes = passingCodes.iloc[passingCodes['best_spot_codes'].drop_duplicates().index] + passing = list(set(code for code in counts if counts[code] >= seedNumber)) + + passingCodes = mergedCodes[mergedCodes['spot_codes'].isin(passing)].reset_index(drop=True) + passingCodes = passingCodes.iloc[passingCodes['spot_codes'].drop_duplicates().index] passingCodes = passingCodes.reset_index(drop=True) + # If no codes return empty dataframe + if len(passingCodes) == 0: + return pd.DataFrame() + # Need to find maximum independent set of spot codes where each spot code is a node and there # is an edge connecting two codes if they share at least one spot. 
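The greedy elimination described in this comment (and continued below) can be sketched as follows, assuming precomputed conflict sets and scores (all names are illustrative):

    def greedy_independent_set(edges, score):
        # edges: maps each code to the set of codes sharing a spot with it
        # score: maps each code to its distance value (higher is worse)
        edges = {code: set(nbrs) for code, nbrs in edges.items()}
        while any(edges.values()):
            # Drop the code with the most conflicts, breaking ties by worst score
            worst = max(edges, key=lambda c: (len(edges[c]), score[c]))
            for other in edges.pop(worst):
                edges[other].discard(worst)
        return set(edges)

    conflicts = {'a': {'b'}, 'b': {'a', 'c'}, 'c': {'b'}}
    print(greedy_independent_set(conflicts, {'a': 1.0, 'b': 0.5, 'c': 2.0}))  # {'a', 'c'}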
Does this by eliminating # nodes (spot codes) that have the most edges first and if there is tie for which has the most @@ -626,7 +769,7 @@ def cleanup(bestPerSpotTables: dict, # until there are no more edges in the graph # First prepare list of counters of the spot IDs for each round - spotCodes = passingCodes['best_spot_codes'] + spotCodes = passingCodes['spot_codes'] codeArray = np.asarray([np.asarray(code) for code in spotCodes]) counters = [] # type: typing.List[Counter] for r in range(roundNum): @@ -656,7 +799,7 @@ def cleanup(bestPerSpotTables: dict, # spotDict dictionary has mapping for codes to their index location in spotCodes and # codeDistance has mapping for codes to their spatial variance value spotDict = {code: i for i, code in enumerate(spotCodes)} - codeDistance = passingCodes.set_index('best_spot_codes')['best_distances'].to_dict() + codeDistance = passingCodes.set_index('spot_codes')['distance'].to_dict() while len(collisions): # Gets all the codes that have the highest value for number of edges, and then sorts them by # their spatial variance values in decreasing order @@ -688,14 +831,18 @@ def cleanup(bestPerSpotTables: dict, # Only choose codes that we found to not have any edges in the graph finalCodes = passingCodes.loc[keep].reset_index(drop=True) + if len(finalCodes) == 0: + return pd.DataFrame() + # Add barcode lables, spot coordinates, barcode center coordinates, and number of rounds used # for each barcode to table barcodes = [] allCoords = [] centers = [] roundsUsed = [] + # intensities = [] for i in range(len(finalCodes)): - spotCode = finalCodes.iloc[i]['best_spot_codes'] + spotCode = finalCodes.iloc[i]['spot_codes'] barcodes.append([channelDict[j][spot] for j, spot in enumerate(spotCode)]) counter = Counter(spotCode) # type: Counter roundsUsed.append(roundNum - counter[0]) @@ -703,10 +850,12 @@ def cleanup(bestPerSpotTables: dict, allCoords.append(coords) coords = np.asarray([coord for coord in coords]) center = np.asarray(coords).mean(axis=0) - centers.append(center) + centers.append(tuple(center)) + # intensities.append([spotIntensities[j][spot] for j,spot in enumerate(spotCode)]) finalCodes['best_barcodes'] = barcodes finalCodes['coords'] = allCoords finalCodes['center'] = centers + # finalCodes['intensities'] = intensities finalCodes['rounds_used'] = roundsUsed return finalCodes @@ -733,9 +882,10 @@ def removeUsedSpots(finalCodes: pd.DataFrame, spotTables: dict) -> dict: # Remove used spots for r in range(len(spotTables)): - usedSpots = set([passed[r] for passed in finalCodes['best_spot_codes'] + usedSpots = set([passed[r] for passed in finalCodes['spot_codes'] if passed[r] != 0]) - spotTables[r] = spotTables[r].iloc[[i for i in range(len(spotTables[r])) if i - not in usedSpots]].reset_index(drop=True) + spotTables[r] = spotTables[r][~spotTables[r]['spot_id'].isin(usedSpots)] + spotTables[r] = spotTables[r].reset_index(drop=True) + spotTables[r].index = range(1, len(spotTables[r]) + 1) return spotTables From 1d19407ef9fe33d7189b58af3bbd7716a0104459 Mon Sep 17 00:00:00 2001 From: nickeener Date: Fri, 1 Apr 2022 17:34:33 -0700 Subject: [PATCH 20/30] Updated comments --- .../spots/DecodeSpots/check_all_decoder.py | 64 ++++-- .../core/spots/DecodeSpots/check_all_funcs.py | 188 ++++++++++++++---- 2 files changed, 198 insertions(+), 54 deletions(-) diff --git a/starfish/core/spots/DecodeSpots/check_all_decoder.py b/starfish/core/spots/DecodeSpots/check_all_decoder.py index 46159c143..45997b04d 100644 --- a/starfish/core/spots/DecodeSpots/check_all_decoder.py 
+++ b/starfish/core/spots/DecodeSpots/check_all_decoder.py @@ -22,29 +22,56 @@ class CheckAll(DecodeSpotsAlgorithm): """ Decode spots by generating all possible combinations of spots to form barcodes given a radius - distance that spots must be from each other in order to form a barcode. Then chooses the best + distance that spots may be from each other in order to form a barcode. Then chooses the best set of nonoverlapping spot combinations by choosing the ones with the least spatial variance - of their spot coordinates and are also found to be best for multiple spots in the barcode - (see algorithm below). Allows for error correction rounds. + of their spot coordinates, highest intensity and are also found to be best for multiple spots + in the barcode (see algorithm below). Allows for error correction rounds. + + Two slightly different algorithms are used to balance recoving the full set of visible targets + while ensuring accuracy of the decoded targets. They share the same steps but two are switched + between the different versions. The following is for one version: (see input parmeters below) 1. For each spot in each round, find all neighbors in other rounds that are within the search radius 2. For each spot in each round, build all possible full length barcodes based on the channel labels of the spot's neighbors and itself - 3. Drop barcodes that don't have a matching target in the codebook - 4. Choose the "best" barcode of each spot's possible target matching barcodes by calculating - the sum of variances for each of the spatial coordinates of the spots that make up each barcode - and choosing the minimum distance barcode (if there is a tie, they are all dropped as - ambiguous). Each spot is assigned a "best" barcode in this way. - 5. Only keep barcodes/targets that were found as "best" using at least 2 of the spots that make - each up + 3. Choose the "best" barcode of each spot's possible barcodes by calculating a score that is + based on minimizing the spatial variance and maximizing the intensities of the spots in the + barcode. Each spot is assigned a "best" barcode in this way. + 4. Drop barcodes that don't have a matching target in the codebook + 5. Only keep barcodes/targets that were found as "best" using at least x of the spots that make + each up (x is determined by parameters) 6. Find maximum independent set (approximation) of the spot combinations so no two barcodes use the same spot - 7. Remove all spots used in decoded targets that passed the previous filtering steps from the - original set of spots - 8. Rerun steps 2-5 for barcodes that use less than the full set of rounds for codebook - matching (how many rounds can be dropped determined by error_rounds parameter) + + The other method is the same except steps 3 and 4 are switched so that the minimum scoring + barcode is chosen from the set of possible codes that have a match to the codebook. The first + method will return fewer decoded targets but has a lower false positive rate while the other + method will find more targets but at the cost of an increased false positive rate. + + Decoding is run in multiple stages with the parameters becoming less strict as it gets into + later stages. The high accuracy algorithm is always run first followed by the low accuracy + method, each with slightly different parameters based on the choice of "mode" parameter. 
After + each decoding, the spots found to be in decoded barcodes are removed from the original set of + spots before they are decoded again with a new set of parameters. In order to simplify the + number of parameters to choose from, I have sorted them into three sets of presets determined + by the "mode" parameter. + + Decoding is done multiple times at multiple distances (starting from 0) that increase + incrementally until they reach the user-specified search radius. This allows high confidence + barcodes to be called first and make things easier when later codes are called. + + If error_rounds is set to 1 (currently cannot handle more than 1), after running all decodings + for barcodes that exactly match the codebook, another set of decodings will be run to find + barcodes that are missing a spot in exactly one round. If the codes in your codebook all have a + hamming distance of at least 2 from all other codes, each can still be uniquely identified + using a partial code with a single round dropped. Barcodes decoded with a partial code like this + are inherently less accurate and so an extra dimension called "rounds_used" was added to the + DecodedIntensityTable output that labels each decoded target with the number of rounds that was + used to decode it, allowing you to easily separate these less accurate codes from your high + accuracy set. + Parameters ---------- @@ -55,6 +82,15 @@ class CheckAll(DecodeSpotsAlgorithm): error_rounds : int Maximum hamming distance a barcode can be from its target in the codebook and still be uniquely identified (i.e. number of error correction rounds in the experiment) + mode : string + One of three preset parameter sets. Choices are: "low", "med", or 'high'. Low accuracy mode + will return more decoded targets but at a cost to accuracy while the high accuracy version + will find fewer false positives but also fewer targets overall, medium is a balance between + the two. Which mode works best will differ for each dataset. + physical_coords : bool + True or False, should decoding use physical distances from the original imagestack that + you performed spot finding on. Should be used when the distance between z pixels is much + greater than the distance between x and y pixels. """ def __init__( diff --git a/starfish/core/spots/DecodeSpots/check_all_funcs.py b/starfish/core/spots/DecodeSpots/check_all_funcs.py index bed2399b6..7fcebd63b 100644 --- a/starfish/core/spots/DecodeSpots/check_all_funcs.py +++ b/starfish/core/spots/DecodeSpots/check_all_funcs.py @@ -10,7 +10,6 @@ from scipy.spatial import cKDTree from starfish.core.codebook.codebook import Codebook -from starfish.core.types import SpotFindingResults from starfish.types import Axes warnings.filterwarnings('ignore') @@ -32,16 +31,14 @@ def findNeighbors(spotTables: dict, searchRadius : float Distance that spots can be from each other and still form a barcode + numJobs : int + Number of CPU threads to use in parallel + Returns ------- dict: a dictionary with the following structure: - {round: { - spotID in round: { - neighborRound: - [list of spotIDs in neighborRound within searchRadius of spotID in round] - } - } - } + {(round1, round2): index table showing neighbors between spots in round1 and round2 + where round1 != round2} ''' allNeighborDict = {} @@ -59,8 +56,26 @@ def createNeighborDict(spotTables: dict, ''' Create dictionary of neighbors (within the search radius) in other rounds for each spot.
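The module imports scipy's cKDTree (visible in the import list above); a radius query of the kind this neighbor search performs might look roughly like this (coordinates are invented):

    import numpy as np
    from scipy.spatial import cKDTree

    # zyx coordinates of spots in two different rounds
    coords_r0 = np.array([[0., 10., 10.], [0., 50., 50.]])
    coords_r1 = np.array([[0., 11., 9.]])

    tree0, tree1 = cKDTree(coords_r0), cKDTree(coords_r1)
    # For each round-0 spot, the indices of round-1 spots within the radius
    print(tree0.query_ball_tree(tree1, r=3))  # [[0], []]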
- Dictionary has format: - neighborDict[roundNum][spotID] = {0 : neighbors in round 0, 1: neighbors in round 1, etc} + + Parameters + ---------- + + spotTables : dict + Dictionary with round labels as keys and pandas dataframes containing spot information + for its key round as values (result of _merge_spots_by_round function) + + searchRadius : float + Distance that spots can be from each other and still form a barcode + + neighborsByRadius : dict + Dictionary of outputs from findNeighbors() where each key is a radius and the value is + the findNeighbors dictionary + + Returns + ------- + + dict: a dictionary with the following structure + neighborDict[roundNum][spotID] = {0 : neighbors in round 0, 1: neighbors in round 1,etc} ''' # Create empty neighbor dictionary @@ -100,13 +115,14 @@ def createRefDicts(spotTables: dict, numJobs: int) -> tuple: Dictionary with round labels as keys and pandas dataframes containing spot information for its key round as values (result of _merge_spots_by_round function) - searchRadius : float - Distance that spots can be from each other and still form a barcode + numJobs : int + Number of CPU threads to use in parallel Returns ------- - tuple : First object is the neighbors dictionary, second is the channel dictionary, and the - third object is the spatial coordinate dictionary + tuple : First object is the channel dictionary, second is the spatial coordinate dictionary, + the third object is the raw spot instensity dictionary, and the last object is the + normalized spot intensity dictionary ''' # Create channel label and spatial coordinate dictionaries @@ -183,7 +199,7 @@ def decodeSpots(compressed: list, roundNum: int) -> list: return decompressed @ray.remote -def spotQualityFunc(spots: SpotFindingResults, +def spotQualityFunc(spots: list, spotCoords: dict, spotIntensities: dict, spotTables: dict, @@ -192,6 +208,30 @@ def spotQualityFunc(spots: SpotFindingResults, ''' Helper function for spotQuality to run in parallel w/ ray + + Parameters + ---------- + spots : list + List of spot IDs in the current round to calculate the normalized intensity of + + spotCoords : dict + Spot ID to spatial coordinate dictionary + + spotIntensities : dict + Spot ID to raw intensity dictionary + + spotTables : dict + Dictionary containing spot info tables + + channelDict : dict + Spot ID to channel label dictionary + + r : int + Current round + + Returns + ------- + list : list of normalized spot intensities of the input spot IDs ''' # Find spots in the same neighborhood (same channel and z slice and less than 100 pixels away @@ -235,6 +275,28 @@ def spotQuality(spotTables: dict, Creates dictionary mapping each spot ID to their normalized intensity value. Calculated as the spot intensity value divided by the l2 norm of the intensities of all the spots in the same neighborhood. 
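A rough, self-contained sketch of the widening fallback described above (column names follow the spot tables; the helper itself is hypothetical):

    import pandas as pd

    def neighborhood_intensities(spots: pd.DataFrame, ch, z, y, x, half=100):
        sameCh = spots[spots['c'] == ch]
        sameZ = sameCh[sameCh['z'] == z]
        near = sameZ[sameZ['y'].between(y - half, y + half - 1)
                     & sameZ['x'].between(x - half, x + half - 1)]
        # Widen the neighborhood whenever the spot is its only member
        for candidate in (near, sameZ, sameCh):
            if len(candidate) > 1:
                return candidate['intensity']
        return sameCh['intensity']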
+ + Parameters + ---------- + spotTables : dict + Dictionary containing spot info tables + + spotCoords : dict + Spot ID to spatial coordinate dictionary + + spotIntensities : dict + Spot ID to raw intensity dictionary + + channelDict : dict + Spot ID to channel label dictionary + + numJobs : int + Number of CPU threads to use in parallel + + Returns + ------- + dict : dictionary mapping spot ID to it's normalized intensity value + ''' # Place data dictionary into shared memory @@ -285,21 +347,20 @@ def barcodeBuildFunc(allNeighbors: list, channelDict : dict Dictionary mapping spot IDs to their channels labels + currentRound : int + The round that the spots being used for reference points are found in + roundOmitNum : int Maximum hamming distance a barcode can be from it's target in the codebook and still be uniquely identified (i.e. number of error correction rounds in each the experiment) - currentRound : int - The round that the spots being used for reference points are found in - roundNum : int Total number of round in experiment Returns ------- - tuple : First element is a list of the possible spot codes while the second element is - a list of the possible barcodes + list : list of the possible spot codes ''' # spotCodes are the ordered spot IDs of the spots making up each barcode while barcodes are @@ -352,6 +413,10 @@ def buildBarcodes(roundData: pd.DataFrame, channelDict : dict Dictionary with mappings between spot IDs and their channel labels + strictness: int + Determines the number of possible codes a spot is allowed to have before it is dropped + as ambiguous (if it is positive) + currentRound : int Current round to build barcodes for (same round that roundData is from) @@ -360,8 +425,9 @@ def buildBarcodes(roundData: pd.DataFrame, Returns ------- - pd.DataFrame : Copy of roundData with additional columns which list all possible barcodes - that could be made from each spot's neighbors + pd.DataFrame : Copy of roundData with an additional column which lists all the possible spot + codes that could be made from each spot's neighbors for those spots that + passed the strictness requirement (if it is positive) ''' @@ -408,16 +474,14 @@ def buildBarcodes(roundData: pd.DataFrame, return roundData @ray.remote -def decodeFunc(data: pd.DataFrame, - permutationCodes: dict, - strictness: int) -> tuple: +def decodeFunc(data: pd.DataFrame, permutationCodes: dict) -> tuple: ''' Subfunction for decoder that allows it to run in parallel chunks using ray Parameters ---------- - codes : pd.DataFrame - Two column with columns called 'barcodes' and 'spot_codes' + data : pd.DataFrame + DataFrame with columns called 'barcodes' and 'spot_codes' permutationCodes : dict Dictionary containing barcode information for each roundPermutation @@ -425,8 +489,7 @@ def decodeFunc(data: pd.DataFrame, Returns ------- tuple : First element is a list of all decoded targets, second element is a list of all - decoded barcodes,third element is a list of all decoded spot codes, and the - fourth element is a list of rounds that were omitted for each decoded barcode + decoded spot codes ''' # Checks if each barcode is in the permutationsCodes dict, if it isn't, there is no match @@ -471,7 +534,14 @@ def decoder(roundData: pd.DataFrame, codebook : Codebook starFISH Codebook object containg the barcode information for the experiment - roundOmitNum : int + channelDict : dict + Dictionary with mappings between spot IDs and their channel labels + + strictness : int + Determines the number of target matching barcodes each spot 
is allowed before it is + dropped as ambiguous (if it is negative) + + currentRoundOmitNum : int Number of rounds that can be dropped from each barcode currentRound : int @@ -549,7 +619,7 @@ def generateRoundPermutations(size: int, roundOmitNum: int) -> list: chunkedData.append(deepcopy(roundData[ranges[i]:ranges[i + 1]])) # Run in parallel - results = [decodeFunc.remote(chunkedData[i], permutationCodesID, strictness) + results = [decodeFunc.remote(chunkedData[i], permutationCodesID) for i in range(len(ranges[:-1]))] rayResults = ray.get(results) @@ -573,7 +643,8 @@ def generateRoundPermutations(size: int, roundOmitNum: int) -> list: def distanceFunc(subSpotCodes: list, subTargets: list, spotCoords: dict, - spotQualDict: dict) -> tuple: + spotQualDict: dict, + currentRoundOmitNum: int) -> tuple: ''' Subfunction for distanceFilter to allow it to run in parallel using ray @@ -583,13 +654,23 @@ def distanceFunc(subSpotCodes: list, Chunk of full list of spot codes for the current round to calculate the spatial variance for + subTargets : list + Chunk of full list of targets (0s if strictness is positive) associated with the + current set of spots whose spatial variance is being calculated + spotCoords : dict - Dictionary containing spatial locations for spots by their IDs in the original - spotTables object + Spot ID to spatial coordinate dictionary + + spotQualDict : dict + Spot ID to normalized intensity value dictionary + + currentRoundOmitNum : int + Number of rounds that can be dropped from each barcode Returns ------- - list: list of spatial variances for the current chunk of spot codes + tuple: First object is the min scoring spot code for each spot, the second is the min + score for each spot, and the third is the min scoring target for each spot ''' @@ -601,7 +682,8 @@ def distanceFunc(subSpotCodes: list, for i, codes in enumerate(subSpotCodes): quals = [sum([spotQualDict[r][spot] for r, spot in enumerate(code) if spot != 0]) for code in codes] - quals = np.asarray([-np.log(1 / (1 + (len(spotCoords) - qual))) for qual in quals]) + quals = np.asarray([-np.log(1 / (1 + (len(spotCoords) - currentRoundOmitNum - qual))) + for qual in quals]) subCoords = [[spotCoords[r][spot] for r, spot in enumerate(code) if spot != 0] for code in codes] spaVars = [sum(np.var(np.asarray(coords), axis=0)) for coords in subCoords] @@ -627,8 +709,18 @@ def distanceFilter(roundData: pd.DataFrame, numJobs: int) -> pd.DataFrame: ''' Function that chooses between the best barcode for each spot from the set of decodable barcodes.
+ Does this by choosing the barcode with the least spatial variance and high intensity spots + according to this calculation: + + Score = -log(1 / 1 + (numRounds - qualSum)) + (-log(1 / 1 + spaVar) * constant) + Where: + numRounds = number of rounds being used for decoding (total - currentRoundOmitNum) + qualSum = sum of normalized intensity values for the spots in the code + spaVar = spatial variance of spots in code, calculates as the sum of variances of the + values in each spatial dimension + constant = a constant that determines the balance between the score being more influenced + by spatial variance or intensity, set to 2 so spatial variance is the biggest + deciding factor but allows ties to be broken by intensity Parameters ---------- @@ -637,11 +729,17 @@ def distanceFilter(roundData: pd.DataFrame, round spotCoords : dict - Dictionary containing spatial coordinates of spots in each round indexed by their IDs + Spot ID to spatial coordinate dictionary + + spotQualDict : dict + Spot ID to normalized intensity value dictionary currentRound : int Current round number to calculate distances for + currentRoundOmitNum : int + Number of rounds that can be dropped from each barcode + numJobs : int Number of CPU threads to use in parallel @@ -680,7 +778,8 @@ def distanceFilter(roundData: pd.DataFrame, chunkedTargets = [allTargets[ranges[i]:ranges[i + 1]] for i in range(len(ranges[:-1]))] # Run in parallel - results = [distanceFunc.remote(subSpotCodes, subTargets, spotCoordsID, spotQualDictID) + results = [distanceFunc.remote(subSpotCodes, subTargets, spotCoordsID, spotQualDictID, + currentRoundOmitNum) for subSpotCodes, subTargets in zip(chunkedSpotCodes, chunkedTargets)] rayResults = ray.get(results) @@ -720,6 +819,15 @@ def cleanup(bestPerSpotTables: dict, channelDict : dict Dictionary with mapping between spot IDs and the channel labels + strictness : int + Parameter that determines how many possible barcodes each spot can have before it is + dropped as ambiguous + + currentRoundOmitNum : int + Number of rounds that can be dropped from each barcode + + seedNumber : A barcode must be chosen as "best" in this number of rounds to pass filters + Returns ------- pd.DataFrame : Dataframe containing final set of codes that have passed all filters From ec859a14ce79e514cd8bd5a12b31ed277ef264c4 Mon Sep 17 00:00:00 2001 From: nickeener Date: Thu, 5 May 2022 15:44:43 -0700 Subject: [PATCH 21/30] Updated comments and fixed mypy errors --- .../spots/DecodeSpots/check_all_decoder.py | 124 ++++++++++-------- .../core/spots/DecodeSpots/check_all_funcs.py | 81 ++++++------ 2 files changed, 112 insertions(+), 93 deletions(-) diff --git a/starfish/core/spots/DecodeSpots/check_all_decoder.py b/starfish/core/spots/DecodeSpots/check_all_decoder.py index 45997b04d..1f948fe45 100644 --- a/starfish/core/spots/DecodeSpots/check_all_decoder.py +++ b/starfish/core/spots/DecodeSpots/check_all_decoder.py @@ -21,17 +21,18 @@ class CheckAll(DecodeSpotsAlgorithm): """ - Decode spots by generating all possible combinations of spots to form barcodes given a radius - distance that spots may be from each other in order to form a barcode. Then chooses the best - set of nonoverlapping spot combinations by choosing the ones with the least spatial variance - of their spot coordinates, highest intensity and are also found to be best for multiple spots - in the barcode (see algorithm below). Allows for error correction rounds. 
+ Decode spots by generating all possible combinations of neighboring spots to form barcodes + given a radius distance that spots may be from each other in order to form a barcode. Then + chooses the best set of nonoverlapping spot combinations by choosing the ones with the least + spatial variance of their spot coordinates, highest normalized intensity and are also found + to be best for multiple spots in the barcode (see algorithm below). Allows for one error + correction round (option for more may be added in the future). + + Two slightly different algorithms are used to balance the precision (proportion of targets that + represent true mRNA molecules) and recall (proportion of true mRNA molecules that are + recovered). They share mostly the same steps but two are switched between the different + versions. The following is for the "filter-first" version: - Two slightly different algorithms are used to balance recoving the full set of visible targets - while ensuring accuracy of the decoded targets. They share the same steps but two are switched - between the different versions. The following is for one version: - - (see input parmeters below) 1. For each spot in each round, find all neighbors in other rounds that are within the search radius 2. For each spot in each round, build all possible full length barcodes based on the channel @@ -39,38 +40,39 @@ class CheckAll(DecodeSpotsAlgorithm): 3. Choose the "best" barcode of each spot's possible barcodes by calculating a score that is based on minimizing the spatial variance and maximizing the intensities of the spots in the barcode. Each spot is assigned a "best" barcode in this way. - 4. Drop barcodes that don't have a matching target in the codebook + 4. Drop "best" barcodes that don't have a matching target in the codebook 5. Only keep barcodes/targets that were found as "best" using at least x of the spots that make each up (x is determined by parameters) 6. Find maximum independent set (approximation) of the spot combinations so no two barcodes use the same spot - The other method is the same except steps 3 and 4 are switched so that the minimum scoring - barcode is chosen from the set of possible codes that have a match to the codebook. The first - method will return fewer decoded targets but has a lower false positive rate while the other - method will find more targets but at the cost of an increased false positive rate. + The other method (which I'll call "decode-first") is the same except steps 3 and 4 are switched + so that the minimum scoring barcode is chosen from the set of possible codes that have a match + to the codebook. The filter-first method will return fewer decoded targets (lower recall) but + has a lower false positive rate (higher precision) while the other method will find more targets + (higher recall) but at the cost of an increased false positive rate (lower precision). Decoding is run in multiple stages with the parameters becoming less strict as it gets into - later stages. The high accuracy algorithm is always run first followed by the low accuracy - method, each with slightly different parameters based on the choice of "mode" parameter. After - each decoding, the spots found to be in decoded barcodes are removed from the original set of - spots before they are decoded again with a new set of parameters. In order to simplify the - number of parameters to choose from, I have sorted them into three sets of presets determined - by the "mode" parameter. + later stages. 
The high accuracy algorithm (filter-first) is always run first followed by the low + accuracy method (decode-first), each with slightly different parameters based on the choice of + "mode" parameter. After each decoding, the spots found to be in decoded barcodes are removed + from the original set of spots before they are decoded again with a new set of parameters. In + order to simplify the number of parameters to choose from, I have sorted them into three sets of + presets ("high", "medium", or "low" accuracy) determined by the "mode" parameter. - Decoding is done multiple times at multiple distances (starting from 0) that increase - incrementally until they reach the user-specified search radius. This allows high confidence - barcodes to be called first and make things easier when later codes are called. + Decoding is also done multiple times at multiple search radius values that start at 0 and + increase incrementally until they reach the user-specified search radius. This allows high + confidence barcodes to be called first and make things easier when later codes are called. If error_rounds is set to 1 (currently cannot handle more than 1), after running all decodings for barcodes that exactly match the codebook, another set of decodings will be run to find - barcodes that are missing a spot in exactly one round. If the codes in your codebook all have a + barcodes that are missing a spot in exactly one round. If the codes in the codebook all have a hamming distance of at least 2 from all other codes, each can still be uniquely identified using a partial code with a single round dropped. Barcodes decoded with a partial code like this are inherently less accurate and so an extra dimension called "rounds_used" was added to the DecodedIntensityTable output that labels each decoded target with the number of rounds that was used to decode it, allowing you to easily separate these less accurate codes from your high - accuracy set. + accuracy set if you wish. + Parameters ---------- codebook : Codebook Contains codes to decode IntensityTable search_radius : float - Number of pixels over which to search for spots in other rounds and channels. + Maximum allowed distance (in pixels) that spots in different rounds can be from each other + and still be combined into a barcode together error_rounds : int Maximum hamming distance a barcode can be from its target in the codebook and still be uniquely identified (i.e. number of error correction rounds in the experiment) mode : string One of three preset parameter sets. Choices are: "low", "med", or 'high'. Low accuracy mode - will return more decoded targets but at a cost to accuracy while the high accuracy version - will find fewer false positives but also fewer targets overall, medium is a balance between - the two. Which mode works best will differ for each dataset. + will return more decoded targets but at a cost to accuracy (high recall, low precision) + while the high accuracy version will find fewer false positives but also fewer targets + overall (high precision, low recall), medium is a balance between the two. physical_coords : bool True or False, should decoding use physical distances from the original imagestack that - you performed spot finding on. Should be used when the distance between z pixels is much + you performed spot finding on? Should be used when the distance between z pixels is much greater than the distance between x and y pixels.
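A typical invocation might look like the following sketch, assuming `codebook` and `spot_finding_results` already exist and with the public import path and keyword names inferred from the surrounding code rather than confirmed by it:

    from starfish.spots import DecodeSpots

    decoder = DecodeSpots.CheckAll(
        codebook=codebook,      # starfish Codebook for the experiment
        search_radius=3,        # max pixel distance between spots in a barcode
        error_rounds=1,         # allow barcodes with one dropped round
        mode='med',             # precision/recall preset
        physical_coords=True,   # score distances in physical units
    )
    decoded = decoder.run(spots=spot_finding_results, n_processes=8)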
""" @@ -124,7 +127,7 @@ def run(self, *args) -> DecodedIntensityTable: """ Decode spots by finding the set of nonoverlapping barcodes that have the minimum spatial - variance within each barcode + variance within each barcode. Parameters ---------- @@ -137,7 +140,7 @@ def run(self, Returns ------- DecodedIntensityTable : - IntensityTable decoded and appended with Features.TARGET and Features.QUALITY values. + IntensityTable decoded and appended with Features.TARGET values. """ @@ -161,6 +164,7 @@ def run(self, if counter[0] > self.errorRounds: exit('Not enough spots to form a barcode') + # If using physical coordinates, extract z and xy scales and check that they are all > 0 if self.physicalCoords: physicalCoords = spots.physical_coord_ranges if len(physicalCoords['z'].data) > 1: @@ -188,7 +192,8 @@ def run(self, # increase in neighborhood size set1 = False zs = set() - [zs.update(spotTables[r]['z']) for r in range(len(spotTables))] + for r in range(len(spotTables)): + zs.update(spotTables[r]['z']) if self.physicalCoords: if zScale < self.searchRadius or len(zs) > 1: set1 = True @@ -205,7 +210,8 @@ def run(self, maxRadii = allSearchRadii[(allSearchRadii - self.searchRadius) <= 0][-1] radiusSet = allSearchRadii[allSearchRadii <= maxRadii] - # Calculate neighbors for each radius in the set + # Calculate neighbors for each radius in the set (done only once and referred back to + # throughout decodings) neighborsByRadius = {} for searchRadius in radiusSet: if self.physicalCoords: @@ -226,7 +232,9 @@ def run(self, # Set list of round omission numbers to loop through roundOmits = range(self.errorRounds + 1) - # Set parameters according to presets + # Set parameters according to presets (determined empirically). Strictness value determines + # the decoding method used and the allowed number of possible barcode choices (positive + # for filter-first, negative for decode-first). 
if self.mode == 'high': strictnesses = [50, -1] seedNumbers = [len(spotTables) - 1, len(spotTables)] @@ -251,17 +259,21 @@ def run(self, else: exit('Invalid mode choice ("high", "med", or "low")') - # Decode for each round omission number, store results in allCodes table + # Decode for each round omission number, intensity cutoff, and then search radius allCodes = pd.DataFrame() - for s, strictness in enumerate(strictnesses): - seedNumber = seedNumbers[s] - for currentRoundOmitNum in roundOmits: + for currentRoundOmitNum in roundOmits: + for s, strictness in enumerate(strictnesses): + + # Set seedNumber according to parameters for this strictness value + seedNumber = seedNumbers[s] + + # First decodes only the highest normalized intensity spots then adds in the rest for intVal in range(50, -1, -50): + # First check that there are enough spots left otherwise an error will occur spotsPerRound = [len(spotTables[r]) for r in range(len(spotTables))] counter = Counter(spotsPerRound) condition3 = True if counter[0] > currentRoundOmitNum else False - if not condition3: # Subset spots by intensity, start with top 50% then decode again with all currentTables = {} @@ -272,6 +284,8 @@ def run(self, # Decode each radius and remove spots found in each decoding before the next for sr, searchRadius in enumerate(radiusSet): + + # Scale radius by xy scale if needed if self.physicalCoords: searchRadius = round(searchRadius * xScale, 5) @@ -298,8 +312,9 @@ def run(self, # roundData will carry the possible barcode info for each spot in # the current round being examined roundData = deepcopy(currentTables[r]) - roundData = roundData.drop(['intensity', 'z', 'y', 'x', 'radius', - 'c', 'spot_quals'], axis=1) + + # Drop all but the spot_id column + roundData = roundData[['spot_id']] # From each spot's neighbors, create all possible combinations that # would form a barocde with the correct number of rounds. Adds @@ -308,19 +323,17 @@ def run(self, currentRoundOmitNum, channelDict, strictness, r, numJobs) - # When strictness is positive, distanceFilter is run first on all - # the potential barcodes to choose the one with the minimum score - # (based on spatial variance of the spots and their intensities) - # which are then matched to the codebook. Spots that have more - # possible barcodes to choose between than the current strictness - # number are dropped as ambiguous. If strictness is negative, all + # When strictness is positive the filter-first methods is used and + # distanceFilter is run first on all the potential barcodes to + # choose the one with the minimum score (based on spatial variance + # of the spots and their intensities) which are then matched to the + # codebook. Spots that have more possible barcodes to choose between + # than the current strictnessnumber are dropped as ambiguous. If + # strictness is negative, the decode-first method is run where all # the possible barcodes are instead first matched to the codebook # and then the lowest scoring decodable spot combination is chosen # for each spot. Spots that have more decodable barcodes to choose # from than the strictness value (absolute value) are dropped. 
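The two orderings this comment contrasts can be summarized in a small illustrative helper (not part of the patch; `score` and `in_codebook` are assumed callables):

    def choose_barcode(possible_barcodes, score, in_codebook, strictness):
        if strictness > 0:
            # filter-first: pick the best-scoring barcode, then require a match
            best = min(possible_barcodes, key=score)
            return best if in_codebook(best) else None
        # decode-first: keep only codebook matches, then pick the best-scoring one
        decodable = [b for b in possible_barcodes if in_codebook(b)]
        return min(decodable, key=score) if decodable else None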
- # Positive strictness method has lower false positive rate but - # finds fewer targets while the negative strictness method has - # higher false positive rates but finds more targets if strictness > 0: # Choose most likely combination of spots for each seed spot @@ -351,7 +364,7 @@ def run(self, decodedTables[r] = roundData # Turn spot table dictionary into single table, filter barcodes by - # round frequency, add additional information, and choose between + # the seed number, add additional information, and choose between # barcodes that have overlapping spots finalCodes = cleanup(decodedTables, spotCoords, channelDict, strictness, currentRoundOmitNum, seedNumber) @@ -402,14 +415,13 @@ def run(self, int_table.values = np.asarray(table_codes) int_table = transfer_physical_coords_to_intensity_table(intensity_table=int_table, spots=spots) - intensities = int_table.transpose('features', 'r', 'c') # Validate results are correct shape - self.codebook._validate_decode_intensity_input_matches_codebook_shape(intensities) + self.codebook._validate_decode_intensity_input_matches_codebook_shape(int_table) # Create DecodedIntensityTable result = DecodedIntensityTable.from_intensity_table( - intensities, + int_table, targets=(Features.AXIS, allCodes['targets'].astype('U')), distances=(Features.AXIS, allCodes["distance"]), passes_threshold=(Features.AXIS, np.full(len(allCodes), True)), diff --git a/starfish/core/spots/DecodeSpots/check_all_funcs.py b/starfish/core/spots/DecodeSpots/check_all_funcs.py index 7fcebd63b..2bff7a7d5 100644 --- a/starfish/core/spots/DecodeSpots/check_all_funcs.py +++ b/starfish/core/spots/DecodeSpots/check_all_funcs.py @@ -148,6 +148,7 @@ def createRefDicts(spotTables: dict, numJobs: int) -> tuple: return channelDict, spotCoords, spotIntensities, spotQualDict def encodeSpots(spotCodes: list) -> list: + ''' For compressing spot ID codes into single integers. Saves memory. The number of digits in each ID is counted and these integer lengths and concatenated into a string in the same @@ -171,6 +172,7 @@ def encodeSpots(spotCodes: list) -> list: return compressed def decodeSpots(compressed: list, roundNum: int) -> list: + ''' Reconverts compressed spot codes back into their roundNum length tupes of integers with the same order and IDs as their original source. 
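The compression scheme these docstrings describe is easiest to see in a toy round trip (hypothetical helpers mirroring encodeSpots/decodeSpots):

    def encode_spot_code(code):
        # Digit lengths first, then the concatenated IDs: (12, 5, 103) -> 213125103
        lengths = ''.join(str(len(str(spot))) for spot in code)
        ids = ''.join(str(spot) for spot in code)
        return int(lengths + ids)

    def decode_spot_code(compressed, round_num):
        s = str(compressed)
        lengths, rest, code = s[:round_num], s[round_num:], []
        for n in map(int, lengths):
            code.append(int(rest[:n]))
            rest = rest[n:]
        return tuple(code)

    assert decode_spot_code(encode_spot_code((12, 5, 103)), 3) == (12, 5, 103)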
First roundNum values in the compressed @@ -189,8 +191,8 @@ def decodeSpots(compressed: list, roundNum: int) -> list: Returns ------- list: List of recovered spot codes in their original tuple form - ''' + strs = [str(intStr) for intStr in compressed] idxs, nums = list(zip(*[(map(int, s[:roundNum]), [iter(s[roundNum:])] * roundNum) for s in strs])) @@ -296,7 +298,6 @@ def spotQuality(spotTables: dict, Returns ------- dict : dictionary mapping spot ID to it's normalized intensity value - ''' # Place data dictionary into shared memory @@ -306,7 +307,7 @@ def spotQuality(spotTables: dict, channelDictID = ray.put(channelDict) # Calculate normalize spot intensities for each spot in each round - spotQuals = {} + spotQuals = {} # type: dict for r in range(len(spotTables)): roundSpots = spotTables[r]['spot_id'] spotQuals[r] = {} @@ -335,7 +336,8 @@ def barcodeBuildFunc(allNeighbors: list, channelDict: dict, currentRound: int, roundOmitNum: int, - roundNum: int) -> tuple: + roundNum: int) -> list: + ''' Subfunction to buildBarcodes that allows it to run in parallel chunks @@ -376,7 +378,7 @@ def barcodeBuildFunc(allNeighbors: list, # Creates all possible spot code combinations from neighbors codes = list(product(*neighborLists)) # Only save the ones with the correct number of dropped rounds - counters = [Counter(code) for code in codes] + counters = [Counter(code) for code in codes] # type: typing.List[Counter] spotCodes = [code for j, code in enumerate(codes) if counters[j][0] == roundOmitNum] spotCodes = [code for code in spotCodes if code[currentRound] != 0] @@ -428,7 +430,6 @@ def buildBarcodes(roundData: pd.DataFrame, pd.DataFrame : Copy of roundData with an additional column which lists all the possible spot codes that could be made from each spot's neighbors for those spots that passed the strictness requirement (if it is positive) - ''' # Only keep spots that have enough neighbors to form a barcode (determined by the total number @@ -473,8 +474,35 @@ def buildBarcodes(roundData: pd.DataFrame, return roundData +def generateRoundPermutations(size: int, roundOmitNum: int) -> list: + + ''' + Creates list of lists of logicals detailing the rounds to be used for decoding based on the + current roundOmitNum + + Parameters + ---------- + size : int + Number of rounds in experiment + + roundOmitNum: int + Number of rounds that can be dropped from each barcode + + Returns + ------- + list : list of lists of logicals detailing the rounds to be used for decoding based on + the current roundOmitNum + ''' + + if roundOmitNum == 0: + return [tuple([True] * size)] + else: + return sorted(set(list(permutations([*([False] * roundOmitNum), + *([True] * (size - roundOmitNum))])))) + @ray.remote def decodeFunc(data: pd.DataFrame, permutationCodes: dict) -> tuple: + ''' Subfunction for decoder that allows it to run in parallel chunks using ray @@ -516,10 +544,11 @@ def decodeFunc(data: pd.DataFrame, permutationCodes: dict) -> tuple: def decoder(roundData: pd.DataFrame, codebook: Codebook, channelDict: dict, - strictness: str, + strictness: int, currentRoundOmitNum: int, currentRound: int, numJobs: int) -> pd.DataFrame: + ''' Function that takes spots tables with possible barcodes added and matches each to the codebook to identify any matches. 
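The codebook matching itself reduces to dictionary lookups on hashed channel sequences, roughly like this (toy two-target codebook; the real code builds one dictionary per allowed round permutation):

    import numpy as np

    targets = ['geneA', 'geneB']
    channelCodes = np.array([[0, 2, 1], [1, 0, 2]])  # one row per target
    # Codes are made 1-based so a dropped round (0) cannot collide
    lookup = {hash(tuple(code + 1)): t for code, t in zip(channelCodes, targets)}

    candidate = (1, 3, 2)  # channel labels of one candidate barcode
    print(lookup.get(hash(tuple(candidate))))  # geneA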
Matches are added to the spot tables and spots without any matches are @@ -556,30 +585,6 @@ def decoder(roundData: pd.DataFrame, barcodes ''' - def generateRoundPermutations(size: int, roundOmitNum: int) -> list: - ''' - Creates list of lists of logicals detailing the rounds to be used for decoding based on the - current roundOmitNum - - Parameters - ---------- - size : int - Number of rounds in experiment - - roundOmitNum: int - Number of rounds that can be dropped from each barcode - - Returns - ------- - list : list of lists of logicals detailing the rounds to be used for decoding based on - the current roundOmitNum - ''' - if roundOmitNum == 0: - return [tuple([True] * size)] - else: - return sorted(set(list(permutations([*([False] * roundOmitNum), - *([True] * (size - roundOmitNum))])))) - # Add barcodes column by mapping spotIDs in spot_codes to channel labels using channelDict if strictness > 0: roundData['barcodes'] = [[hash(tuple([channelDict[j][spot] for j, spot in @@ -645,6 +650,7 @@ def distanceFunc(subSpotCodes: list, spotCoords: dict, spotQualDict: dict, currentRoundOmitNum: int) -> tuple: + ''' Subfunction for distanceFilter to allow it to run in parallel using ray @@ -671,7 +677,6 @@ def distanceFunc(subSpotCodes: list, ------- tuple: First object is the min scoring spot code for each spots, the second is the min score for each spot, and the third is the min scoring target for each spot - ''' # Find minimum scoring combination of spots from set of possible combinations @@ -682,13 +687,13 @@ def distanceFunc(subSpotCodes: list, for i, codes in enumerate(subSpotCodes): quals = [sum([spotQualDict[r][spot] for r, spot in enumerate(code) if spot != 0]) for code in codes] - quals = np.asarray([-np.log(1 / (1 + (len(spotCoords) - currentRoundOmitNum - qual))) - for qual in quals]) + newQuals = np.asarray([-np.log(1 / (1 + (len(spotCoords) - currentRoundOmitNum - qual))) + for qual in quals]) subCoords = [[spotCoords[r][spot] for r, spot in enumerate(code) if spot != 0] for code in codes] spaVars = [sum(np.var(np.asarray(coords), axis=0)) for coords in subCoords] - spaVars = np.asarray([-np.log(1 / (1 + spaVar)) for spaVar in spaVars]) - combined = quals + (spaVars * constant) + newSpaVars = np.asarray([-np.log(1 / (1 + spaVar)) for spaVar in spaVars]) + combined = newQuals + (newSpaVars * constant) minInds = np.where(combined == min(combined))[0] if len(minInds) == 1: bestSpotCodes.append(codes[minInds[0]]) @@ -707,6 +712,7 @@ def distanceFilter(roundData: pd.DataFrame, currentRound: int, currentRoundOmitNum: int, numJobs: int) -> pd.DataFrame: + ''' Function that chooses between the best barcode for each spot from the set of decodable barcodes. 
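(In sketch form, the score minimized by distanceFunc above is, per candidate code,

    score = log(1 + (numRounds - currentRoundOmitNum - totalQual))
            + 2 * log(1 + spatialVariance)

using the identity -log(1/(1+x)) = log(1+x) and constant = 2, where totalQual sums the normalized intensities of the spots in the code and spatialVariance sums the per-axis variance of their coordinates; candidates that tie for the minimum are marked ambiguous and dropped.)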
Does this by choosing the barcode with the least spatial variance and high intensity spots @@ -801,6 +807,7 @@ def cleanup(bestPerSpotTables: dict, strictness: int, currentRoundOmitNum: int, seedNumber: int) -> pd.DataFrame: + ''' Function that combines all "best" codes for each spot in each round into a single table, filters them by their frequency (with a user-defined threshold), chooses between overlapping @@ -831,7 +838,6 @@ def cleanup(bestPerSpotTables: dict, Returns ------- pd.DataFrame : Dataframe containing final set of codes that have passed all filters - ''' # Create merged spot results dataframe containing the passing barcodes found in all the rounds @@ -969,6 +975,7 @@ def cleanup(bestPerSpotTables: dict, return finalCodes def removeUsedSpots(finalCodes: pd.DataFrame, spotTables: dict) -> dict: + ''' Remove spots found to be in barcodes for the current round omission number from the spotTables so they are not used for the next round omission number From 027b7cf4745768dca289471509ec9b4ab41d1c48 Mon Sep 17 00:00:00 2001 From: nickeener Date: Mon, 9 May 2022 14:35:25 -0700 Subject: [PATCH 22/30] Swapped ray multiprocessing w/ standard Python --- .../spots/DecodeSpots/check_all_decoder.py | 7 -- .../core/spots/DecodeSpots/check_all_funcs.py | 76 ++++++++----------- 2 files changed, 30 insertions(+), 53 deletions(-) diff --git a/starfish/core/spots/DecodeSpots/check_all_decoder.py b/starfish/core/spots/DecodeSpots/check_all_decoder.py index 1f948fe45..c7566a2d5 100644 --- a/starfish/core/spots/DecodeSpots/check_all_decoder.py +++ b/starfish/core/spots/DecodeSpots/check_all_decoder.py @@ -5,7 +5,6 @@ import numpy as np import pandas as pd -import ray from starfish.core.codebook.codebook import Codebook from starfish.core.intensity_table.decoded_intensity_table import DecodedIntensityTable @@ -151,9 +150,6 @@ def run(self, if numJobs < 0 or not isinstance(numJobs, int): sys.exit('n_process must be a positive integer') - # Initialize ray for multi_processing - ray.init(num_cpus=numJobs, ignore_reinit_error=True) - # Create dictionary where keys are round labels and the values are pandas dataframes # containing information on the spots found in that round spotTables = _merge_spots_by_round(spots) @@ -380,9 +376,6 @@ def run(self, # Append found codes to allCodes table allCodes = allCodes.append(finalCodes).reset_index(drop=True) - # Shutdown ray - ray.shutdown() - # Create and fill in intensity table channels = spots.ch_labels rounds = spots.round_labels diff --git a/starfish/core/spots/DecodeSpots/check_all_funcs.py b/starfish/core/spots/DecodeSpots/check_all_funcs.py index 2bff7a7d5..f99f4c461 100644 --- a/starfish/core/spots/DecodeSpots/check_all_funcs.py +++ b/starfish/core/spots/DecodeSpots/check_all_funcs.py @@ -1,12 +1,13 @@ import typing import warnings from collections import Counter, defaultdict +from concurrent.futures.process import ProcessPoolExecutor from copy import deepcopy +from functools import partial from itertools import chain, islice, permutations, product import numpy as np import pandas as pd -import ray from scipy.spatial import cKDTree from starfish.core.codebook.codebook import Codebook @@ -200,7 +201,6 @@ def decodeSpots(compressed: list, roundNum: int) -> list: for j in range(len(idxs))] return decompressed -@ray.remote def spotQualityFunc(spots: list, spotCoords: dict, spotIntensities: dict, @@ -209,7 +209,7 @@ def spotQualityFunc(spots: list, r: int) -> list: ''' - Helper function for spotQuality to run in parallel w/ ray + Helper function for spotQuality 
to run in parallel Parameters ---------- @@ -300,12 +300,6 @@ def spotQuality(spotTables: dict, dict : dictionary mapping spot ID to it's normalized intensity value ''' - # Place data dictionary into shared memory - spotCoordsID = ray.put(spotCoords) - spotIntsID = ray.put(spotIntensities) - spotTablesID = ray.put(spotTables) - channelDictID = ray.put(channelDict) - # Calculate normalize spot intensities for each spot in each round spotQuals = {} # type: dict for r in range(len(spotTables)): @@ -320,18 +314,18 @@ def spotQuality(spotTables: dict, chunkedSpots = [roundSpots[ranges[i]:ranges[i + 1]] for i in range(len(ranges[:-1]))] # Run in parallel - results = [spotQualityFunc.remote(subSpots, spotCoordsID, spotIntsID, spotTablesID, - channelDictID, r) - for subSpots in chunkedSpots] - rayResults = ray.get(results) + with ProcessPoolExecutor() as pool: + part = partial(spotQualityFunc, spotCoords=spotCoords, spotIntensities=spotIntensities, + spotTables=spotTables, channelDict=channelDict, r=r) + poolMap = pool.map(part, [subSpots for subSpots in chunkedSpots]) + results = [x for x in poolMap] # Extract results - for spot, qual in zip(roundSpots, list(chain(*rayResults))): + for spot, qual in zip(roundSpots, list(chain(*results))): spotQuals[r][spot] = qual return spotQuals -@ray.remote def barcodeBuildFunc(allNeighbors: list, channelDict: dict, currentRound: int, @@ -450,9 +444,6 @@ def buildBarcodes(roundData: pd.DataFrame, # Find all possible barcodes for the spots in each round by splitting each round's spots into # numJob chunks and constructing each chunks barcodes in parallel - # Save the current round's data table and the channelDict to ray memory - channelDictID = ray.put(channelDict) - # Calculates index ranges to chunk data by ranges = [0] for i in range(1, numJobs + 1): @@ -461,16 +452,17 @@ def buildBarcodes(roundData: pd.DataFrame, range(len(ranges[:-1]))] # Run in parallel - results = [barcodeBuildFunc.remote(chunkedNeighbors[i], channelDictID, currentRound, - roundOmitNum, roundNum) - for i in range(len(chunkedNeighbors))] - rayResults = ray.get(results) + with ProcessPoolExecutor() as pool: + part = partial(barcodeBuildFunc, channelDict=channelDict, currentRound=currentRound, + roundOmitNum=roundOmitNum, roundNum=roundNum) + poolMap = pool.map(part, [chunkedNeighbors[i] for i in range(len(chunkedNeighbors))]) + results = [x for x in poolMap] # Drop unneeded columns (saves memory) roundData = roundData.drop(['neighbors', 'spot_id'], axis=1) # Add possible spot codes to spot table (must chain results from different jobs together) - roundData['spot_codes'] = list(chain(*[job for job in rayResults])) + roundData['spot_codes'] = list(chain(*[job for job in results])) return roundData @@ -500,11 +492,10 @@ def generateRoundPermutations(size: int, roundOmitNum: int) -> list: return sorted(set(list(permutations([*([False] * roundOmitNum), *([True] * (size - roundOmitNum))])))) -@ray.remote def decodeFunc(data: pd.DataFrame, permutationCodes: dict) -> tuple: ''' - Subfunction for decoder that allows it to run in parallel chunks using ray + Subfunction for decoder that allows it to run in parallel chunks Parameters ---------- @@ -612,9 +603,6 @@ def decoder(roundData: pd.DataFrame, roundDict = dict(zip([hash(tuple(code)) for code in codes.data], codes['target'].data)) permCodeDict.update(roundDict) - # Put data table and permutations codes dictionary in ray storage - permutationCodesID = ray.put(permCodeDict) - # Calculates index ranges to chunk data by and creates list of chunked 
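Note: the chunk-and-map pattern shared by spotQuality, buildBarcodes, decoder and distanceFilter, reduced to a minimal self-contained sketch (toy worker and data, not part of the module):

    from concurrent.futures.process import ProcessPoolExecutor
    from functools import partial
    from itertools import chain

    def worker(chunk, offset):
        # stand-in for spotQualityFunc / barcodeBuildFunc / decodeFunc
        return [x + offset for x in chunk]

    if __name__ == '__main__':
        data = list(range(100))
        numJobs = 4
        # same slicing arithmetic as the surrounding code: numJobs
        # roughly equal index ranges, then one slice per range
        ranges = ([round((len(data) / numJobs) * i) for i in range(numJobs)]
                  + [len(data)])
        chunks = [data[ranges[i]:ranges[i + 1]] for i in range(len(ranges) - 1)]
        with ProcessPoolExecutor() as pool:
            results = list(pool.map(partial(worker, offset=10), chunks))
        flat = list(chain(*results))  # reassemble results in original order

The worker must live at module top level so the process pool can pickle it, which is why each parallel step in this module is its own top-level function.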
data to loop through ranges = [0] for i in range(1, numJobs + 1): @@ -624,13 +612,14 @@ def decoder(roundData: pd.DataFrame, chunkedData.append(deepcopy(roundData[ranges[i]:ranges[i + 1]])) # Run in parallel - results = [decodeFunc.remote(chunkedData[i], permutationCodesID) - for i in range(len(ranges[:-1]))] - rayResults = ray.get(results) + with ProcessPoolExecutor() as pool: + part = partial(decodeFunc, permutationCodes=permCodeDict, strictness=strictness) + poolMap = pool.map(part, [chunkedData[i] for i in range(len(chunkedData))]) + results = [x for x in poolMap] # Update table - roundData['targets'] = list(chain(*[job[0] for job in rayResults])) - roundData['spot_codes'] = list(chain(*[job[1] for job in rayResults])) + roundData['targets'] = list(chain(*[job[0] for job in results])) + roundData['spot_codes'] = list(chain(*[job[1] for job in results])) roundData = roundData[[len(targets) > 0 for targets in roundData['targets']]].reset_index(drop=True) @@ -644,7 +633,6 @@ def decoder(roundData: pd.DataFrame, return roundData -@ray.remote def distanceFunc(subSpotCodes: list, subTargets: list, spotCoords: dict, @@ -652,7 +640,7 @@ def distanceFunc(subSpotCodes: list, currentRoundOmitNum: int) -> tuple: ''' - Subfunction for distanceFilter to allow it to run in parallel using ray + Subfunction for distanceFilter to allow it to run in parallel Parameters ---------- @@ -771,10 +759,6 @@ def distanceFilter(roundData: pd.DataFrame, else: allTargets = [[0 for code in codes] for codes in roundData['spot_codes']] - # Put reference dicts into shared memory - spotCoordsID = ray.put(spotCoords) - spotQualDictID = ray.put(spotQualDict) - # Find ranges to chunk data by ranges = [0] for i in range(1, numJobs): @@ -784,17 +768,17 @@ def distanceFilter(roundData: pd.DataFrame, chunkedTargets = [allTargets[ranges[i]:ranges[i + 1]] for i in range(len(ranges[:-1]))] # Run in parallel - results = [distanceFunc.remote(subSpotCodes, subTargets, spotCoordsID, spotQualDictID, - currentRoundOmitNum) - for subSpotCodes, subTargets in zip(chunkedSpotCodes, chunkedTargets)] - rayResults = ray.get(results) + with ProcessPoolExecutor() as pool: + part = partial(distanceFunc, spotCoords=spotCoords, spotQualDict=spotQualDict) + poolMap = pool.map(part, [spotsAndTargets for spotsAndTargets in zip(chunkedSpotCodes, chunkedTargets)]) + results = [x for x in poolMap] # Add distances to decodedTables as new column and replace spot_codes and targets column with # only the min scoring values - roundData['spot_codes'] = list(chain(*[job[0] for job in rayResults])) - roundData['distance'] = list(chain(*[job[1] for job in rayResults])) + roundData['spot_codes'] = list(chain(*[job[0] for job in results])) + roundData['distance'] = list(chain(*[job[1] for job in results])) if checkTargets: - roundData['targets'] = list(chain(*[job[2] for job in rayResults])) + roundData['targets'] = list(chain(*[job[2] for job in results])) # Remove spots who had a tie between possible spot combinations roundData = roundData[roundData['spot_codes'] != -1] From 9c383d33a918fb4cdd0ff3a9cfe4b8c14843951f Mon Sep 17 00:00:00 2001 From: nickeener Date: Mon, 16 May 2022 21:38:34 -0700 Subject: [PATCH 23/30] fixed function arg bug --- starfish/core/spots/DecodeSpots/check_all_funcs.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/starfish/core/spots/DecodeSpots/check_all_funcs.py b/starfish/core/spots/DecodeSpots/check_all_funcs.py index f99f4c461..ce892bb19 100644 --- a/starfish/core/spots/DecodeSpots/check_all_funcs.py 
+++ b/starfish/core/spots/DecodeSpots/check_all_funcs.py @@ -613,7 +613,7 @@ def decoder(roundData: pd.DataFrame, # Run in parallel with ProcessPoolExecutor() as pool: - part = partial(decodeFunc, permutationCodes=permCodeDict, strictness=strictness) + part = partial(decodeFunc, permutationCodes=permCodeDict) poolMap = pool.map(part, [chunkedData[i] for i in range(len(chunkedData))]) results = [x for x in poolMap] @@ -633,8 +633,7 @@ def decoder(roundData: pd.DataFrame, return roundData -def distanceFunc(subSpotCodes: list, - subTargets: list, +def distanceFunc(spotsAndTargets: list, spotCoords: dict, spotQualDict: dict, currentRoundOmitNum: int) -> tuple: @@ -667,6 +666,9 @@ def distanceFunc(subSpotCodes: list, score for each spot, and the third is the min scoring target for each spot ''' + subSpotCodes = spotsAndTargets[0] + subTargets = spotsAndTargets[1] + # Find minimum scoring combination of spots from set of possible combinations constant = 2 bestSpotCodes = [] @@ -769,8 +771,10 @@ def distanceFilter(roundData: pd.DataFrame, # Run in parallel with ProcessPoolExecutor() as pool: - part = partial(distanceFunc, spotCoords=spotCoords, spotQualDict=spotQualDict) - poolMap = pool.map(part, [spotsAndTargets for spotsAndTargets in zip(chunkedSpotCodes, chunkedTargets)]) + part = partial(distanceFunc, spotCoords=spotCoords, spotQualDict=spotQualDict, + currentRoundOmitNum=currentRoundOmitNum) + poolMap = pool.map(part, [spotsAndTargets for spotsAndTargets in zip(chunkedSpotCodes, + chunkedTargets)]) results = [x for x in poolMap] # Add distances to decodedTables as new column and replace spot_codes and targets column with From fe841db7e0651a2125a966326e96ef708e89a07e Mon Sep 17 00:00:00 2001 From: nickeener Date: Tue, 17 May 2022 12:52:14 -0700 Subject: [PATCH 24/30] mypy fix --- starfish/core/spots/DecodeSpots/check_all_funcs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/starfish/core/spots/DecodeSpots/check_all_funcs.py b/starfish/core/spots/DecodeSpots/check_all_funcs.py index ce892bb19..29c882a43 100644 --- a/starfish/core/spots/DecodeSpots/check_all_funcs.py +++ b/starfish/core/spots/DecodeSpots/check_all_funcs.py @@ -453,7 +453,7 @@ def buildBarcodes(roundData: pd.DataFrame, # Run in parallel with ProcessPoolExecutor() as pool: - part = partial(barcodeBuildFunc, channelDict=channelDict, currentRound=currentRound, + part = partial(barcodeBuildFunc, channelDict=channelDict, currentRound=currentRound, roundOmitNum=roundOmitNum, roundNum=roundNum) poolMap = pool.map(part, [chunkedNeighbors[i] for i in range(len(chunkedNeighbors))]) results = [x for x in poolMap] From 290c62e86a681e88ee2e848855ed782a0f42db9c Mon Sep 17 00:00:00 2001 From: nickeener Date: Thu, 26 May 2022 21:43:00 -0700 Subject: [PATCH 25/30] bug fixes --- starfish/core/spots/DecodeSpots/check_all_decoder.py | 6 +++--- starfish/core/spots/DecodeSpots/check_all_funcs.py | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/starfish/core/spots/DecodeSpots/check_all_decoder.py b/starfish/core/spots/DecodeSpots/check_all_decoder.py index c7566a2d5..2e17b7122 100644 --- a/starfish/core/spots/DecodeSpots/check_all_decoder.py +++ b/starfish/core/spots/DecodeSpots/check_all_decoder.py @@ -191,7 +191,7 @@ def run(self, for r in range(len(spotTables)): zs.update(spotTables[r]['z']) if self.physicalCoords: - if zScale < self.searchRadius or len(zs) > 1: + if zScale < self.searchRadius and len(zs) > 1: set1 = True else: if len(zs) > 1: @@ -203,9 +203,9 @@ def run(self, allSearchRadii = 
np.array([0, 1.05, 1.5, 2.05, 2.3, 2.85, 3.05, 3.2, 3.65, 4.05, 4.15, 4.25, 4.5]) - maxRadii = allSearchRadii[(allSearchRadii - self.searchRadius) <= 0][-1] + maxRadii = allSearchRadii[(allSearchRadii - self.searchRadius) >= 0][0] radiusSet = allSearchRadii[allSearchRadii <= maxRadii] - + # Calculate neighbors for each radius in the set (done only once and referred back to # throughout decodings) neighborsByRadius = {} diff --git a/starfish/core/spots/DecodeSpots/check_all_funcs.py b/starfish/core/spots/DecodeSpots/check_all_funcs.py index 29c882a43..95c3d37b7 100644 --- a/starfish/core/spots/DecodeSpots/check_all_funcs.py +++ b/starfish/core/spots/DecodeSpots/check_all_funcs.py @@ -309,7 +309,7 @@ def spotQuality(spotTables: dict, # Calculates index ranges to chunk data by ranges = [0] for i in range(1, numJobs): - ranges.append(int((len(roundSpots) / numJobs) * i)) + ranges.append(round((len(roundSpots) / numJobs) * i)) ranges.append(len(roundSpots)) chunkedSpots = [roundSpots[ranges[i]:ranges[i + 1]] for i in range(len(ranges[:-1]))] @@ -447,7 +447,7 @@ def buildBarcodes(roundData: pd.DataFrame, # Calculates index ranges to chunk data by ranges = [0] for i in range(1, numJobs + 1): - ranges.append(int((len(roundData) / numJobs) * i)) + ranges.append(round((len(roundData) / numJobs) * i)) chunkedNeighbors = [list(roundData['neighbors'])[ranges[i]: ranges[i + 1]] for i in range(len(ranges[:-1]))] @@ -606,7 +606,7 @@ def decoder(roundData: pd.DataFrame, # Calculates index ranges to chunk data by and creates list of chunked data to loop through ranges = [0] for i in range(1, numJobs + 1): - ranges.append(int((len(roundData) / numJobs) * i)) + ranges.append(round((len(roundData) / numJobs) * i)) chunkedData = [] for i in range(len(ranges[:-1])): chunkedData.append(deepcopy(roundData[ranges[i]:ranges[i + 1]])) @@ -764,7 +764,7 @@ def distanceFilter(roundData: pd.DataFrame, # Find ranges to chunk data by ranges = [0] for i in range(1, numJobs): - ranges.append(int((len(roundData) / numJobs) * i)) + ranges.append(round((len(roundData) / numJobs) * i)) ranges.append(len(roundData)) chunkedSpotCodes = [allSpotCodes[ranges[i]:ranges[i + 1]] for i in range(len(ranges[:-1]))] chunkedTargets = [allTargets[ranges[i]:ranges[i + 1]] for i in range(len(ranges[:-1]))] From 1a6b9a668f0cbd0611e53eee9f36513c3d206594 Mon Sep 17 00:00:00 2001 From: nickeener Date: Mon, 13 Jun 2022 13:12:54 -0700 Subject: [PATCH 26/30] fix --- starfish/core/spots/DecodeSpots/check_all_decoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/starfish/core/spots/DecodeSpots/check_all_decoder.py b/starfish/core/spots/DecodeSpots/check_all_decoder.py index 2e17b7122..dcc74b042 100644 --- a/starfish/core/spots/DecodeSpots/check_all_decoder.py +++ b/starfish/core/spots/DecodeSpots/check_all_decoder.py @@ -205,7 +205,7 @@ def run(self, maxRadii = allSearchRadii[(allSearchRadii - self.searchRadius) >= 0][0] radiusSet = allSearchRadii[allSearchRadii <= maxRadii] - + # Calculate neighbors for each radius in the set (done only once and referred back to # throughout decodings) neighborsByRadius = {} From f2987a1df89292f3e747b88c2b90c24417675e9f Mon Sep 17 00:00:00 2001 From: nickeener Date: Tue, 21 Jun 2022 23:16:45 -0700 Subject: [PATCH 27/30] Added sphinx update to reqs --- REQUIREMENTS.txt | 1 + requirements/REQUIREMENTS-CI.txt | 224 +++++++++++++----------- requirements/REQUIREMENTS-CI.txt.in | 2 +- requirements/REQUIREMENTS-NAPARI-CI.txt | 222 ++++++++++++----------- starfish/REQUIREMENTS-STRICT.txt | 151 
++++++++-------- 5 files changed, 320 insertions(+), 280 deletions(-) diff --git a/REQUIREMENTS.txt b/REQUIREMENTS.txt index b1c700a84..76a5a36b3 100644 --- a/REQUIREMENTS.txt +++ b/REQUIREMENTS.txt @@ -27,3 +27,4 @@ trackpy validators xarray >= 0.14.1 ipywidgets +sphinx-bootstrap-theme==0.8.1 \ No newline at end of file diff --git a/requirements/REQUIREMENTS-CI.txt b/requirements/REQUIREMENTS-CI.txt index 37450b48c..35e2e8cd2 100644 --- a/requirements/REQUIREMENTS-CI.txt +++ b/requirements/REQUIREMENTS-CI.txt @@ -1,124 +1,133 @@ # You should not edit this file directly. Instead, you should edit one of the following files (requirements/REQUIREMENTS-CI.txt.in) and run make requirements/REQUIREMENTS-CI.txt alabaster==0.7.12 -argon2-cffi==21.1.0 -attrs==21.2.0 -Babel==2.9.1 +argon2-cffi==21.3.0 +argon2-cffi-bindings==21.2.0 +asttokens==2.0.5 +attrs==21.4.0 +Babel==2.10.3 backcall==0.2.0 -bleach==4.1.0 -boto3==1.18.37 -botocore==1.21.37 -certifi==2021.5.30 -cffi==1.14.6 -charset-normalizer==2.0.4 -click==8.0.1 -colorama==0.4.4 +beautifulsoup4==4.11.1 +bleach==5.0.0 +boto3==1.24.14 +botocore==1.27.14 +certifi==2022.6.15 +cffi==1.15.0 +charset-normalizer==2.0.12 +click==8.1.3 commonmark==0.9.1 -coverage==5.5 -cryptography==3.4.8 -cycler==0.10.0 +coverage==6.4.1 +cryptography==37.0.3 +cycler==0.11.0 dataclasses==0.6 -debugpy==1.4.1 +debugpy==1.6.0 decorator==4.4.2 defusedxml==0.7.1 -diskcache==5.2.1 -docutils==0.16 -entrypoints==0.3 +diskcache==5.4.0 +docutils==0.17.1 +entrypoints==0.4 execnet==1.9.0 -flake8==3.9.2 +executing==0.8.3 +fastjsonschema==2.15.3 +flake8==4.0.1 flake8-import-order==0.18.1 -h5py==3.4.0 -idna==3.2 -imageio==2.9.0 -imagesize==1.2.0 -importlib-metadata==4.8.1 +fonttools==4.33.3 +h5py==3.7.0 +idna==3.3 +imageio==2.19.3 +imagesize==1.3.0 +importlib-metadata==4.11.4 iniconfig==1.1.1 -ipykernel==6.3.1 -ipython==7.27.0 +ipykernel==6.15.0 +ipython==8.4.0 ipython-genutils==0.2.0 -ipywidgets==7.6.4 -jedi==0.18.0 -jeepney==0.7.1 -Jinja2==3.0.1 -jmespath==0.10.0 -joblib==1.0.1 -jsonschema==3.2.0 -jupyter-client==7.0.2 -jupyter-core==4.7.1 -jupyterlab-pygments==0.1.2 -jupyterlab-widgets==1.0.1 -keyring==23.1.0 -kiwisolver==1.3.2 -m2r2==0.3.1 -MarkupSafe==2.0.1 -matplotlib==3.4.3 +ipywidgets==7.7.0 +jedi==0.18.1 +jeepney==0.8.0 +Jinja2==3.1.2 +jmespath==1.0.1 +joblib==1.1.0 +jsonschema==4.6.0 +jupyter-client==7.3.4 +jupyter-core==4.10.0 +jupyterlab-pygments==0.2.2 +jupyterlab-widgets==1.1.0 +keyring==23.6.0 +kiwisolver==1.4.3 +m2r2==0.3.2 +MarkupSafe==2.1.1 +matplotlib==3.5.2 matplotlib-inline==0.1.3 mccabe==0.6.1 mistune==0.8.4 mpmath==1.2.1 -mypy==0.910 +mypy==0.961 mypy-extensions==0.4.3 -nbclient==0.5.4 -nbconvert==6.1.0 +nbclient==0.6.4 +nbconvert==6.5.0 nbencdec==0.0.10 -nbformat==5.1.3 -nest-asyncio==1.5.1 -networkx==2.6.2 -notebook==6.4.3 -numpy==1.21.2 -numpydoc==1.1.0 -packaging==21.0 -pandas==1.3.2 -pandocfilters==1.4.3 -parso==0.8.2 +nbformat==5.4.0 +nest-asyncio==1.5.5 +networkx==2.8.4 +notebook==6.4.12 +numpy==1.22.4 +numpydoc==1.4.0 +packaging==21.3 +pandas==1.4.2 +pandocfilters==1.5.0 +parso==0.8.3 pexpect==4.8.0 pickleshare==0.7.5 -Pillow==8.3.2 -pkginfo==1.7.1 +Pillow==9.1.1 +pkginfo==1.8.3 pluggy==1.0.0 -prometheus-client==0.11.0 -prompt-toolkit==3.0.20 +prometheus-client==0.14.1 +prompt-toolkit==3.0.29 +psutil==5.9.1 ptyprocess==0.7.0 -py==1.10.0 -pycodestyle==2.7.0 -pycparser==2.20 -pyflakes==2.3.1 -Pygments==2.10.0 -pyparsing==2.4.7 -pyrsistent==0.18.0 -pytest==6.2.5 -pytest-cov==2.12.1 -pytest-forked==1.3.0 -pytest-xdist==2.3.0 +pure-eval==0.2.2 +py==1.11.0 
+pycodestyle==2.8.0 +pycparser==2.21 +pyflakes==2.4.0 +Pygments==2.12.0 +pyparsing==3.0.9 +pyrsistent==0.18.1 +pytest==7.1.2 +pytest-cov==3.0.0 +pytest-forked==1.4.0 +pytest-xdist==2.5.0 python-dateutil==2.8.2 -pytz==2021.1 -PyWavelets==1.1.1 -PyYAML==5.4.1 -pyzmq==22.2.1 +pytz==2022.1 +PyWavelets==1.3.0 +PyYAML==6.0 +pyzmq==23.2.0 read-roi==1.6.0 -readme-renderer==29.0 +readme-renderer==35.0 recommonmark==0.7.1 regional==1.1.2 -requests==2.26.0 +requests==2.28.0 requests-toolbelt==0.9.1 -rfc3986==1.5.0 -s3transfer==0.5.0 -scikit-image==0.18.3 -scikit-learn==0.24.2 -scipy==1.7.1 +rfc3986==2.0.0 +rich==12.4.4 +s3transfer==0.6.0 +scikit-image==0.19.3 +scikit-learn==1.1.1 +scipy==1.8.1 seaborn==0.11.2 -SecretStorage==3.3.1 -semantic-version==2.8.5 +SecretStorage==3.3.2 +semantic-version==2.10.0 Send2Trash==1.8.0 -setuptools==56.0.0 +setuptools==58.1.0 showit==1.1.4 six==1.16.0 slicedimage==4.1.1 -snowballstemmer==2.1.0 -Sphinx==4.1.2 -sphinx-autodoc-typehints==1.12.0 -sphinx-bootstrap-theme==0.7.1 -sphinx-gallery==0.9.0 -sphinx-rtd-theme==0.5.2 +snowballstemmer==2.2.0 +soupsieve==2.3.2.post1 +Sphinx==5.0.2 +sphinx-autodoc-typehints==1.18.3 +sphinx-bootstrap-theme==0.8.1 +sphinx-gallery==0.10.1 +sphinx-rtd-theme==1.0.0 sphinxcontrib-applehelp==1.0.2 sphinxcontrib-devhelp==1.0.2 sphinxcontrib-htmlhelp==2.0.0 @@ -126,26 +135,27 @@ sphinxcontrib-jsmath==1.0.1 sphinxcontrib-programoutput==0.17 sphinxcontrib-qthelp==1.0.3 sphinxcontrib-serializinghtml==1.1.5 +stack-data==0.3.0 sympy==1.5.1 -terminado==0.12.1 -testpath==0.5.0 -threadpoolctl==2.2.0 -tifffile==2021.8.30 -toml==0.10.2 +terminado==0.15.0 +threadpoolctl==3.1.0 +tifffile==2022.5.4 +tinycss2==1.1.1 +tomli==2.0.1 tornado==6.1 -tqdm==4.62.2 +tqdm==4.64.0 trackpy==0.5.0 -traitlets==5.1.0 -twine==3.4.2 +traitlets==5.3.0 +twine==4.0.1 types-pkg-resources==0.1.3 -types-PyYAML==5.4.10 -types-requests==2.25.6 -typing-extensions==3.10.0.2 -urllib3==1.26.6 -validators==0.18.2 +types-PyYAML==6.0.8 +types-requests==2.27.31 +types-urllib3==1.26.15 +typing_extensions==4.2.0 +urllib3==1.26.9 +validators==0.20.0 wcwidth==0.2.5 webencodings==0.5.1 -widgetsnbextension==3.5.1 -xarray==0.19.0 -zipp==3.5.0 - +widgetsnbextension==3.6.0 +xarray==2022.3.0 +zipp==3.8.0 diff --git a/requirements/REQUIREMENTS-CI.txt.in b/requirements/REQUIREMENTS-CI.txt.in index c53a1ea0b..0d16f555d 100644 --- a/requirements/REQUIREMENTS-CI.txt.in +++ b/requirements/REQUIREMENTS-CI.txt.in @@ -22,4 +22,4 @@ sphinx_bootstrap_theme sphinxcontrib-programoutput sphinx-gallery sphinx_rtd_theme -twine +twine \ No newline at end of file diff --git a/requirements/REQUIREMENTS-NAPARI-CI.txt b/requirements/REQUIREMENTS-NAPARI-CI.txt index a266ecb5e..35a9066b6 100644 --- a/requirements/REQUIREMENTS-NAPARI-CI.txt +++ b/requirements/REQUIREMENTS-NAPARI-CI.txt @@ -1,142 +1,160 @@ # You should not edit this file directly. 
Instead, you should edit one of the following files (requirements/REQUIREMENTS-NAPARI-CI.txt.in) and run make requirements/REQUIREMENTS-NAPARI-CI.txt alabaster==0.7.12 appdirs==1.4.4 -argon2-cffi==21.1.0 -attrs==21.2.0 -Babel==2.9.1 +argon2-cffi==21.3.0 +argon2-cffi-bindings==21.2.0 +asttokens==2.0.5 +attrs==21.4.0 +Babel==2.10.3 backcall==0.2.0 -bleach==4.1.0 -boto3==1.18.37 -botocore==1.21.37 +beautifulsoup4==4.11.1 +bleach==5.0.0 +boto3==1.24.14 +botocore==1.27.14 +build==0.8.0 cachey==0.2.1 -certifi==2021.5.30 -cffi==1.14.6 -charset-normalizer==2.0.4 -click==8.0.1 -cloudpickle==1.6.0 -cycler==0.10.0 -dask==2021.10.0 +certifi==2022.6.15 +cffi==1.15.0 +charset-normalizer==2.0.12 +click==8.1.3 +cloudpickle==2.1.0 +cycler==0.11.0 +dask==2022.6.0 dataclasses==0.6 -debugpy==1.4.1 +debugpy==1.6.0 decorator==4.4.2 defusedxml==0.7.1 -diskcache==5.2.1 -docstring-parser==0.10 -docutils==0.17.1 -entrypoints==0.3 -freetype-py==2.2.0 -fsspec==2021.8.1 -h5py==3.4.0 +diskcache==5.4.0 +docstring-parser==0.14.1 +docutils==0.18.1 +entrypoints==0.4 +executing==0.8.3 +fastjsonschema==2.15.3 +fonttools==4.33.3 +freetype-py==2.3.0 +fsspec==2022.5.0 +h5py==3.7.0 HeapDict==1.0.1 -hsluv==5.0.2 -idna==3.2 -imageio==2.9.0 -imagesize==1.2.0 +hsluv==5.0.3 +idna==3.3 +imageio==2.19.3 +imagesize==1.3.0 +importlib-metadata==4.11.4 iniconfig==1.1.1 -ipykernel==6.3.1 -ipython==7.27.0 +ipykernel==6.15.0 +ipython==8.4.0 ipython-genutils==0.2.0 -ipywidgets==7.6.4 -jedi==0.18.0 -Jinja2==3.0.1 -jmespath==0.10.0 -joblib==1.0.1 -jsonschema==3.2.0 -jupyter-client==7.0.2 -jupyter-core==4.7.1 -jupyterlab-pygments==0.1.2 -jupyterlab-widgets==1.0.1 -kiwisolver==1.3.2 -locket==0.2.1 -magicgui==0.2.10 -MarkupSafe==2.0.1 -matplotlib==3.4.3 +ipywidgets==7.7.0 +jedi==0.18.1 +Jinja2==3.1.2 +jmespath==1.0.1 +joblib==1.1.0 +jsonschema==4.6.0 +jupyter-client==7.3.4 +jupyter-core==4.10.0 +jupyterlab-pygments==0.2.2 +jupyterlab-widgets==1.1.0 +kiwisolver==1.4.3 +locket==1.0.0 +magicgui==0.5.1 +MarkupSafe==2.1.1 +matplotlib==3.5.2 matplotlib-inline==0.1.3 mistune==0.8.4 mpmath==1.2.1 -napari==0.4.11 +napari==0.4.16 napari-console==0.0.4 -napari-plugin-engine==0.1.9 -napari-svg==0.1.5 -nbclient==0.5.4 -nbconvert==6.1.0 -nbformat==5.1.3 -nest-asyncio==1.5.1 -networkx==2.6.2 -notebook==6.4.3 -numpy==1.21.2 -numpydoc==1.1.0 -packaging==21.0 -pandas==1.3.2 -pandocfilters==1.4.3 -parso==0.8.2 +napari-plugin-engine==0.2.0 +napari-svg==0.1.6 +nbclient==0.6.4 +nbconvert==6.5.0 +nbformat==5.4.0 +nest-asyncio==1.5.5 +networkx==2.8.4 +notebook==6.4.12 +npe2==0.5.0 +numpy==1.22.4 +numpydoc==1.4.0 +packaging==21.3 +pandas==1.4.2 +pandocfilters==1.5.0 +parso==0.8.3 partd==1.2.0 +pep517==0.12.0 pexpect==4.8.0 pickleshare==0.7.5 -Pillow==8.3.2 -Pint==0.17 +Pillow==9.1.1 +Pint==0.19.2 pluggy==1.0.0 -prometheus-client==0.11.0 -prompt-toolkit==3.0.20 -psutil==5.8.0 +prometheus-client==0.14.1 +prompt-toolkit==3.0.29 +psutil==5.9.1 +psygnal==0.3.5 ptyprocess==0.7.0 -py==1.10.0 -pycparser==2.20 -pydantic==1.8.2 -Pygments==2.10.0 -PyOpenGL==3.1.5 -pyparsing==2.4.7 +pure-eval==0.2.2 +py==1.11.0 +pycparser==2.21 +pydantic==1.9.1 +Pygments==2.12.0 +PyOpenGL==3.1.6 +pyparsing==3.0.9 PyQt5==5.14.2 -PyQt5-sip==12.9.0 -pyrsistent==0.18.0 -pytest==6.2.5 +PyQt5-sip==12.11.0 +pyrsistent==0.18.1 +pytest==7.1.2 pytest-qt==4.0.2 python-dateutil==2.8.2 -pytz==2021.1 -PyWavelets==1.1.1 -PyYAML==5.4.1 -pyzmq==22.2.1 -qtconsole==5.1.1 -QtPy==1.11.0 +pytomlpp==1.0.11 +pytz==2022.1 +PyWavelets==1.3.0 +PyYAML==6.0 +pyzmq==23.2.0 +qtconsole==5.3.1 +QtPy==2.1.0 read-roi==1.6.0 
regional==1.1.2 -requests==2.26.0 -s3transfer==0.5.0 -scikit-image==0.18.3 -scikit-learn==0.24.2 -scipy==1.7.1 -semantic-version==2.8.5 +requests==2.28.0 +s3transfer==0.6.0 +scikit-image==0.19.3 +scikit-learn==1.1.1 +scipy==1.8.1 +semantic-version==2.10.0 Send2Trash==1.8.0 -setuptools==56.0.0 +setuptools==58.1.0 showit==1.1.4 six==1.16.0 slicedimage==4.1.1 -snowballstemmer==2.1.0 -Sphinx==4.1.2 +snowballstemmer==2.2.0 +soupsieve==2.3.2.post1 +Sphinx==5.0.2 +sphinx-bootstrap-theme==0.8.1 sphinxcontrib-applehelp==1.0.2 sphinxcontrib-devhelp==1.0.2 sphinxcontrib-htmlhelp==2.0.0 sphinxcontrib-jsmath==1.0.1 sphinxcontrib-qthelp==1.0.3 sphinxcontrib-serializinghtml==1.1.5 -superqt==0.2.3 +stack-data==0.3.0 +superqt==0.3.2 sympy==1.5.1 -terminado==0.12.1 -testpath==0.5.0 -threadpoolctl==2.2.0 -tifffile==2021.8.30 -toml==0.10.2 -toolz==0.11.1 +terminado==0.15.0 +threadpoolctl==3.1.0 +tifffile==2022.5.4 +tinycss2==1.1.1 +tomli==2.0.1 +toolz==0.11.2 tornado==6.1 -tqdm==4.62.2 +tqdm==4.64.0 trackpy==0.5.0 -traitlets==5.1.0 -typing-extensions==3.10.0.2 -urllib3==1.26.6 -validators==0.18.2 -vispy==0.8.1 +traitlets==5.3.0 +typer==0.4.1 +typing_extensions==4.2.0 +urllib3==1.26.9 +validators==0.20.0 +vispy==0.10.0 wcwidth==0.2.5 webencodings==0.5.1 -widgetsnbextension==3.5.1 -wrapt==1.12.1 -xarray==0.19.0 +widgetsnbextension==3.6.0 +wrapt==1.14.1 +xarray==2022.3.0 +zipp==3.8.0 diff --git a/starfish/REQUIREMENTS-STRICT.txt b/starfish/REQUIREMENTS-STRICT.txt index d0d9d1d68..a6094a12e 100644 --- a/starfish/REQUIREMENTS-STRICT.txt +++ b/starfish/REQUIREMENTS-STRICT.txt @@ -1,94 +1,105 @@ # You should not edit this file directly. Instead, you should edit one of the following files (REQUIREMENTS.txt) and run make starfish/REQUIREMENTS-STRICT.txt -argon2-cffi==21.1.0 -attrs==21.2.0 +argon2-cffi==21.3.0 +argon2-cffi-bindings==21.2.0 +asttokens==2.0.5 +attrs==21.4.0 backcall==0.2.0 -bleach==4.1.0 -boto3==1.18.37 -botocore==1.21.37 -certifi==2021.5.30 -cffi==1.14.6 -charset-normalizer==2.0.4 -click==8.0.1 -cycler==0.10.0 +beautifulsoup4==4.11.1 +bleach==5.0.0 +boto3==1.24.14 +botocore==1.27.14 +certifi==2022.6.15 +cffi==1.15.0 +charset-normalizer==2.0.12 +click==8.1.3 +cycler==0.11.0 dataclasses==0.6 -debugpy==1.4.1 +debugpy==1.6.0 decorator==4.4.2 defusedxml==0.7.1 -diskcache==5.2.1 -entrypoints==0.3 -h5py==3.4.0 -idna==3.2 -imageio==2.9.0 -ipykernel==6.3.1 -ipython==7.27.0 +diskcache==5.4.0 +entrypoints==0.4 +executing==0.8.3 +fastjsonschema==2.15.3 +fonttools==4.33.3 +h5py==3.7.0 +idna==3.3 +imageio==2.19.3 +ipykernel==6.15.0 +ipython==8.4.0 ipython-genutils==0.2.0 -ipywidgets==7.6.4 -jedi==0.18.0 -Jinja2==3.0.1 -jmespath==0.10.0 -joblib==1.0.1 -jsonschema==3.2.0 -jupyter-client==7.0.2 -jupyter-core==4.7.1 -jupyterlab-pygments==0.1.2 -jupyterlab-widgets==1.0.1 -kiwisolver==1.3.2 -MarkupSafe==2.0.1 -matplotlib==3.4.3 +ipywidgets==7.7.0 +jedi==0.18.1 +Jinja2==3.1.2 +jmespath==1.0.1 +joblib==1.1.0 +jsonschema==4.6.0 +jupyter-client==7.3.4 +jupyter-core==4.10.0 +jupyterlab-pygments==0.2.2 +jupyterlab-widgets==1.1.0 +kiwisolver==1.4.3 +MarkupSafe==2.1.1 +matplotlib==3.5.2 matplotlib-inline==0.1.3 mistune==0.8.4 mpmath==1.2.1 -nbclient==0.5.4 -nbconvert==6.1.0 -nbformat==5.1.3 -nest-asyncio==1.5.1 -networkx==2.6.2 -notebook==6.4.3 -numpy==1.21.2 -packaging==21.0 -pandas==1.3.2 -pandocfilters==1.4.3 -parso==0.8.2 +nbclient==0.6.4 +nbconvert==6.5.0 +nbformat==5.4.0 +nest-asyncio==1.5.5 +networkx==2.8.4 +notebook==6.4.12 +numpy==1.22.4 +packaging==21.3 +pandas==1.4.2 +pandocfilters==1.5.0 +parso==0.8.3 pexpect==4.8.0 
pickleshare==0.7.5 -Pillow==8.3.2 -prometheus-client==0.11.0 -prompt-toolkit==3.0.20 +Pillow==9.1.1 +prometheus-client==0.14.1 +prompt-toolkit==3.0.29 +psutil==5.9.1 ptyprocess==0.7.0 -pycparser==2.20 -Pygments==2.10.0 -pyparsing==2.4.7 -pyrsistent==0.18.0 +pure-eval==0.2.2 +pycparser==2.21 +Pygments==2.12.0 +pyparsing==3.0.9 +pyrsistent==0.18.1 python-dateutil==2.8.2 -pytz==2021.1 -PyWavelets==1.1.1 -PyYAML==5.4.1 -pyzmq==22.2.1 +pytz==2022.1 +PyWavelets==1.3.0 +PyYAML==6.0 +pyzmq==23.2.0 read-roi==1.6.0 regional==1.1.2 -requests==2.26.0 -s3transfer==0.5.0 -scikit-image==0.18.3 -scikit-learn==0.24.2 -scipy==1.7.1 -semantic-version==2.8.5 +requests==2.28.0 +s3transfer==0.6.0 +scikit-image==0.19.3 +scikit-learn==1.1.1 +scipy==1.8.1 +semantic-version==2.10.0 Send2Trash==1.8.0 -setuptools==56.0.0 +setuptools==58.1.0 showit==1.1.4 six==1.16.0 slicedimage==4.1.1 +soupsieve==2.3.2.post1 +sphinx-bootstrap-theme==0.8.1 +stack-data==0.3.0 sympy==1.5.1 -terminado==0.12.1 -testpath==0.5.0 -threadpoolctl==2.2.0 -tifffile==2021.8.30 +terminado==0.15.0 +threadpoolctl==3.1.0 +tifffile==2022.5.4 +tinycss2==1.1.1 tornado==6.1 -tqdm==4.62.2 +tqdm==4.64.0 trackpy==0.5.0 -traitlets==5.1.0 -urllib3==1.26.6 -validators==0.18.2 +traitlets==5.3.0 +urllib3==1.26.9 +validators==0.20.0 wcwidth==0.2.5 webencodings==0.5.1 -widgetsnbextension==3.5.1 -xarray==0.19.0 +widgetsnbextension==3.6.0 +xarray==2022.3.0 From 8752555eeaa4b4d8a064b18adc1b30dc331826d8 Mon Sep 17 00:00:00 2001 From: nickeener Date: Wed, 22 Jun 2022 16:05:43 -0700 Subject: [PATCH 28/30] fixed error correction bug --- REQUIREMENTS.txt | 2 +- requirements/REQUIREMENTS-CI.txt | 10 +- requirements/REQUIREMENTS-NAPARI-CI.txt | 8 +- starfish/REQUIREMENTS-STRICT.txt | 8 +- .../spots/DecodeSpots/check_all_decoder.py | 124 ++++++++++-------- .../core/spots/DecodeSpots/check_all_funcs.py | 18 ++- .../spots/DecodeSpots/test/test_check_all.py | 10 +- 7 files changed, 100 insertions(+), 80 deletions(-) diff --git a/REQUIREMENTS.txt b/REQUIREMENTS.txt index 76a5a36b3..837eb1026 100644 --- a/REQUIREMENTS.txt +++ b/REQUIREMENTS.txt @@ -16,7 +16,7 @@ semantic_version # 0.16.[012] are excluded because https://github.com/scikit-image/scikit-image/pull/3984 introduced # a bug into max peak finder. 0.16.3 presumably will have the fix from # https://github.com/scikit-image/scikit-image/pull/4263. 
-scikit-image >= 0.14.0, != 0.16.0.*, != 0.16.1.*, != 0.16.2.*, != 0.17.1.*, != 0.17.2.* +scikit-image >= 0.14.0, != 0.16.0.*, != 0.16.1.*, != 0.16.2.*, != 0.17.1.*, != 0.17.2, < 0.19.0 scikit-learn scipy showit >= 1.1.4 diff --git a/requirements/REQUIREMENTS-CI.txt b/requirements/REQUIREMENTS-CI.txt index 35e2e8cd2..32f30c2c8 100644 --- a/requirements/REQUIREMENTS-CI.txt +++ b/requirements/REQUIREMENTS-CI.txt @@ -16,7 +16,7 @@ charset-normalizer==2.0.12 click==8.1.3 commonmark==0.9.1 coverage==6.4.1 -cryptography==37.0.3 +cryptography==37.0.2 cycler==0.11.0 dataclasses==0.6 debugpy==1.6.0 @@ -40,7 +40,7 @@ iniconfig==1.1.1 ipykernel==6.15.0 ipython==8.4.0 ipython-genutils==0.2.0 -ipywidgets==7.7.0 +ipywidgets==7.7.1 jedi==0.18.1 jeepney==0.8.0 Jinja2==3.1.2 @@ -50,7 +50,7 @@ jsonschema==4.6.0 jupyter-client==7.3.4 jupyter-core==4.10.0 jupyterlab-pygments==0.2.2 -jupyterlab-widgets==1.1.0 +jupyterlab-widgets==1.1.1 keyring==23.6.0 kiwisolver==1.4.3 m2r2==0.3.2 @@ -110,7 +110,7 @@ requests-toolbelt==0.9.1 rfc3986==2.0.0 rich==12.4.4 s3transfer==0.6.0 -scikit-image==0.19.3 +scikit-image==0.18.3 scikit-learn==1.1.1 scipy==1.8.1 seaborn==0.11.2 @@ -156,6 +156,6 @@ urllib3==1.26.9 validators==0.20.0 wcwidth==0.2.5 webencodings==0.5.1 -widgetsnbextension==3.6.0 +widgetsnbextension==3.6.1 xarray==2022.3.0 zipp==3.8.0 diff --git a/requirements/REQUIREMENTS-NAPARI-CI.txt b/requirements/REQUIREMENTS-NAPARI-CI.txt index 35a9066b6..4d7989afa 100644 --- a/requirements/REQUIREMENTS-NAPARI-CI.txt +++ b/requirements/REQUIREMENTS-NAPARI-CI.txt @@ -44,7 +44,7 @@ iniconfig==1.1.1 ipykernel==6.15.0 ipython==8.4.0 ipython-genutils==0.2.0 -ipywidgets==7.7.0 +ipywidgets==7.7.1 jedi==0.18.1 Jinja2==3.1.2 jmespath==1.0.1 @@ -53,7 +53,7 @@ jsonschema==4.6.0 jupyter-client==7.3.4 jupyter-core==4.10.0 jupyterlab-pygments==0.2.2 -jupyterlab-widgets==1.1.0 +jupyterlab-widgets==1.1.1 kiwisolver==1.4.3 locket==1.0.0 magicgui==0.5.1 @@ -115,7 +115,7 @@ read-roi==1.6.0 regional==1.1.2 requests==2.28.0 s3transfer==0.6.0 -scikit-image==0.19.3 +scikit-image==0.18.3 scikit-learn==1.1.1 scipy==1.8.1 semantic-version==2.10.0 @@ -154,7 +154,7 @@ validators==0.20.0 vispy==0.10.0 wcwidth==0.2.5 webencodings==0.5.1 -widgetsnbextension==3.6.0 +widgetsnbextension==3.6.1 wrapt==1.14.1 xarray==2022.3.0 zipp==3.8.0 diff --git a/starfish/REQUIREMENTS-STRICT.txt b/starfish/REQUIREMENTS-STRICT.txt index a6094a12e..eda66e668 100644 --- a/starfish/REQUIREMENTS-STRICT.txt +++ b/starfish/REQUIREMENTS-STRICT.txt @@ -28,7 +28,7 @@ imageio==2.19.3 ipykernel==6.15.0 ipython==8.4.0 ipython-genutils==0.2.0 -ipywidgets==7.7.0 +ipywidgets==7.7.1 jedi==0.18.1 Jinja2==3.1.2 jmespath==1.0.1 @@ -37,7 +37,7 @@ jsonschema==4.6.0 jupyter-client==7.3.4 jupyter-core==4.10.0 jupyterlab-pygments==0.2.2 -jupyterlab-widgets==1.1.0 +jupyterlab-widgets==1.1.1 kiwisolver==1.4.3 MarkupSafe==2.1.1 matplotlib==3.5.2 @@ -76,7 +76,7 @@ read-roi==1.6.0 regional==1.1.2 requests==2.28.0 s3transfer==0.6.0 -scikit-image==0.19.3 +scikit-image==0.18.3 scikit-learn==1.1.1 scipy==1.8.1 semantic-version==2.10.0 @@ -101,5 +101,5 @@ urllib3==1.26.9 validators==0.20.0 wcwidth==0.2.5 webencodings==0.5.1 -widgetsnbextension==3.6.0 +widgetsnbextension==3.6.1 xarray==2022.3.0 diff --git a/starfish/core/spots/DecodeSpots/check_all_decoder.py b/starfish/core/spots/DecodeSpots/check_all_decoder.py index dcc74b042..cb4f4c808 100644 --- a/starfish/core/spots/DecodeSpots/check_all_decoder.py +++ b/starfish/core/spots/DecodeSpots/check_all_decoder.py @@ -159,6 +159,7 @@ def run(self, counter = 
Counter(spotsPerRound) if counter[0] > self.errorRounds: exit('Not enough spots to form a barcode') + print(spotsPerRound) # If using physical coordinates, extract z and xy scales and check that they are all > 0 if self.physicalCoords: @@ -274,9 +275,13 @@ def run(self, # Subset spots by intensity, start with top 50% then decode again with all currentTables = {} for r in range(len(spotTables)): - lowerBound = np.percentile(spotTables[r]['spot_quals'], intVal) - currentTables[r] = spotTables[r][spotTables[r]['spot_quals'] - >= lowerBound] + + if len(spotTables[r]) > 0: + lowerBound = np.percentile(spotTables[r]['spot_quals'], intVal) + currentTables[r] = spotTables[r][spotTables[r]['spot_quals'] + >= lowerBound] + else: + currentTables[r] = pd.DataFrame() # Decode each radius and remove spots found in each decoding before the next for sr, searchRadius in enumerate(radiusSet): @@ -305,59 +310,70 @@ def run(self, decodedTables = {} for r in range(len(spotTables)): - # roundData will carry the possible barcode info for each spot in - # the current round being examined - roundData = deepcopy(currentTables[r]) - - # Drop all but the spot_id column - roundData = roundData[['spot_id']] - - # From each spot's neighbors, create all possible combinations that - # would form a barocde with the correct number of rounds. Adds - # spot_codes column to roundData - roundData = buildBarcodes(roundData, neighborDict, - currentRoundOmitNum, channelDict, - strictness, r, numJobs) - - # When strictness is positive the filter-first methods is used and - # distanceFilter is run first on all the potential barcodes to - # choose the one with the minimum score (based on spatial variance - # of the spots and their intensities) which are then matched to the - # codebook. Spots that have more possible barcodes to choose between - # than the current strictnessnumber are dropped as ambiguous. If - # strictness is negative, the decode-first method is run where all - # the possible barcodes are instead first matched to the codebook - # and then the lowest scoring decodable spot combination is chosen - # for each spot. Spots that have more decodable barcodes to choose - # from than the strictness value (absolute value) are dropped. - if strictness > 0: - - # Choose most likely combination of spots for each seed spot - # using their spatial variance and normalized intensity values. - # Adds distance column to roundData - roundData = distanceFilter(roundData, spotCoords, spotQualDict, - r, currentRoundOmitNum, numJobs) - - # Match possible barcodes to codebook. Adds target column to - # roundData - roundData = decoder(roundData, self.codebook, channelDict, - strictness, currentRoundOmitNum, r, numJobs) + if len(spotTables[r]) > 0: + + # roundData will carry the possible barcode info for each spot + # in the current round being examined + roundData = deepcopy(currentTables[r]) + + # Drop all but the spot_id column + roundData = roundData[['spot_id']] + + # From each spot's neighbors, create all possible combinations + # that would form a barocde with the correct number of rounds. + # Adds spot_codes column to roundData + + roundData = buildBarcodes(roundData, neighborDict, + currentRoundOmitNum, channelDict, + strictness, r, numJobs) + + # When strictness is positive the filter-first methods is used + # and distanceFilter is run first on all the potential barcodes + # to choose the one with the minimum score (based on spatial + # variance of the spots and their intensities) which are then + # matched to the codebook. 
Spots that have more possible
+ # barcodes to choose between than the current strictness number
+ # are dropped as ambiguous. If strictness is negative, the
+ # decode-first method is run where all the possible barcodes
+ # are instead first matched to the codebook and then the lowest
+ # scoring decodable spot combination is chosen for each spot.
+ # Spots that have more decodable barcodes to choose from than
+ # the strictness value (absolute value) are dropped.
+ if strictness > 0:
+
+ # Choose most likely combination of spots for each seed
+ # spot using their spatial variance and normalized intensity
+ # values. Adds distance column to roundData
+ roundData = distanceFilter(roundData, spotCoords,
+ spotQualDict, r,
+ currentRoundOmitNum, numJobs)
+
+ # Match possible barcodes to codebook. Adds target column
+ # to roundData
+ roundData = decoder(roundData, self.codebook, channelDict,
+ strictness, currentRoundOmitNum, r,
+ numJobs)
+
+ else:
+
+ # Match possible barcodes to codebook. Adds target column
+ # to roundData
+ roundData = decoder(roundData, self.codebook, channelDict,
+ strictness, currentRoundOmitNum, r,
+ numJobs)
+
+ # Choose most likely combination of spots for each seed
+ # spot using their spatial variance and normalized
+ # intensity values. Adds distance column to roundData
+ roundData = distanceFilter(roundData, spotCoords,
+ spotQualDict, r,
+ currentRoundOmitNum, numJobs)
+
+ # Assign to DecodedTables dictionary
+ decodedTables[r] = roundData
else:
-
- # Match possible barcodes to codebook. Adds target column to
- # roundData
- roundData = decoder(roundData, self.codebook, channelDict,
- strictness, currentRoundOmitNum, r, numJobs)
-
- # Choose most likely combination of spots for each seed spot
- # using their spatial variance and normalized intensity values.
- # Adds distance column to roundData - roundData = distanceFilter(roundData, spotCoords, spotQualDict, - r, currentRoundOmitNum, numJobs) - - # Assign to DecodedTables dictionary - decodedTables[r] = roundData + decodedTables[r] = pd.DataFrame() # Turn spot table dictionary into single table, filter barcodes by # the seed number, add additional information, and choose between diff --git a/starfish/core/spots/DecodeSpots/check_all_funcs.py b/starfish/core/spots/DecodeSpots/check_all_funcs.py index 95c3d37b7..5cff4dae7 100644 --- a/starfish/core/spots/DecodeSpots/check_all_funcs.py +++ b/starfish/core/spots/DecodeSpots/check_all_funcs.py @@ -83,8 +83,11 @@ def createNeighborDict(spotTables: dict, neighborDict = {} spotIDs = {} for r in spotTables: - spotIDs[r] = {idd: 0 for idd in spotTables[r]['spot_id']} - neighborDict[r] = {i: defaultdict(list, {r: [i]}) for i in spotTables[r]['spot_id']} + if len(spotTables[r]) > 0: + spotIDs[r] = {idd: 0 for idd in spotTables[r]['spot_id']} + neighborDict[r] = {i: defaultdict(list, {r: [i]}) for i in spotTables[r]['spot_id']} + else: + neighborDict[r] = {} # Add neighbors in neighborsByRadius[searchRadius] but check to make sure that spot is still # available before adding it @@ -985,10 +988,11 @@ def removeUsedSpots(finalCodes: pd.DataFrame, spotTables: dict) -> dict: # Remove used spots for r in range(len(spotTables)): - usedSpots = set([passed[r] for passed in finalCodes['spot_codes'] - if passed[r] != 0]) - spotTables[r] = spotTables[r][~spotTables[r]['spot_id'].isin(usedSpots)] - spotTables[r] = spotTables[r].reset_index(drop=True) - spotTables[r].index = range(1, len(spotTables[r]) + 1) + if len(spotTables[r]) > 0: + usedSpots = set([passed[r] for passed in finalCodes['spot_codes'] + if passed[r] != 0]) + spotTables[r] = spotTables[r][~spotTables[r]['spot_id'].isin(usedSpots)] + spotTables[r] = spotTables[r].reset_index(drop=True) + spotTables[r].index = range(1, len(spotTables[r]) + 1) return spotTables diff --git a/starfish/core/spots/DecodeSpots/test/test_check_all.py b/starfish/core/spots/DecodeSpots/test/test_check_all.py index 47c9dceb1..04afaa323 100644 --- a/starfish/core/spots/DecodeSpots/test/test_check_all.py +++ b/starfish/core/spots/DecodeSpots/test/test_check_all.py @@ -1,7 +1,7 @@ import random import numpy as np -from scipy.ndimage.filters import gaussian_filter +from scipy.ndimage import gaussian_filter from starfish import ImageStack from starfish.core.codebook.codebook import Codebook @@ -107,7 +107,7 @@ def testExactMatches(): test[1][1] + 1 >= true[1][1] >= test[1][1] - 1: matches += 1 - assert matches == len(trueTargets) + assert matches == len(trueTargets), 'Incorrect number of targets found' def testJitteredMatches(): @@ -136,7 +136,7 @@ def testJitteredMatches(): test[1][1] + 3 >= true[1][1] >= test[1][1] - 3: matches += 1 - assert matches == len(trueTargets) + assert matches == len(trueTargets), 'Incorrect number of targets found' def testErrorCorrection(): @@ -144,7 +144,7 @@ def testErrorCorrection(): img, trueTargets = syntheticSeqfish(100, 100, 20, codebook, 5, 0, True) - bd = BlobDetector(min_sigma=1, max_sigma=4, num_sigma=30, threshold=.1, exclude_border=False) + bd = BlobDetector(min_sigma=1, max_sigma=4, num_sigma=10, threshold=.1, exclude_border=False) spots = bd.run(image_stack=img) assert spots.count_total_spots() == 4 * 5, 'Spot detector did not find all spots' @@ -165,4 +165,4 @@ def testErrorCorrection(): test[1][1] + 1 >= true[1][1] >= test[1][1] - 1: matches += 1 - assert matches == len(trueTargets) + 
assert matches == len(trueTargets), 'Incorrect number of targets found' From cde796b285e2174e3fc94e43293d55a23fe36122 Mon Sep 17 00:00:00 2001 From: nickeener Date: Wed, 20 Jul 2022 12:23:43 -0700 Subject: [PATCH 29/30] reqs fix --- REQUIREMENTS.txt | 5 +- requirements/REQUIREMENTS-CI.txt | 219 +++++++++--------- .../decoded_intensity_table.py | 10 + .../spots/DecodeSpots/check_all_decoder.py | 9 +- .../core/spots/DecodeSpots/check_all_funcs.py | 84 ------- 5 files changed, 118 insertions(+), 209 deletions(-) diff --git a/REQUIREMENTS.txt b/REQUIREMENTS.txt index 837eb1026..e645accc3 100644 --- a/REQUIREMENTS.txt +++ b/REQUIREMENTS.txt @@ -16,7 +16,7 @@ semantic_version # 0.16.[012] are excluded because https://github.com/scikit-image/scikit-image/pull/3984 introduced # a bug into max peak finder. 0.16.3 presumably will have the fix from # https://github.com/scikit-image/scikit-image/pull/4263. -scikit-image >= 0.14.0, != 0.16.0.*, != 0.16.1.*, != 0.16.2.*, != 0.17.1.*, != 0.17.2, < 0.19.0 +scikit-image >= 0.14.0, != 0.16.0.*, != 0.16.1.*, != 0.16.2.*, != 0.17.1.*, != 0.17.2.*, < 0.19.0 scikit-learn scipy showit >= 1.1.4 @@ -26,5 +26,4 @@ tqdm trackpy validators xarray >= 0.14.1 -ipywidgets -sphinx-bootstrap-theme==0.8.1 \ No newline at end of file +ipywidgets \ No newline at end of file diff --git a/requirements/REQUIREMENTS-CI.txt b/requirements/REQUIREMENTS-CI.txt index 32f30c2c8..677bad25a 100644 --- a/requirements/REQUIREMENTS-CI.txt +++ b/requirements/REQUIREMENTS-CI.txt @@ -1,133 +1,124 @@ # You should not edit this file directly. Instead, you should edit one of the following files (requirements/REQUIREMENTS-CI.txt.in) and run make requirements/REQUIREMENTS-CI.txt alabaster==0.7.12 -argon2-cffi==21.3.0 -argon2-cffi-bindings==21.2.0 -asttokens==2.0.5 -attrs==21.4.0 -Babel==2.10.3 +argon2-cffi==21.1.0 +attrs==21.2.0 +Babel==2.9.1 backcall==0.2.0 -beautifulsoup4==4.11.1 -bleach==5.0.0 -boto3==1.24.14 -botocore==1.27.14 -certifi==2022.6.15 -cffi==1.15.0 -charset-normalizer==2.0.12 -click==8.1.3 +bleach==4.1.0 +boto3==1.18.37 +botocore==1.21.37 +certifi==2021.5.30 +cffi==1.14.6 +charset-normalizer==2.0.4 +click==8.0.1 +colorama==0.4.4 commonmark==0.9.1 -coverage==6.4.1 -cryptography==37.0.2 -cycler==0.11.0 +coverage==5.5 +cryptography==3.4.8 +cycler==0.10.0 dataclasses==0.6 -debugpy==1.6.0 +debugpy==1.4.1 decorator==4.4.2 defusedxml==0.7.1 -diskcache==5.4.0 -docutils==0.17.1 -entrypoints==0.4 +diskcache==5.2.1 +docutils==0.16 +entrypoints==0.3 execnet==1.9.0 -executing==0.8.3 -fastjsonschema==2.15.3 -flake8==4.0.1 +flake8==3.9.2 flake8-import-order==0.18.1 -fonttools==4.33.3 -h5py==3.7.0 -idna==3.3 -imageio==2.19.3 -imagesize==1.3.0 -importlib-metadata==4.11.4 +h5py==3.4.0 +idna==3.2 +imageio==2.9.0 +imagesize==1.2.0 +importlib-metadata==4.8.1 iniconfig==1.1.1 -ipykernel==6.15.0 -ipython==8.4.0 +ipykernel==6.3.1 +ipython==7.27.0 ipython-genutils==0.2.0 -ipywidgets==7.7.1 -jedi==0.18.1 -jeepney==0.8.0 -Jinja2==3.1.2 -jmespath==1.0.1 -joblib==1.1.0 -jsonschema==4.6.0 -jupyter-client==7.3.4 -jupyter-core==4.10.0 -jupyterlab-pygments==0.2.2 -jupyterlab-widgets==1.1.1 -keyring==23.6.0 -kiwisolver==1.4.3 -m2r2==0.3.2 -MarkupSafe==2.1.1 -matplotlib==3.5.2 +ipywidgets==7.6.4 +jedi==0.18.0 +jeepney==0.7.1 +Jinja2==3.0.1 +jmespath==0.10.0 +joblib==1.0.1 +jsonschema==3.2.0 +jupyter-client==7.0.2 +jupyter-core==4.7.1 +jupyterlab-pygments==0.1.2 +jupyterlab-widgets==1.0.1 +keyring==23.1.0 +kiwisolver==1.3.2 +m2r2==0.3.1 +MarkupSafe==2.0.1 +matplotlib==3.4.3 matplotlib-inline==0.1.3 
mccabe==0.6.1 mistune==0.8.4 mpmath==1.2.1 -mypy==0.961 +mypy==0.910 mypy-extensions==0.4.3 -nbclient==0.6.4 -nbconvert==6.5.0 +nbclient==0.5.4 +nbconvert==6.1.0 nbencdec==0.0.10 -nbformat==5.4.0 -nest-asyncio==1.5.5 -networkx==2.8.4 -notebook==6.4.12 -numpy==1.22.4 -numpydoc==1.4.0 -packaging==21.3 -pandas==1.4.2 -pandocfilters==1.5.0 -parso==0.8.3 +nbformat==5.1.3 +nest-asyncio==1.5.1 +networkx==2.6.2 +notebook==6.4.3 +numpy==1.21.2 +numpydoc==1.1.0 +packaging==21.0 +pandas==1.3.2 +pandocfilters==1.4.3 +parso==0.8.2 pexpect==4.8.0 pickleshare==0.7.5 -Pillow==9.1.1 -pkginfo==1.8.3 +Pillow==8.3.2 +pkginfo==1.7.1 pluggy==1.0.0 -prometheus-client==0.14.1 -prompt-toolkit==3.0.29 -psutil==5.9.1 +prometheus-client==0.11.0 +prompt-toolkit==3.0.20 ptyprocess==0.7.0 -pure-eval==0.2.2 -py==1.11.0 -pycodestyle==2.8.0 -pycparser==2.21 -pyflakes==2.4.0 -Pygments==2.12.0 -pyparsing==3.0.9 -pyrsistent==0.18.1 -pytest==7.1.2 -pytest-cov==3.0.0 -pytest-forked==1.4.0 -pytest-xdist==2.5.0 +py==1.10.0 +pycodestyle==2.7.0 +pycparser==2.20 +pyflakes==2.3.1 +Pygments==2.10.0 +pyparsing==2.4.7 +pyrsistent==0.18.0 +pytest==6.2.5 +pytest-cov==2.12.1 +pytest-forked==1.3.0 +pytest-xdist==2.3.0 python-dateutil==2.8.2 -pytz==2022.1 -PyWavelets==1.3.0 -PyYAML==6.0 -pyzmq==23.2.0 +pytz==2021.1 +PyWavelets==1.1.1 +PyYAML==5.4.1 +pyzmq==22.2.1 read-roi==1.6.0 -readme-renderer==35.0 +readme-renderer==29.0 recommonmark==0.7.1 regional==1.1.2 -requests==2.28.0 +requests==2.26.0 requests-toolbelt==0.9.1 -rfc3986==2.0.0 -rich==12.4.4 -s3transfer==0.6.0 +rfc3986==1.5.0 +s3transfer==0.5.0 scikit-image==0.18.3 -scikit-learn==1.1.1 -scipy==1.8.1 +scikit-learn==0.24.2 +scipy==1.7.1 seaborn==0.11.2 -SecretStorage==3.3.2 -semantic-version==2.10.0 +SecretStorage==3.3.1 +semantic-version==2.8.5 Send2Trash==1.8.0 -setuptools==58.1.0 +setuptools==56.0.0 showit==1.1.4 six==1.16.0 slicedimage==4.1.1 -snowballstemmer==2.2.0 -soupsieve==2.3.2.post1 -Sphinx==5.0.2 -sphinx-autodoc-typehints==1.18.3 +snowballstemmer==2.1.0 +Sphinx==4.1.2 +sphinx-autodoc-typehints==1.12.0 sphinx-bootstrap-theme==0.8.1 -sphinx-gallery==0.10.1 -sphinx-rtd-theme==1.0.0 +sphinx-gallery==0.9.0 +sphinx-rtd-theme==0.5.2 sphinxcontrib-applehelp==1.0.2 sphinxcontrib-devhelp==1.0.2 sphinxcontrib-htmlhelp==2.0.0 @@ -135,27 +126,25 @@ sphinxcontrib-jsmath==1.0.1 sphinxcontrib-programoutput==0.17 sphinxcontrib-qthelp==1.0.3 sphinxcontrib-serializinghtml==1.1.5 -stack-data==0.3.0 sympy==1.5.1 -terminado==0.15.0 -threadpoolctl==3.1.0 -tifffile==2022.5.4 -tinycss2==1.1.1 -tomli==2.0.1 +terminado==0.12.1 +testpath==0.5.0 +threadpoolctl==2.2.0 +tifffile==2021.8.30 +toml==0.10.2 tornado==6.1 -tqdm==4.64.0 +tqdm==4.62.2 trackpy==0.5.0 -traitlets==5.3.0 -twine==4.0.1 +traitlets==5.1.0 +twine==3.4.2 types-pkg-resources==0.1.3 -types-PyYAML==6.0.8 -types-requests==2.27.31 -types-urllib3==1.26.15 -typing_extensions==4.2.0 -urllib3==1.26.9 -validators==0.20.0 +types-PyYAML==5.4.10 +types-requests==2.25.6 +typing-extensions==3.10.0.2 +urllib3==1.26.6 +validators==0.18.2 wcwidth==0.2.5 webencodings==0.5.1 -widgetsnbextension==3.6.1 -xarray==2022.3.0 -zipp==3.8.0 +widgetsnbextension==3.5.1 +xarray==0.19.0 +zipp==3.5.0 \ No newline at end of file diff --git a/starfish/core/intensity_table/decoded_intensity_table.py b/starfish/core/intensity_table/decoded_intensity_table.py index 0dc98f951..091086239 100644 --- a/starfish/core/intensity_table/decoded_intensity_table.py +++ b/starfish/core/intensity_table/decoded_intensity_table.py @@ -17,18 +17,22 @@ class 
DecodedIntensityTable(IntensityTable): """ DecodedIntensityTable is a container for spot or pixel features extracted from image data that have been decoded. It is the primary output from starfish :py:class:`Decode` methods. + An IntensityTable records the numeric intensity of a set of features in each :code:`(round, channel)` tile in which the feature is identified. The :py:class:`IntensityTable` has shape :code:`(n_feature, n_channel, n_round)`. + Some :py:class:`SpotFinder` methods identify a position and search for Gaussian blobs in a small radius, only recording intensities if they are found in a given tile. Other :py:class:SpotFinder: approaches find blobs in a max-projection and measure them everywhere. As a result, some IntensityTables will be dense, and others will contain :code:`np.nan` entries where no feature was detected. + Examples -------- Create an IntensityTable using the ``synthetic_intensities`` method:: + >>> from starfish.core.test.factories import SyntheticData >>> sd = SyntheticData(n_ch=3, n_round=4, n_codes=2) >>> codes = sd.codebook() @@ -37,6 +41,7 @@ class DecodedIntensityTable(IntensityTable): array([[[ 0., 0., 0., 0.], [ 0., 0., 8022., 12412.], [11160., 9546., 0., 0.]], + [[ 0., 0., 0., 0.], [ 0., 0., 10506., 10830.], [11172., 12331., 0., 0.]]]) @@ -109,12 +114,15 @@ def to_mermaid(self, filename: str) -> pd.DataFrame: """ Writes a .csv.gz file in columnar format that is readable by MERMAID visualization software. + To run MERMAID, follow the installation instructions for that repository and simply replace the data.csv.gz file with the output of this function. + Parameters ---------- filename : str Name for compressed-gzipped MERMAID data file. Should end in '.csv.gz'. + Notes ------ See also https://github.com/JEFworks/MERmaid @@ -136,7 +144,9 @@ def to_mermaid(self, filename: str) -> pd.DataFrame: def to_expression_matrix(self) -> ExpressionMatrix: """ Generates a cell x gene count matrix where each cell is annotated with spatial metadata. + Requires that spots in the IntensityTable have been assigned to cells. + Returns ------- ExpressionMatrix : diff --git a/starfish/core/spots/DecodeSpots/check_all_decoder.py b/starfish/core/spots/DecodeSpots/check_all_decoder.py index cb4f4c808..222b2e5b1 100644 --- a/starfish/core/spots/DecodeSpots/check_all_decoder.py +++ b/starfish/core/spots/DecodeSpots/check_all_decoder.py @@ -64,9 +64,9 @@ class CheckAll(DecodeSpotsAlgorithm): confidence barcodes to be called first and make things easier when later codes are called. If error_rounds is set to 1 (currently cannot handle more than 1), after running all decodings - for barocdes that exactly match the codebook, another set of decodings will be run to find + for barcodes that exactly match the codebook, another set of decodings will be run to find barcodes that are missing a spot in exactly one round. If the codes in the codebook all have a - hamming distance of at least 2 from all other codes, each can still be uniquely indentified + hamming distance of at least 2 from all other codes, each can still be uniquely identified using a partial code with a single round dropped. Barcodes decoded with a partial code like this are inherently less accurate and so an extra dimension called "rounds_used" was added to the DecodedIntensityTable output that labels each decoded target with the number of rounds that was @@ -127,20 +127,16 @@ def run(self, """ Decode spots by finding the set of nonoverlapping barcodes that have the minimum spatial variance within each barcode. 
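A typical invocation looks like the following sketch (the constructor keywords are inferred from the attributes referenced in run() and may differ from the final public signature)::

    >>> from starfish.spots import DecodeSpots
    >>> decoder = DecodeSpots.CheckAll(codebook=codebook, search_radius=2,
    ...                                error_rounds=1)
    >>> decoded = decoder.run(spots=spots, n_processes=4)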
-
        Parameters
        ----------
        spots: SpotFindingResults
            A Dict of tile indices and their corresponding measured spots
-
        n_processes: int
            Number of threads to run decoder in parallel with
-
        Returns
        -------
        DecodedIntensityTable :
            IntensityTable decoded and appended with Features.TARGET values.
-
        """

        # Rename n_processes (trying to stay consistent between starFISH's _ variables and my
@@ -159,7 +155,6 @@ def run(self,
         counter = Counter(spotsPerRound)
         if counter[0] > self.errorRounds:
             exit('Not enough spots to form a barcode')
-        print(spotsPerRound)

         # If using physical coordinates, extract z and xy scales and check that they are all > 0
         if self.physicalCoords:
diff --git a/starfish/core/spots/DecodeSpots/check_all_funcs.py b/starfish/core/spots/DecodeSpots/check_all_funcs.py
index 5cff4dae7..e3ad187bd 100644
--- a/starfish/core/spots/DecodeSpots/check_all_funcs.py
+++ b/starfish/core/spots/DecodeSpots/check_all_funcs.py
@@ -22,19 +22,15 @@ def findNeighbors(spotTables: dict,
    '''
    Using scipy's cKDTree method, finds all neighbors within the search radius between the spots
    in each pair of rounds and stores the indices in a dictionary for later access.
-
    Parameters
    ----------
    spotTables : dict
        Dictionary with round labels as keys and pandas dataframes containing spot information
        for its key round as values (result of _merge_spots_by_round function)
-
    searchRadius : float
        Distance that spots can be from each other and still form a barcode
-
    numJobs : int
        Number of CPU threads to use in parallel
-
    Returns
    -------
    dict: a dictionary with the following structure:
@@ -57,24 +53,18 @@
    '''
    Create dictionary of neighbors (within the search radius) in other rounds for each spot.
-
    Parameters
    ----------
-
    spotTables : dict
        Dictionary with round labels as keys and pandas dataframes containing spot information
        for its key round as values (result of _merge_spots_by_round function)
-
    searchRadius : float
        Distance that spots can be from each other and still form a barcode
-
    neighborsByRadius : dict
        Dictionary of outputs from findNeighbors() where each key is a radius and the value is
        the findNeighbors dictionary
-
    Returns
    -------
-
    dict: a dictionary with the following structure
        neighborDict[roundNum][spotID] = {0 : neighbors in round 0, 1 : neighbors in round 1, etc}
    '''
@@ -112,16 +102,13 @@ def createRefDicts(spotTables: dict, numJobs: int) -> tuple:
    '''
    Create dictionaries with mapping from spot id (row index + 1) in spotTables to channel label,
    spatial coordinates, raw intensity and normalized intensity.
-
    Parameters
    ----------
    spotTables : dict
        Dictionary with round labels as keys and pandas dataframes containing spot information
        for its key round as values (result of _merge_spots_by_round function)
-
    numJobs : int
        Number of CPU threads to use in parallel
-
    Returns
    -------
    tuple : First object is the channel dictionary, second is the spatial coordinate dictionary,
@@ -158,13 +145,10 @@ def encodeSpots(spotCodes: list) -> list:
    each ID is counted and these integer lengths are concatenated into a string in the same order
    as the IDs they correspond to. The IDs themselves are then converted to strings and
    concatenated to this, also maintaining order.
-
    Parameters
    ----------
    spotCodes : list
        List of spot codes (each a tuple of integers with length equal to the number of rounds)
-
-
    Returns
    -------
    list: List of compressed spot codes, one int per code
@@ -183,15 +167,12 @@ def decodeSpots(compressed: list, roundNum: int) -> list:
    code will each correspond to the string length of each spot ID integer (as long as no round
    has 10 billion or more spots). These can be used to determine how to split the rest of the
    string to retrieve the original values in the correct order.
-
    Parameters
    ----------
    compressed : list
        List of integer values corresponding to compressed spot codes
-
    roundNum : int
        The number of rounds in the experiment
-
    Returns
    -------
    list: List of recovered spot codes in their original tuple form
@@ -213,27 +194,20 @@ def spotQualityFunc(spots: list,
    '''
    Helper function for spotQuality to run in parallel
-
    Parameters
    ----------
    spots : list
        List of spot IDs in the current round to calculate the normalized intensity of
-
    spotCoords : dict
        Spot ID to spatial coordinate dictionary
-
    spotIntensities : dict
        Spot ID to raw intensity dictionary
-
    spotTables : dict
        Dictionary containing spot info tables
-
    channelDict : dict
        Spot ID to channel label dictionary
-
    r : int
        Current round
-
    Returns
    -------
    list : list of normalized spot intensities of the input spot IDs
@@ -280,24 +254,18 @@
    Creates dictionary mapping each spot ID to its normalized intensity value. Calculated as the
    spot intensity value divided by the l2 norm of the intensities of all the spots in the same
    neighborhood.
-
    Parameters
    ----------
    spotTables : dict
        Dictionary containing spot info tables
-
    spotCoords : dict
        Spot ID to spatial coordinate dictionary
-
    spotIntensities : dict
        Spot ID to raw intensity dictionary
-
    channelDict : dict
        Spot ID to channel label dictionary
-
    numJobs : int
        Number of CPU threads to use in parallel
-
    Returns
    -------
    dict : dictionary mapping spot ID to its normalized intensity value
@@ -337,26 +305,20 @@
    '''
    Subfunction to buildBarcodes that allows it to run in parallel chunks
-
    Parameters
    ----------
    allNeighbors : list
        List of neighbors from which to build barcodes
-
    channelDict : dict
        Dictionary mapping spot IDs to their channel labels
-
    currentRound : int
        The round that the spots being used for reference points are found in
-
    roundOmitNum : int
        Maximum hamming distance a barcode can be from its target in the codebook and still be
        uniquely identified (i.e. number of error correction rounds in the experiment)
-
    roundNum : int
        Total number of rounds in experiment
-
    Returns
    -------
    list : list of the possible spot codes
@@ -395,33 +357,25 @@
    Builds possible barcodes for each seed spot from its neighbors. First checks that each spot
    has enough neighbors in each round to form a barcode and, depending on the strictness value,
    drops spots that have too many possible barcodes to choose from
-
    Parameters
    ----------
    roundData : dict
        Spot data table for the current round
-
    neighborDict : dict
        Dictionary that contains all the neighbors for each spot in other rounds that are within
        the search radius
-
    roundOmitNum : int
        Maximum hamming distance a barcode can be from its target in the codebook and still be
        uniquely identified (i.e.
        number of error correction rounds in the experiment)
-
    channelDict : dict
        Dictionary with mappings between spot IDs and their channel labels
-
    strictness: int
        Determines the number of possible codes a spot is allowed to have before it is dropped
        as ambiguous (if it is positive)
-
    currentRound : int
        Current round to build barcodes for (same round that roundData is from)
-
    numJobs : int
        Number of CPU threads to use in parallel
-
    Returns
    -------
    pd.DataFrame : Copy of roundData with an additional column which lists all the possible spot
@@ -474,15 +428,12 @@ def generateRoundPermutations(size: int, roundOmitNum: int) -> list:
    '''
    Creates list of lists of logicals detailing the rounds to be used for decoding based on the
    current roundOmitNum
-
    Parameters
    ----------
    size : int
        Number of rounds in experiment
-
    roundOmitNum: int
        Number of rounds that can be dropped from each barcode
-
    Returns
    -------
    list : list of lists of logicals detailing the rounds to be used for decoding based on
@@ -499,15 +450,12 @@ def decodeFunc(data: pd.DataFrame, permutationCodes: dict) -> tuple:
    '''
    Subfunction for decoder that allows it to run in parallel chunks
-
    Parameters
    ----------
    data : pd.DataFrame
        DataFrame with columns called 'barcodes' and 'spot_codes'
-
    permutationCodes : dict
        Dictionary containing barcode information for each roundPermutation
-
    Returns
    -------
    tuple : First element is a list of all decoded targets, second element is a list of all
@@ -547,32 +495,24 @@ decoder(roundData: pd.DataFrame,
    Function that takes spots tables with possible barcodes added and matches each to the
    codebook to identify any matches. Matches are added to the spot tables and spots without any
    matches are dropped
-
    Parameters
    ----------
    roundData : pd.DataFrame
        Modified spot table containing all possible barcodes that can be made from each spot for
        the current round
-
    codebook : Codebook
        starFISH Codebook object containing the barcode information for the experiment
-
    channelDict : dict
        Dictionary with mappings between spot IDs and their channel labels
-
    strictness : int
        Determines the number of target matching barcodes each spot is allowed before it is
        dropped as ambiguous (if it is negative)
-
    currentRoundOmitNum : int
        Number of rounds that can be dropped from each barcode
-
    currentRound : int
        Current round for which spots are being decoded
-
    numJobs : int
        Number of CPU threads to use in parallel
-
    Returns
    -------
    pd.DataFrame : Modified spot table with added columns with information on decodable
@@ -643,26 +583,20 @@ def distanceFunc(spotsAndTargets: list,
    '''
    Subfunction for distanceFilter to allow it to run in parallel
-
    Parameters
    ----------
    subSpotCodes : list
        Chunk of full list of spot codes for the current round to calculate the spatial
        variance for
-
    subSpotTargets : list
        Chunk of full list of targets (0s if strictness is positive) associated with the current
        set of spots whose spatial variance is being calculated
-
    spotCoords : dict
        Spot ID to spatial coordinate dictionary
-
    spotQualDict : dict
        Spot ID to normalized intensity value dictionary
-
    currentRoundOmitNum : int
        Number of rounds that can be dropped from each barcode
-
    Returns
    -------
    tuple: First object is the min scoring spot code for each spot, the second is the min
@@ -710,7 +644,6 @@ def distanceFilter(roundData: pd.DataFrame,
    Function that chooses the best barcode for each spot from the set of decodable barcodes.
    Does this by choosing the barcode with the least spatial variance and high intensity spots
    according to this calculation:
-
    Score = -log(1 / (1 + (numRounds - qualSum))) + (-log(1 / (1 + spaVar)) * constant)
    Where:
    numRounds = number of rounds being used for decoding (total - currentRoundOmitNum)
@@ -720,28 +653,21 @@
    constant = a constant that determines the balance between the score being more influenced by
    spatial variance or intensity, set to 2 so spatial variance is the biggest deciding factor
    but allows ties to be broken by intensity
-
    Parameters
    ----------
    roundData : pd.DataFrame
        Modified spot table containing info on decodable barcodes for the spots in the
        current round
-
    spotCoords : dict
        Spot ID to spatial coordinate dictionary
-
    spotQualDict : dict
        Spot ID to normalized intensity value dictionary
-
    currentRound : int
        Current round number to calculate distances for
-
    currentRoundOmitNum : int
        Number of rounds that can be dropped from each barcode
-
    numJobs : int
        Number of CPU threads to use in parallel
-
    Returns
    -------
    pd.DataFrame : Modified spot table with added columns with info on the "best" barcode
@@ -804,28 +730,21 @@
    filters them by their frequency (with a user-defined threshold), chooses between overlapping
    codes (using the same distance function as used earlier), and finally adds some additional
    information to the final set of barcodes
-
    Parameters
    ----------
    bestPerSpotTables : dict
        Spot tables dictionary containing columns with information on the "best" barcode found
        for each spot
-
    spotCoords : dict
        Dictionary containing spatial locations of spots
-
    channelDict : dict
        Dictionary with mapping between spot IDs and the channel labels
-
    strictness : int
        Parameter that determines how many possible barcodes each spot can have before it is
        dropped as ambiguous
-
    currentRoundOmitNum : int
        Number of rounds that can be dropped from each barcode
-
    seedNumber : int
        A barcode must be chosen as "best" in this number of rounds to pass filters
-
    Returns
    -------
    pd.DataFrame : Dataframe containing final set of codes that have passed all filters
@@ -970,16 +889,13 @@ def removeUsedSpots(finalCodes: pd.DataFrame, spotTables: dict) -> dict:
    '''
    Remove spots found to be in barcodes for the current round omission number from the
    spotTables so they are not used for the next round omission number
-
    Parameters
    ----------
    finalCodes : pd.DataFrame
        Dataframe containing final set of codes that have passed all filters
-
    spotTables : dict
        Dictionary of original data tables extracted from SpotFindingResults objects by the
        _merge_spots_by_round() function
-
    Returns
    -------
    dict : Modified version of spotTables with spots that have been used in the current round

From 30e919f0cb959f172d7592aa9665b2de3e4ce16d Mon Sep 17 00:00:00 2001
From: nickeener
Date: Fri, 22 Jul 2022 14:20:28 -0700
Subject: [PATCH 30/30] Added better error catching

---
 .../spots/DecodeSpots/check_all_decoder.py    | 22 ++++++++++++-------
 .../core/spots/DecodeSpots/check_all_funcs.py |  3 ---
 2 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/starfish/core/spots/DecodeSpots/check_all_decoder.py b/starfish/core/spots/DecodeSpots/check_all_decoder.py
index 222b2e5b1..77a3c8a05 100644
--- a/starfish/core/spots/DecodeSpots/check_all_decoder.py
+++ b/starfish/core/spots/DecodeSpots/check_all_decoder.py
@@ -1,4 +1,3 @@
-import sys
 from collections import Counter
 from copy import deepcopy
 from typing import Any, Hashable, Mapping, Tuple
@@ -112,13 +111,16 @@ def __init__(
         # Check that codebook is the right class and not empty
         if not isinstance(self.codebook, Codebook) or len(codebook) == 0:
-            sys.exit('codebook is either not a Codebook object or is empty')
+            raise ValueError(
+                'codebook is either not a Codebook object or is empty')

         # Check that error_rounds is either 0 or 1
         if self.errorRounds not in [0, 1]:
-            exit('error_rounds can only take a value of 0 or 1')
+            raise ValueError(
+                'error_rounds can only take a value of 0 or 1')

         # Return error if search radius is greater than 4.5 or negative
         if self.searchRadius < 0 or self.searchRadius > 4.5:
-            sys.exit('search_radius must be positive w/ max value of 4.5')
+            raise ValueError(
+                'search_radius must be positive w/ max value of 4.5')

     def run(self,
             spots: SpotFindingResults,
@@ -144,7 +146,8 @@ def run(self,
         numJobs = n_processes
         # Check that numJobs is a positive integer
         if numJobs < 0 or not isinstance(numJobs, int):
-            sys.exit('n_process must be a positive integer')
+            raise ValueError(
+                'n_processes must be a positive integer')

         # Create dictionary where keys are round labels and the values are pandas dataframes
         # containing information on the spots found in that round
@@ -154,7 +157,8 @@ def run(self,
         spotsPerRound = [len(spotTables[r]) for r in range(len(spotTables))]
         counter = Counter(spotsPerRound)
         if counter[0] > self.errorRounds:
-            exit('Not enough spots to form a barcode')
+            raise ValueError(
+                'Not enough spots to form a barcode')

         # If using physical coordinates, extract z and xy scales and check that they are all > 0
         if self.physicalCoords:
@@ -166,7 +170,8 @@ def run(self,
             yScale = physicalCoords['y'][1].data - physicalCoords['y'][0].data
             xScale = physicalCoords['x'][1].data - physicalCoords['x'][0].data
             if xScale <= 0 or yScale <= 0 or zScale <= 0:
-                exit('invalid physical coords')
+                raise ValueError(
+                    'invalid physical coords')

         # Add one to channels labels (prevents collisions between hashes of barcodes later), adds
         # unique spot_id column for each spot in each round, and scales the x, y, and z columns to
@@ -249,7 +254,8 @@ def run(self,
                 strictnesses.append(10)
                 seedNumbers.append(len(spotTables) - 1)
             else:
-                exit('Invalid mode choice ("high", "med", or "low")')
+                raise ValueError(
+                    'Invalid mode choice ("high", "med", or "low")')

         # Decode for each round omission number, intensity cutoff, and then search radius
         allCodes = pd.DataFrame()
diff --git a/starfish/core/spots/DecodeSpots/check_all_funcs.py b/starfish/core/spots/DecodeSpots/check_all_funcs.py
index e3ad187bd..3d36e42de 100644
--- a/starfish/core/spots/DecodeSpots/check_all_funcs.py
+++ b/starfish/core/spots/DecodeSpots/check_all_funcs.py
@@ -1,5 +1,4 @@
 import typing
-import warnings
 from collections import Counter, defaultdict
 from concurrent.futures.process import ProcessPoolExecutor
 from copy import deepcopy
@@ -13,8 +12,6 @@
 from starfish.core.codebook.codebook import Codebook
 from starfish.types import Axes

-warnings.filterwarnings('ignore')
-

 def findNeighbors(spotTables: dict,
                   searchRadius: float,
                   numJobs: int) -> dict:
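
The pairwise neighbor search that findNeighbors describes (cKDTree queries between every pair of rounds) can be sketched as below. This is an illustrative reconstruction, not the module's code: the function name and the flat dictionary layout are simplified stand-ins for the per-round structure the docstring describes.

import numpy as np
from scipy.spatial import cKDTree

def neighbors_between_rounds(coords_a: np.ndarray,
                             coords_b: np.ndarray,
                             search_radius: float) -> dict:
    # Build one KD-tree per round and ask, for every spot in round A,
    # which spots in round B fall within search_radius of it.
    tree_a = cKDTree(coords_a)
    tree_b = cKDTree(coords_b)
    hits = tree_a.query_ball_tree(tree_b, search_radius)
    # Keep only spots that actually have neighbors in the other round
    return {i: js for i, js in enumerate(hits) if js}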
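
The spot-code compression that the encodeSpots/decodeSpots docstrings describe can be made concrete with a small sketch. Names are illustrative and this is not the module's implementation; it assumes each ID's string length fits in a single digit, which is the docstring's own bound on spots per round.

def encode_spot_code(spot_code: tuple) -> int:
    # Digit-lengths of each ID first, then the IDs themselves, all
    # concatenated in round order and read back as one integer.
    ids = [str(spot_id) for spot_id in spot_code]
    return int(''.join(str(len(s)) for s in ids) + ''.join(ids))

def decode_spot_code(compressed: int, round_num: int) -> tuple:
    # The first round_num digits are the lengths; use them to split the
    # remainder back into the original spot IDs.
    s = str(compressed)
    lengths = [int(c) for c in s[:round_num]]
    rest, spot_ids, pos = s[round_num:], [], 0
    for length in lengths:
        spot_ids.append(int(rest[pos:pos + length]))
        pos += length
    return tuple(spot_ids)

assert decode_spot_code(encode_spot_code((12, 345, 6)), 3) == (12, 345, 6)

For example, (12, 345, 6) encodes to 231123456: the prefix 231 records the digit counts, and the tail 123456 splits back into 12, 345 and 6.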
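
generateRoundPermutations' description, one list of logicals per choice of rounds to use given roundOmitNum, maps naturally onto itertools.combinations. A sketch under that reading, not the module's actual code:

from itertools import combinations

def generate_round_permutations(size: int, round_omit_num: int) -> list:
    # One boolean mask per way of omitting round_omit_num rounds;
    # a single all-True mask when nothing is omitted.
    if round_omit_num == 0:
        return [(True,) * size]
    return [tuple(r not in omitted for r in range(size))
            for omitted in combinations(range(size), round_omit_num)]

# generate_round_permutations(3, 1)
# -> [(False, True, True), (True, False, True), (True, True, False)]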
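
Read with the grouping 1 / (1 + x), the distanceFilter score can be written out directly. This helper is only a worked illustration of the formula in the docstring above, with the parenthesization assumed and constant defaulting to 2 as stated:

from math import log

def barcode_score(num_rounds: int, qual_sum: float, spa_var: float,
                  constant: float = 2.0) -> float:
    # Lower is better: a barcode whose spots are perfectly bright
    # (qual_sum == num_rounds) and perfectly co-located (spa_var == 0)
    # scores exactly 0; spatial variance is weighted twice as heavily.
    return (-log(1 / (1 + (num_rounds - qual_sum)))
            + (-log(1 / (1 + spa_var)) * constant))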
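
The practical effect of PATCH 30/30 is that misconfiguration now surfaces as a catchable ValueError rather than a sys.exit call that kills the interpreter. A hypothetical usage sketch; the constructor keywords (codebook, search_radius, error_rounds) are assumed from the attributes validated in __init__ above:

from starfish.core.codebook.codebook import Codebook
from starfish.core.spots.DecodeSpots import CheckAll

def build_decoder(codebook: Codebook, radius: float) -> CheckAll:
    try:
        return CheckAll(codebook=codebook, search_radius=radius,
                        error_rounds=0)
    except ValueError as err:
        # e.g. a radius above 4.5 is now rejected with an exception the
        # caller can handle instead of terminating the process
        print(f'invalid CheckAll configuration: {err}')
        return CheckAll(codebook=codebook, search_radius=2.0,
                        error_rounds=0)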