Merge pull request #222 from tanaes/fix-amplicon-pooling

Fix amplicon pooling (issue #204)
biocore · May 4, 2018 · a085877 · a085877
2 parents 7b44af9 + e12abd4
commit a085877
Show file tree

Hide file tree

Showing 10 changed files with 575 additions and 379 deletions.
diff --git a/labman/db/process.py b/labman/db/process.py
@@ -57,7 +57,7 @@ def factory(process_id):
             'shotgun library prep': LibraryPrepShotgunProcess,
             'quantification': QuantificationProcess,
             'gDNA normalization': NormalizationProcess,
-            'compress gDNA plates': GDNAPlateCompressionProcess,
+            'compressed gDNA plates': GDNAPlateCompressionProcess,
             'pooling': PoolingProcess,
             'sequencing': SequencingProcess}
 
@@ -605,7 +605,7 @@ class GDNAPlateCompressionProcess(Process):
     """
     _table = 'qiita.compression_process'
     _id_column = 'compression_process_id'
-    _process_type = "compress gDNA plates"
+    _process_type = "compressed gDNA plates"
 
     def _compress_plate(self, out_plate, in_plate, row_pad, col_pad, volume=1):
         """Compresses the 96-well in_plate into the 384-well out_plate"""
@@ -1575,7 +1575,7 @@ class QuantificationProcess(Process):
     _process_type = 'quantification'
 
     @staticmethod
-    def _compute_shotgun_pico_concentration(dna_vals, size=500):
+    def _compute_pico_concentration(dna_vals, size=500):
         """Computes molar concentration of libraries from library DNA
         concentration values.
 
@@ -1831,64 +1831,29 @@ def concentrations(self):
                 (composition_module.Composition.factory(comp_id), r_con, c_con)
                 for comp_id, r_con, c_con in TRN.execute_fetchindex()]
 
-    def compute_concentrations(self, dna_amount=240, min_val=1, max_val=15,
-                               blank_volume=2, size=500):
-        """Compute the normalized concentrations
+    def compute_concentrations(self, size=500):
+        """Compute the normalized library molarity based on pico green dna
+        concentrations estimates.
 
         Parameters
         ----------
-        dna_amount: float, optional
-            (Amplicon) Total amount of DNA, in ng. Default: 240
-        min_val: float, optional
-            (Amplicon) Minimum amount of DNA to normalize to (nM). Default: 1
-        max_val: float, optional
-            (Amplicon) Maximum value. Wells above this number will be
-            excluded (nM). Default: 15
-        blank_volume: float, optional
-            (Amplicon) Amount to pool for the blanks (nM). Default: 2.
         size: int, optional
-            (Shotgun) The average library molecule size, in bp.
+            The average library molecule size, in bp.
         """
         concentrations = self.concentrations
         layout = concentrations[0][0].container.plate.layout
 
         res = None
-        if isinstance(concentrations[0][0],
-                      composition_module.LibraryPrep16SComposition):
-            # Amplicon
-            sample_concs = np.zeros_like(layout, dtype=float)
-            is_blank = np.zeros_like(layout, dtype=bool)
-            for comp, r_conc, _ in concentrations:
-                well = comp.container
-                row = well.row - 1
-                col = well.column - 1
-                sample_concs[row][col] = r_conc
-                sc = comp.gdna_composition.sample_composition
-                is_blank[row][col] = sc.sample_composition_type == 'blank'
-
-            res = QuantificationProcess._compute_amplicon_pool_values(
-                sample_concs, dna_amount)
-            res[sample_concs < min_val] = min_val
-            # If there is any sample whose concentration is above the
-            # user-defined max_value, the decision is to not pool that sample.
-            # To not pool the sample, define it's volume to 0 and it will not
-            # get pooled.
-            res[sample_concs > max_val] = 0
-            res[is_blank] = blank_volume
-        elif isinstance(concentrations[0][0],
-                        composition_module.LibraryPrepShotgunComposition):
-            # Shotgun
-            sample_concs = np.zeros_like(layout, dtype=float)
-            for comp, r_conc, _ in concentrations:
-                well = comp.container
-                row = well.row - 1
-                col = well.column - 1
-                sample_concs[row][col] = r_conc
-
-            res = QuantificationProcess._compute_shotgun_pico_concentration(
-                sample_concs, size)
-        # No need for else, because if it is not one of the above types
-        # we don't need to do anything
+
+        sample_concs = np.zeros_like(layout, dtype=float)
+        for comp, r_conc, _ in concentrations:
+            well = comp.container
+            row = well.row - 1
+            col = well.column - 1
+            sample_concs[row][col] = r_conc
+
+        res = QuantificationProcess._compute_pico_concentration(
+            sample_concs, size)
 
         if res is not None:
             sql_args = []
@@ -1971,7 +1936,7 @@ def estimate_pool_conc_vol(sample_vols, sample_concs):
         return (pool_conc, total_vol)
 
     @staticmethod
-    def compute_shotgun_pooling_values_eqvol(sample_concs, total_vol=60.0):
+    def compute_pooling_values_eqvol(sample_concs, total_vol=60.0, **kwargs):
         """Computes molar concentration of libraries from concentration values,
         using an even volume per sample
 
@@ -1992,9 +1957,9 @@ def compute_shotgun_pooling_values_eqvol(sample_concs, total_vol=60.0):
         return sample_vols
 
     @staticmethod
-    def compute_shotgun_pooling_values_minvol(
-            sample_concs, sample_fracs=None, floor_vol=100, floor_conc=40,
-            total_nmol=.01):
+    def compute_pooling_values_minvol(
+            sample_concs, sample_fracs=None, floor_vol=2, floor_conc=16,
+            total=240, total_each=True, vol_constant=1, **kwargs):
         """Computes pooling volumes for samples based on concentration
         estimates of nM concentrations (`sample_concs`), taking a minimum
         volume of samples below a threshold.
@@ -2010,7 +1975,7 @@ def compute_shotgun_pooling_values_minvol(
         pooling.
 
         Finally, total pooling size is determined by a target nanomolar
-        quantity (`total_nmol`, default .01). For a perfect 384 sample library,
+        quantity (`total`, default .01). For a perfect 384 sample library,
         in which you had all samples at a concentration of exactly 400 nM and
         wanted a total volume of 60 uL, this would be 0.024 nmol.
 
@@ -2022,90 +1987,113 @@ def compute_shotgun_pooling_values_minvol(
         Parameters
         ----------
         sample_concs: 2D array of float
-            nM sample concentrations
+            sample concentrations, with numerator same units as `total`.
         sample_fracs: 2D of float, optional
             fractional value for each sample (default 1/N)
         floor_vol: float, optional
-            volume (nL) at which samples below floor_conc will be pooled.
+            volume at which samples below floor_conc will be pooled.
             Default: 100
         floor_conc: float, optional
-            minimum value (nM) for pooling at real estimated value. Default: 40
-        total_nmol : float, optional
-            total number of nM to have in pool. Default: 0.01
+            minimum value for pooling at real estimated value. Default: 40
+        total : float, optional
+            total quantity (numerator) for pool. Unitless, but could represent
+            for example ng or nmol. Default: 240
+        total_each : bool, optional
+            whether `total` refers to the quantity pooled *per sample*
+            (default; True) or to the total quantity of the pool.
+        vol_constant : float, optional
+            conversion factor between `sample_concs` demoninator and output
+            pooling volume units. E.g. if pooling ng/µL concentrations and
+            producing µL pool volumes, `vol_constant` = 1. If pooling nM
+            concentrations and producing nL pool volumes, `vol_constant` =
+            10**-9. Default: 1
 
         Returns
         -------
         sample_vols: np.array of floats
             the volumes in nL per each sample pooled
         """
         if sample_fracs is None:
-            sample_fracs = np.ones(sample_concs.shape) / sample_concs.size
+            sample_fracs = np.ones(sample_concs.shape)
+
+        if not total_each:
+            sample_fracs = sample_fracs / sample_concs.size
 
         # calculate volumetric fractions including floor val
-        sample_vols = (total_nmol * sample_fracs) / sample_concs
-        # convert L to nL
-        sample_vols *= 10**9
+        sample_vols = (total * sample_fracs) / sample_concs
+        # convert volume from concentration units to pooling units
+        sample_vols *= vol_constant
         # drop volumes for samples below floor concentration to floor_vol
         sample_vols[sample_concs < floor_conc] = floor_vol
         return sample_vols
 
     @staticmethod
-    def compute_shotgun_pooling_values_floor(
-            sample_concs, sample_fracs=None, min_conc=10, floor_conc=50,
-            total_nmol=.01):
-        """Computes pooling volumes for samples based on concentration
-        estimates of nM concentrations (`sample_concs`).
+    def adjust_blank_vols(pool_vols, comp_blanks, blank_vol):
+        """Specifically adjust blanks to a value specified volume
 
-        Reads in concentration values in nM. Samples must be above a minimum
-        concentration threshold (`min_conc`, default 10 nM) to be included.
-        Samples above this threshold but below a given floor concentration
-        (`floor_conc`, default 50 nM) will be pooled as if they were at the
-        floor concentration, to avoid overdiluting the pool.
+        Parameters
+        ----------
+        pool_vols: np.array
+            The per-well pool volumes
+        comp_blanks: np.array of bool
+            Boolean array indicating which wells are blanks
+        blank_vol: float
+            Volume at which to pool blanks
 
-        Samples can be assigned a target molar fraction in the pool by passing
-        a np.array (`sample_fracs`, same shape as `sample_concs`) with
-        fractional values per sample. By default, will aim for equal molar
-        pooling.
+        Returns
+        -------
+        np.array
+            The adjusted per-well pool volumes
+        """
 
-        Finally, total pooling size is determined by a target nanomolar
-        quantity (`total_nmol`, default .01). For a perfect 384 sample library,
-        in which you had all samples at a concentration of exactly 400 nM and
-        wanted a total volume of 60 uL, this would be 0.024 nmol.
+        pool_vols[comp_blanks] = blank_vol
+
+        return(pool_vols)
+
+    @staticmethod
+    def select_blanks(pool_vols, raw_concs, comp_blanks, blank_num):
+        """Specifically retain only the N most concentrated blanks
 
         Parameters
         ----------
-        sample_concs: 2D array of float
-            nM calculated by compute_qpcr_concentration
-        sample_fracs: 2D of float, optional
-            fractional value for each sample (default 1/N)
-        min_conc: float, optional
-            minimum nM concentration to be included in pool. Default: 10
-        floor_conc: float, optional
-            minimum value for pooling for samples above min_conc. Default: 50
-        total_nmol : float, optional
-            total number of nM to have in pool. Default 0.01
+        pool_vols: np.array
+            The per-well pool volumes
+        raw_concs: np.array of float
+            The per-well concentrations
+        comp_blanks: np.array of bool
+            Boolean array indicating which wells are blanks
+        blank_num: int
+            The number of blanks N to pool (in order of highest concentration)
 
         Returns
         -------
-        sample_vols: np.array of floats
-            the volumes in nL per each sample pooled
+        np.array
+            The adjusted per-well pool volumes
         """
-        if sample_fracs is None:
-            sample_fracs = np.ones(sample_concs.shape) / sample_concs.size
-
-        # get samples above threshold
-        sample_fracs_pass = sample_fracs.copy()
-        sample_fracs_pass[sample_concs <= min_conc] = 0
-        # renormalize to exclude lost samples
-        sample_fracs_pass *= 1/sample_fracs_pass.sum()
-        # floor concentration value
-        sample_concs_floor = sample_concs.copy()
-        sample_concs_floor[sample_concs < floor_conc] = floor_conc
-        # calculate volumetric fractions including floor val
-        sample_vols = (total_nmol * sample_fracs_pass) / sample_concs_floor
-        # convert L to nL
-        sample_vols *= 10**9
-        return sample_vols
+
+        if blank_num < 0:
+            raise ValueError("blank_num cannot be negative (passed: %s)" %
+                             blank_num)
+
+        if comp_blanks.shape != pool_vols.shape != raw_concs.shape:
+            raise ValueError("all input arrays must be same shape")
+
+        blanks = []
+
+        adjusted_vols = pool_vols.copy()
+
+        for index, x in np.ndenumerate(comp_blanks):
+            if x:
+                blanks.append((raw_concs[index], index))
+
+        sorted_blanks = sorted(blanks, key=lambda tup: tup[0], reverse=True)
+
+        reject_blanks = sorted_blanks[blank_num:]
+
+        for _, idx in reject_blanks:
+            adjusted_vols[idx] = 0
+
+        return(adjusted_vols)
 
     @classmethod
     def create(cls, user, quantification_process, pool_name, volume,

diff --git a/labman/db/support_files/db_patch_manual.sql b/labman/db/support_files/db_patch_manual.sql
@@ -34,7 +34,7 @@ INSERT INTO qiita.process_type (description) VALUES
     ('primer template creation'), ('primer working plate creation'),
     ('sample plating'), ('gDNA extraction'), ('16S library prep'),
     ('shotgun library prep'), ('quantification'), ('gDNA normalization'),
-    ('pooling'), ('reagent creation'), ('sequencing'), ('compress gDNA plates');
+    ('pooling'), ('reagent creation'), ('sequencing'), ('compressed gDNA plates');
 
 -- Populate the equipment type table
 INSERT INTO qiita.equipment_type (description) VALUES

diff --git a/labman/db/support_files/populate_test_db.sql b/labman/db/support_files/populate_test_db.sql
@@ -571,9 +571,10 @@ BEGIN
         RETURNING process_id INTO p_pool_process_id;
 
     INSERT INTO qiita.pooling_process (process_id, quantification_process_id, robot_id, destination, pooling_function_data)
-        VALUES (p_pool_process_id, pg_quant_subprocess_id, proc_robot_id, 1, '{"function": "amplicon", "parameters": {"dna-amount-": 240, "min-val-": 1, "max-val-": 15, "blank-val-": 2}}'::json)
+        VALUES (p_pool_process_id, pg_quant_subprocess_id, proc_robot_id, 1, '{"function": "amplicon", "parameters": {"total-": 240, "floor-vol-": 2, "floor-conc-": 16}}'::json)
         RETURNING pooling_process_id INTO p_pool_subprocess_id;
 
+
     ----------------------------------------
     ------ SEQUENCING POOLING PROCESS ------
     ----------------------------------------
@@ -660,7 +661,7 @@ BEGIN
 
     -- Quantify plate pools
     INSERT INTO qiita.concentration_calculation (quantitated_composition_id, upstream_process_id, raw_concentration)
-        VALUES (p_pool_composition_id, ppg_quant_subprocess_id, 1.5);
+        VALUES (p_pool_composition_id, ppg_quant_subprocess_id, 25);
 
     -- Pool sequencing run
     INSERT INTO qiita.container (container_type_id, latest_upstream_process_id, remaining_volume)
@@ -721,7 +722,7 @@ BEGIN
 
     SELECT process_type_id INTO gdna_comp_process_type_id
         FROM qiita.process_type
-        WHERE description = 'compress gDNA plates';
+        WHERE description = 'compressed gDNA plates';
 
     INSERT INTO qiita.process (process_type_id, run_date, run_personnel_id)
         VALUES (gdna_comp_process_type_id, '10/25/2017', '[email protected]')
@@ -964,8 +965,13 @@ BEGIN
                 VALUES (lib_prep_16s_composition_id, gdna_subcomposition_id, primer_comp_id);
 
             -- Quantification
-            INSERT INTO qiita.concentration_calculation (quantitated_composition_id, upstream_process_id, raw_concentration, computed_concentration)
-                VALUES (lib_prep_16s_composition_id, pg_quant_subprocess_id, 1.5, 1.5);
+            IF idx_row_well <= 7 THEN
+                INSERT INTO qiita.concentration_calculation (quantitated_composition_id, upstream_process_id, raw_concentration, computed_concentration)
+                    VALUES (lib_prep_16s_composition_id, pg_quant_subprocess_id, 20., 60.6060);
+            ELSE
+                INSERT INTO qiita.concentration_calculation (quantitated_composition_id, upstream_process_id, raw_concentration, computed_concentration)
+                    VALUES (lib_prep_16s_composition_id, pg_quant_subprocess_id, 1., 3.0303);
+            END IF;
 
             -- Pool plate
             INSERT INTO qiita.pool_composition_components (output_pool_composition_id, input_composition_id, input_volume, percentage_of_output)

diff --git a/labman/db/tests/test_composition.py b/labman/db/tests/test_composition.py
@@ -312,7 +312,7 @@ def test_pool_composition_attributes(self):
         exp = {'composition': LibraryPrep16SComposition(1),
                'input_volume': 1.0, 'percentage_of_output': 0}
         self.assertEqual(obs_comp[0], exp)
-        self.assertEqual(obs.raw_concentration, 1.5)
+        self.assertEqual(obs.raw_concentration, 25.0)
 
     def test_primer_set_attributes(self):
         obs = PrimerSet(1)