Fixes #431 and provides supporting code that can also be used in future for #483
biocore · May 7, 2019 · 67d5c31 · 67d5c31
1 parent 02ed687
commit 67d5c31
Show file tree

Hide file tree

Showing 2 changed files with 153 additions and 96 deletions.
diff --git a/labcontrol/db/process.py b/labcontrol/db/process.py
@@ -2998,6 +2998,51 @@ def _format_sample_sheet_comments(principal_investigator=None,
 
         return ''.join(comments)
 
+    @staticmethod
+    def _set_control_values_to_plate_value(input_df, plate_col_name,
+                                           value_col_name):
+        """Fill null values in a column with their plate's single value.
+
+        Rows whose ``value_col_name`` entry is null are treated as control
+        rows. For every distinct value in ``plate_col_name``, the non-null
+        entries of ``value_col_name`` on that plate must all be the same
+        value; that value is then written into the plate's null entries.
+
+        Parameters
+        ----------
+        input_df : pd.DataFrame
+            The dataframe to fill. It is modified in place, but only if
+            every plate passes validation.
+        plate_col_name : str
+            Name of the column that identifies the plate of each row.
+        value_col_name : str
+            Name of the column whose null entries are to be filled.
+
+        Returns
+        -------
+        pd.DataFrame
+            The same ``input_df`` object, with the null entries filled.
+
+        Raises
+        ------
+        ValueError
+            If any plate does not have exactly one unique non-null value
+            in ``value_col_name`` (i.e., it has none or more than one).
+            The message holds one line per offending plate.
+        """
+
+        problem_plate_messages = []
+
+        # Fills are collected here and applied only after every plate has
+        # been validated, so a failing call leaves input_df unmodified
+        # rather than filled for just the plates checked before the error.
+        pending_fills = []
+
+        # create a mask to define all the NON-control rows (those with a
+        # non-null value) across the whole dataframe
+        non_controls_mask = input_df[value_col_name].notnull()
+
+        # examine each unique plate in the dataframe
+        for curr_unique_plate in input_df[plate_col_name].unique():
+            # create a mask to define all the rows for this plate
+            plate_mask = input_df[plate_col_name] == curr_unique_plate
+
+            # get the unique non-control values found on this plate
+            curr_unique_values = input_df.loc[
+                plate_mask & non_controls_mask, value_col_name].unique()
+
+            if len(curr_unique_values) != 1:
+                problem_plate_messages.append(
+                    "Expected one unique value for plate '{0}' "
+                    "but received {1}: {2}".format(
+                        curr_unique_plate, len(curr_unique_values),
+                        ", ".join(str(x) for x in curr_unique_values)))
+                continue
+
+            # mask of this plate's control rows; ~ "nots" a whole series.
+            # ok to just take the first non-control value because we
+            # verified above there is only one value there anyway
+            plate_controls_mask = plate_mask & (~non_controls_mask)
+            pending_fills.append(
+                (plate_controls_mask, curr_unique_values[0]))
+        # next unique plate
+
+        if problem_plate_messages:
+            raise ValueError("\n".join(problem_plate_messages))
+
+        for plate_controls_mask, plate_value in pending_fills:
+            input_df.loc[plate_controls_mask, value_col_name] = plate_value
+
+        return input_df
+
+
     def _format_sample_sheet(self, data, sep=','):
         """Formats Illumina-compatible sample sheet.
 
@@ -3608,6 +3653,9 @@ def _generate_amplicon_prep_information(self):
                 # for content
                 data[curr_prep_sheet_id][content] = result
 
+        plate_col_name = 'Sample_Plate'
+        proj_col_name = 'Project_name'
+
         # converting from dict to pandas and then to tsv
         for curr_prep_sheet_id, vals in data.items():
             df = pd.DataFrame.from_dict(vals, orient='index')
@@ -3629,14 +3677,14 @@ def _generate_amplicon_prep_information(self):
             # 1/3. renaming columns so they match expected casing
             mv = {
                 'barcode': 'BARCODE', 'master_mix': 'MasterMix_lot',
-                'platform': 'PLATFORM', 'sample_plate': 'Sample_Plate',
+                'platform': 'PLATFORM', 'sample_plate': plate_col_name,
                 'run_prefix': 'RUN_PREFIX', 'primer_date': 'Primer_date',
                 'extraction_robot': 'Extraction_robot',
                 'runid': 'RUNID', 'epmotion_tm50_8_tool': 'TM50_8_tool',
                 'library_construction_protocol':
                     'LIBRARY_CONSTRUCTION_PROTOCOL',
                 'plating': 'Plating', 'linker': 'LINKER',
-                'project_name': 'Project_name', 'orig_name2': 'Orig_name',
+                'project_name': proj_col_name, 'orig_name2': 'Orig_name',
                 'well_id': 'Well_ID', 'water_lot': 'Water_Lot',
                 'well_description': 'Well_description',
                 'run_center': 'RUN_CENTER',
@@ -3657,16 +3705,25 @@ def _generate_amplicon_prep_information(self):
             # final output.
             df.drop(['orig_name'], axis=1)
 
+            # Ensure that each sample plate included in sequencing run does not
+            # contain experimental samples for more than (or less than) one
+            # qiita study; assuming this is true, set the project column value
+            # for each non-experimental sample to the value of the project
+            # name for the (single) qiita study on the non-experimental
+            # sample's plate.
+            df = self._set_control_values_to_plate_value(df, plate_col_name,
+                                                         proj_col_name)
+
             # 2/3. sorting rows
-            rows_order = ['Sample_Plate', 'row_num', 'col_num']
+            rows_order = [plate_col_name, 'row_num', 'col_num']
             df.sort_values(by=rows_order, inplace=True)
             # 3/3. sorting and keeping only required columns
             order = [
                 'BARCODE', 'PRIMER', 'Primer_Plate', 'Well_ID', 'Plating',
                 'ExtractionKit_lot', 'Extraction_robot', 'TM1000_8_tool',
                 'Primer_date', 'MasterMix_lot', 'Water_Lot',
                 'Processing_robot', 'TM300_8_tool', 'TM50_8_tool',
-                'Sample_Plate', 'Project_name', 'Orig_name',
+                plate_col_name, proj_col_name, 'Orig_name',
                 'Well_description', 'EXPERIMENT_DESIGN_DESCRIPTION',
                 'LIBRARY_CONSTRUCTION_PROTOCOL', 'LINKER', 'PLATFORM',
                 'RUN_CENTER', 'RUN_DATE', 'RUN_PREFIX', 'pcr_primers',