From bcd8d46b3f29a9c221b0f186af91452908d496a8 Mon Sep 17 00:00:00 2001
From: AmandaBirmingham <lists@imladris.com>
Date: Thu, 26 Sep 2024 13:38:06 -0700
Subject: [PATCH] added sample id instead of barcode handling, extended sample
 sheet parsing, fixed function name typo

---
 q2_surpi/_formats_and_types.py | 15 +++++++++++----
 q2_surpi/_plugin.py            | 25 +++++++++++++++----------
 q2_surpi/plugin_setup.py       | 12 ++++++++----
 3 files changed, 34 insertions(+), 18 deletions(-)

diff --git a/q2_surpi/_formats_and_types.py b/q2_surpi/_formats_and_types.py
index 77a2816..799b909 100644
--- a/q2_surpi/_formats_and_types.py
+++ b/q2_surpi/_formats_and_types.py
@@ -9,6 +9,7 @@
 FAMILY_KEY = "family"
 TAG_KEY = "tag"
 SAMPLE_NAME_KEY = 'Sample_Name'
+SS_SAMPLE_ID_KEY = "Sample_ID"
 INDEX_1_KEY = "index"
 INDEX_2_KEY = "index2"
 BARCODE_KEY = 'barcode'
@@ -46,10 +47,10 @@ class SurpiSampleSheetFormat(model.TextFileFormat):
     """Represents a csv-delimited sample sheet file used by SURPI+."""
 
     def _validate_(self, level):
-        _ = surpi_count_fp_to_df(self.path)
+        _ = surpi_sample_sheet_fp_to_df(self.path)
 
 
-def surpi_count_fp_to_df(fp: str) -> pandas.DataFrame:
+def surpi_sample_sheet_fp_to_df(fp: str) -> pandas.DataFrame:
     # open the file and count each line until we find one that starts with
     # [Data]
 
@@ -62,8 +63,14 @@ def surpi_count_fp_to_df(fp: str) -> pandas.DataFrame:
                 continue
             # endif line.startswith("[Data]")
 
-            if is_data and not line.startswith(','):
-                data_table_lines.append(line)
+            if is_data:
+                if line.startswith("["):
+                    # if we've reached the beginning of the next section, stop
+                    break
+
+                if not line.startswith(','):
+                    # skip comma-led (blank) rows; keep everything else
+                    data_table_lines.append(line)
             # endif is_data and not line.startswith(',')
         # endfor line in f
     # endwith self.path.open("r") as f
diff --git a/q2_surpi/_plugin.py b/q2_surpi/_plugin.py
index ab8fab8..cb7fb26 100644
--- a/q2_surpi/_plugin.py
+++ b/q2_surpi/_plugin.py
@@ -1,6 +1,7 @@
 import pandas
 from q2_surpi._formats_and_types import FEATURE_ID_KEY, FAMILY_KEY, \
-    GENUS_KEY, SPECIES_KEY, BARCODE_KEY, TAG_KEY, SAMPLE_NAME_KEY
+    GENUS_KEY, SPECIES_KEY, BARCODE_KEY, TAG_KEY, SAMPLE_NAME_KEY, \
+    SS_SAMPLE_ID_KEY
 
 SAMPLE_ID_KEY = 'sample-id'
 TAXON_KEY = 'Taxon'
@@ -15,7 +16,8 @@
 # automagically and this will receive pandas.DataFrames as its arguments.
 def extract(
         surpi_output: pandas.DataFrame,
-        surpi_sample_info: pandas.DataFrame) -> \
+        surpi_sample_info: pandas.DataFrame,
+        ids_are_barcodes: bool = True) -> \
         (pandas.DataFrame, pandas.DataFrame):
 
     """Turn SURPI data into a feature table dataframe and a taxonomy dataframe.
@@ -26,6 +28,9 @@ def extract(
         A DataFrame containing the content of a SURPI counttable [sic] file.
     surpi_sample_info_df : pandas.DataFrame
         A DataFrame containing the content of a SURPI sample sheet file.
+    ids_are_barcodes : bool, optional
+        True if the sample ids in the count table are barcodes; False if
+        they are the sample sheet's Sample_ID values. Default is True.
 
     Returns
     -------
@@ -37,6 +42,8 @@ def extract(
         the QIIME 2 taxonomy format.
     """
 
+    ss_sample_id_key = BARCODE_KEY if ids_are_barcodes else SS_SAMPLE_ID_KEY
+
     # Generate the taxonomy result
     taxonomy = surpi_output[[SPECIES_KEY, GENUS_KEY, FAMILY_KEY]].copy()
     taxonomy[TAXON_KEY] = surpi_output.apply(
@@ -56,26 +63,24 @@ def extract(
     surpi_feature_table_df[FEATURE_ID_KEY] = taxonomy.index
     surpi_feature_table_df = surpi_feature_table_df.set_index(FEATURE_ID_KEY)
     surpi_feature_table_df = surpi_feature_table_df.T
-    surpi_feature_table_df.index.name = BARCODE_KEY
+    surpi_feature_table_df.index.name = ss_sample_id_key
     surpi_feature_table_df = surpi_feature_table_df.reset_index()
-    feature_barcodes = surpi_feature_table_df[BARCODE_KEY].unique()
+    feature_barcodes = surpi_feature_table_df[ss_sample_id_key].unique()
 
     # merge the sample info with the feature table
-    # TODO: this is speculative code and may need to be adjusted; I don't
-    #  know yet what the sample info looks like
     limited_sample_info_df = \
-        surpi_sample_info[[BARCODE_KEY, SAMPLE_NAME_KEY]]
+        surpi_sample_info[[ss_sample_id_key, SAMPLE_NAME_KEY]]
     surpi_feature_table_df = surpi_feature_table_df.merge(
-        limited_sample_info_df, on=BARCODE_KEY, how='inner',
+        limited_sample_info_df, on=ss_sample_id_key, how='inner',
         validate='one_to_one')
-    identified_barcodes = surpi_feature_table_df[BARCODE_KEY].unique()
+    identified_barcodes = surpi_feature_table_df[ss_sample_id_key].unique()
     unidentified_barcodes = set(feature_barcodes) - set(identified_barcodes)
     if len(unidentified_barcodes) > 0:
         raise ValueError(
             f"The following barcodes were not linked to sample identifiers "
             f"in the sample sheet: {unidentified_barcodes}")
 
-    surpi_feature_table_df.drop(columns=[BARCODE_KEY], inplace=True)
+    surpi_feature_table_df.drop(columns=[ss_sample_id_key], inplace=True)
     surpi_feature_table_df.set_index(SAMPLE_NAME_KEY, inplace=True)
     surpi_feature_table_df.index.name = SAMPLE_ID_KEY
 
diff --git a/q2_surpi/plugin_setup.py b/q2_surpi/plugin_setup.py
index 70d8eb6..e10928a 100644
--- a/q2_surpi/plugin_setup.py
+++ b/q2_surpi/plugin_setup.py
@@ -1,12 +1,12 @@
 import pandas
 from q2_types.feature_table import FeatureTable, Frequency
 from q2_types.feature_data import FeatureData, Taxonomy
-from qiime2.plugin import (Plugin, Citations)
+from qiime2.plugin import (Plugin, Citations, Bool)
 import q2_surpi
 from q2_surpi._formats_and_types import (
     SurpiCountTable, SurpiCountTableFormat, SurpiCountTableDirectoryFormat,
     SurpiSampleSheet, SurpiSampleSheetFormat, SurpiSampleSheetDirectoryFormat,
-    surpi_count_fp_to_df)
+    surpi_sample_sheet_fp_to_df)
 
 
 plugin = Plugin(
@@ -42,7 +42,7 @@ def _1(ff: SurpiCountTableFormat) -> pandas.DataFrame:
 @plugin.register_transformer
 # load a SurpiSampleSheetFormat into a dataframe
 def _2(ff: SurpiSampleSheetFormat) -> pandas.DataFrame:
-    result = surpi_count_fp_to_df(str(ff))
+    result = surpi_sample_sheet_fp_to_df(str(ff))
     return result
 
 
@@ -70,7 +70,11 @@ def _2(ff: SurpiSampleSheetFormat) -> pandas.DataFrame:
     input_descriptions={
         'surpi_output': "SURPI counts per species per barcode.",
         'surpi_sample_info': 'Info linking sample ids to barcodes.'},
-    parameters={},
+    parameters={'ids_are_barcodes': Bool},
+    parameter_descriptions={
+        'ids_are_barcodes': ("True if the sample ids in the count tables are "
+                             "barcodes. False if they are the sample sheet's "
+                             "sample ids. Default is True.")},
     outputs=[('table', FeatureTable[Frequency]),
              ('taxonomy', FeatureData[Taxonomy])],
     output_descriptions={