From bcd8d46b3f29a9c221b0f186af91452908d496a8 Mon Sep 17 00:00:00 2001 From: AmandaBirmingham Date: Thu, 26 Sep 2024 13:38:06 -0700 Subject: [PATCH] added sample id instead of barcode handling, extended sample sheet parsing, fixed function name typo --- q2_surpi/_formats_and_types.py | 15 +++++++++++---- q2_surpi/_plugin.py | 25 +++++++++++++++---------- q2_surpi/plugin_setup.py | 12 ++++++++---- 3 files changed, 34 insertions(+), 18 deletions(-) diff --git a/q2_surpi/_formats_and_types.py b/q2_surpi/_formats_and_types.py index 77a2816..799b909 100644 --- a/q2_surpi/_formats_and_types.py +++ b/q2_surpi/_formats_and_types.py @@ -9,6 +9,7 @@ FAMILY_KEY = "family" TAG_KEY = "tag" SAMPLE_NAME_KEY = 'Sample_Name' +SS_SAMPLE_ID_KEY = "Sample_ID" INDEX_1_KEY = "index" INDEX_2_KEY = "index2" BARCODE_KEY = 'barcode' @@ -46,10 +47,10 @@ class SurpiSampleSheetFormat(model.TextFileFormat): """Represents a csv-delimited sample sheet file used by SURPI+.""" def _validate_(self, level): - _ = surpi_count_fp_to_df(self.path) + _ = surpi_sample_sheet_fp_to_df(self.path) -def surpi_count_fp_to_df(fp: str) -> pandas.DataFrame: +def surpi_sample_sheet_fp_to_df(fp: str) -> pandas.DataFrame: # open the file and count each line until we find one that starts with # [Data] @@ -62,8 +63,14 @@ def surpi_count_fp_to_df(fp: str) -> pandas.DataFrame: continue # endif line.startswith("[Data]") - if is_data and not line.startswith(','): - data_table_lines.append(line) + if is_data: + if line.startswith("["): + # if we've reached the beginning of the next section, stop + break + + if not line.startswith(','): + # add non-empty lines to the data table + data_table_lines.append(line) # endif is_data and not line.startswith(',') # endfor line in f # endwith self.path.open("r") as f diff --git a/q2_surpi/_plugin.py b/q2_surpi/_plugin.py index ab8fab8..cb7fb26 100644 --- a/q2_surpi/_plugin.py +++ b/q2_surpi/_plugin.py @@ -1,6 +1,7 @@ import pandas from q2_surpi._formats_and_types import FEATURE_ID_KEY, FAMILY_KEY, \ - GENUS_KEY, SPECIES_KEY, BARCODE_KEY, TAG_KEY, SAMPLE_NAME_KEY + GENUS_KEY, SPECIES_KEY, BARCODE_KEY, TAG_KEY, SAMPLE_NAME_KEY, \ + SS_SAMPLE_ID_KEY SAMPLE_ID_KEY = 'sample-id' TAXON_KEY = 'Taxon' @@ -15,7 +16,8 @@ # automagically and this will receive pandas.DataFrames as its arguments. def extract( surpi_output: pandas.DataFrame, - surpi_sample_info: pandas.DataFrame) -> \ + surpi_sample_info: pandas.DataFrame, + ids_are_barcodes: bool = True) -> \ (pandas.DataFrame, pandas.DataFrame): """Turn SURPI data into a feature table dataframe and a taxonomy dataframe. @@ -26,6 +28,9 @@ def extract( A DataFrame containing the content of a SURPI counttable [sic] file. surpi_sample_info_df : pandas.DataFrame A DataFrame containing the content of a SURPI sample sheet file. + ids_are_barcodes : bool, optional + True if the sample ids are barcodes. False if the sample ids are + sample sheet sample ids. Default is True. Returns ------- @@ -37,6 +42,8 @@ def extract( the QIIME 2 taxonomy format. """ + ss_sample_id_key = BARCODE_KEY if ids_are_barcodes else SS_SAMPLE_ID_KEY + # Generate the taxonomy result taxonomy = surpi_output[[SPECIES_KEY, GENUS_KEY, FAMILY_KEY]].copy() taxonomy[TAXON_KEY] = surpi_output.apply( @@ -56,26 +63,24 @@ def extract( surpi_feature_table_df[FEATURE_ID_KEY] = taxonomy.index surpi_feature_table_df = surpi_feature_table_df.set_index(FEATURE_ID_KEY) surpi_feature_table_df = surpi_feature_table_df.T - surpi_feature_table_df.index.name = BARCODE_KEY + surpi_feature_table_df.index.name = ss_sample_id_key surpi_feature_table_df = surpi_feature_table_df.reset_index() - feature_barcodes = surpi_feature_table_df[BARCODE_KEY].unique() + feature_barcodes = surpi_feature_table_df[ss_sample_id_key].unique() # merge the sample info with the feature table - # TODO: this is speculative code and may need to be adjusted; I don't - # know yet what the sample info looks like limited_sample_info_df = \ - surpi_sample_info[[BARCODE_KEY, SAMPLE_NAME_KEY]] + surpi_sample_info[[ss_sample_id_key, SAMPLE_NAME_KEY]] surpi_feature_table_df = surpi_feature_table_df.merge( - limited_sample_info_df, on=BARCODE_KEY, how='inner', + limited_sample_info_df, on=ss_sample_id_key, how='inner', validate='one_to_one') - identified_barcodes = surpi_feature_table_df[BARCODE_KEY].unique() + identified_barcodes = surpi_feature_table_df[ss_sample_id_key].unique() unidentified_barcodes = set(feature_barcodes) - set(identified_barcodes) if len(unidentified_barcodes) > 0: raise ValueError( f"The following barcodes were not linked to sample identifiers " f"in the sample sheet: {unidentified_barcodes}") - surpi_feature_table_df.drop(columns=[BARCODE_KEY], inplace=True) + surpi_feature_table_df.drop(columns=[ss_sample_id_key], inplace=True) surpi_feature_table_df.set_index(SAMPLE_NAME_KEY, inplace=True) surpi_feature_table_df.index.name = SAMPLE_ID_KEY diff --git a/q2_surpi/plugin_setup.py b/q2_surpi/plugin_setup.py index 70d8eb6..e10928a 100644 --- a/q2_surpi/plugin_setup.py +++ b/q2_surpi/plugin_setup.py @@ -1,12 +1,12 @@ import pandas from q2_types.feature_table import FeatureTable, Frequency from q2_types.feature_data import FeatureData, Taxonomy -from qiime2.plugin import (Plugin, Citations) +from qiime2.plugin import (Plugin, Citations, Bool) import q2_surpi from q2_surpi._formats_and_types import ( SurpiCountTable, SurpiCountTableFormat, SurpiCountTableDirectoryFormat, SurpiSampleSheet, SurpiSampleSheetFormat, SurpiSampleSheetDirectoryFormat, - surpi_count_fp_to_df) + surpi_sample_sheet_fp_to_df) plugin = Plugin( @@ -42,7 +42,7 @@ def _1(ff: SurpiCountTableFormat) -> pandas.DataFrame: @plugin.register_transformer # load a SurpiSampleSheetFormat into a dataframe def _2(ff: SurpiSampleSheetFormat) -> pandas.DataFrame: - result = surpi_count_fp_to_df(str(ff)) + result = surpi_sample_sheet_fp_to_df(str(ff)) return result @@ -70,7 +70,11 @@ def _2(ff: SurpiSampleSheetFormat) -> pandas.DataFrame: input_descriptions={ 'surpi_output': "SURPI counts per species per barcode.", 'surpi_sample_info': 'Info linking sample ids to barcodes.'}, - parameters={}, + parameters={'ids_are_barcodes': Bool}, + parameter_descriptions={ + 'ids_are_barcodes': ("True if the sample ids in the count tables are " + "barcodes. False if they are the sample sheet's " + "sample ids. Default is True.")}, outputs=[('table', FeatureTable[Frequency]), ('taxonomy', FeatureData[Taxonomy])], output_descriptions={