From 9e058207709cc1c168898e9d572e6a434c09a9a0 Mon Sep 17 00:00:00 2001 From: Sebastian Krautwurst Date: Mon, 8 Mar 2021 14:14:03 +0100 Subject: [PATCH] force dtype string in data index --- bin/summary_report.py | 20 ++++++++++++++------ workflows/process/summary_report.nf | 2 +- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/bin/summary_report.py b/bin/summary_report.py index 9f3dca8..36d1c2c 100755 --- a/bin/summary_report.py +++ b/bin/summary_report.py @@ -127,10 +127,11 @@ def check_and_init_tabledata(self, t_index): if self.tabledata is None: self.tabledata = pd.DataFrame(index=sorted(t_index)) self.tabledata.columns.name = 'Sample' + self.force_index_dtype_string(self.tabledata) self.add_col_description(f'Missing values (\'n/a\') denote cases where the respective program could not determine a result.') else: for item in t_index: - assert item in self.tabledata.index, f'Index not found in existing table: {item}' + assert item in self.tabledata.index, f'Index not found in existing table: {item}. Available: {self.tabledata.index}' def add_col_description(self, desc): @@ -242,11 +243,15 @@ def write_html_report(self): ### functions to add columns + def force_index_dtype_string(self, dataframe): + dataframe.index = dataframe.index.astype('string') + def add_pangolin_results(self, pangolin_results): log(f'Adding Pangolin results ...') # column names: # taxon,lineage,probability,pangoLEARN_version,status,note - res_data = pd.read_csv(pangolin_results, index_col='taxon') + res_data = pd.read_csv(pangolin_results, index_col='taxon', dtype={'taxon': str}) + self.force_index_dtype_string(res_data) self.check_and_init_tabledata(res_data.index) res_data['lineage_prob'] = [f'{l}
({p:.2f})' for l,p in zip(res_data['lineage'], res_data['probability'])] @@ -258,7 +263,8 @@ def add_pangolin_results(self, pangolin_results): def add_president_results(self, president_results): log(f'Adding President results ...') - res_data = pd.read_csv(president_results, index_col='query_name', sep='\t') + res_data = pd.read_csv(president_results, index_col='query_name', sep='\t', dtype={'query_name': str}) + self.force_index_dtype_string(res_data) self.check_and_init_tabledata(res_data.index) def identity_markup(ident, mismatches): @@ -308,7 +314,8 @@ def percN_markup(nn, ql): def add_nextclade_results(self, nextclade_results): log(f'Adding Nextclade results ...') - res_data = pd.read_csv(nextclade_results, index_col='seqName', sep='\t') + res_data = pd.read_csv(nextclade_results, index_col='seqName', sep='\t', dtype={'seqName': str}) + self.force_index_dtype_string(res_data) self.check_and_init_tabledata(res_data.index) res_data['mutations_formatted'] = [m.replace(',', ', ') if type(m) == str else '-' for m in res_data['aaSubstitutions']] @@ -342,8 +349,9 @@ def spike_markup(field): def add_kraken2_results(self, kraken2_results): log(f'Adding Kraken2 results ...') # column names: - #sample,num_sarscov2,num_human - res_data = pd.read_csv(kraken2_results, index_col=0) + # sample,num_unclassified,num_sarscov2,num_human + res_data = pd.read_csv(kraken2_results, index_col='sample', dtype={'sample': str}) + self.force_index_dtype_string(res_data) self.check_and_init_tabledata(res_data.index) def readable_si_units(number): diff --git a/workflows/process/summary_report.nf b/workflows/process/summary_report.nf index 51f9a52..dca2bb9 100644 --- a/workflows/process/summary_report.nf +++ b/workflows/process/summary_report.nf @@ -23,7 +23,7 @@ process summary_report { ''' else ''' - echo '#sample,num_unclassified,num_sarscov2,num_human' > kraken2_results.csv + echo 'sample,num_unclassified,num_sarscov2,num_human' > kraken2_results.csv for KF in !{kraken2_results}; do echo -n "${KF%.kreport}," >> kraken2_results.csv NUNCLASS=$(awk -v ORS= '$5=="0" {print $3 "," }' $KF)