From daca1f599e9b0e6cbbbbce73c79ddc06675f66b1 Mon Sep 17 00:00:00 2001 From: Luke Thompson Date: Tue, 6 Dec 2016 05:26:38 -0800 Subject: [PATCH 1/7] new script to calculate OTU distribution statistics --- scripts/summarize_otu_distributions.py | 95 ++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100755 scripts/summarize_otu_distributions.py diff --git a/scripts/summarize_otu_distributions.py b/scripts/summarize_otu_distributions.py new file mode 100755 index 0000000..a3b8b9a --- /dev/null +++ b/scripts/summarize_otu_distributions.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python + +# ---------------------------------------------------------------------------- +# Copyright (c) 2016, The Deblur Development Team. +# +# Distributed under the terms of the BSD 3-clause License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- + +import click +import pandas as pd +import numpy as np +import biom + +@click.command() +@click.option('--input_table', '-i', required=True, + type=click.Path(resolve_path=True, readable=True, exists=True, + file_okay=True), + help="Input rarefied OTU table (.biom)") +@click.option('--output_summary', '-o', required=True, + type=click.Path(resolve_path=True, readable=True, exists=False, + file_okay=True), + help="Output OTU summary (.tsv)") + +def make_otu_summary(input_table, output_summary): + """Summarize distribution information about each OTU (sequnece) in a Deblur + biom table. + + Input biom table must be rarefied for results to be meaningful.""" + + # Read OTU table (must be rarefied) + table = biom.load_table(input_table) + num_samples = len(table.ids(axis='sample')) + + # Get arrays of sample IDs and OTUs (sequences), dicts per OTU of total + # observations, number of samples, list of samples, and taxonomy + otu_total_obs = {} + otu_num_samples = {} + otu_list_samples = {} + samples = table.ids(axis='sample') + otus = table.ids(axis='observation') + for idx, cdat in enumerate(table.iter_data(axis='observation')): + otu_total_obs[otus[idx]] = np.sum(cdat) + otu_num_samples[otus[idx]] = np.sum(cdat > 0) + otu_list_samples[otus[idx]] = samples[np.where(cdat > 0)[0]] + otu_tax = {i: '; '.join(md['taxonomy']) for v, i, md in table.iter( + axis='observation')} + + # Create Pandas DataFrame of index, sequence, total_obs, num_samples, + # list_samples + df_otus = pd.DataFrame(index=np.arange(len(otus))) + df_otus['sequence'] = [otus[i] for i in df_otus.index] + df_otus['total_obs'] = [otu_total_obs[seq] for seq in df_otus.sequence] + df_otus['num_samples'] = [otu_num_samples[seq] for seq in df_otus.sequence] + df_otus['list_samples'] = \ + [','.join(otu_list_samples[seq]) for seq in df_otus.sequence] + df_otus['taxonomy'] = [otu_tax[seq] for seq in df_otus.sequence] + + # Add columns for total_obs_rank and num_samples_rank + # sort by total_obs, reset index, rename index to total_obs + df_otus = df_otus.sort_values('total_obs', ascending=False).reset_index( + drop=True) + df_otus.index.rename('total_obs_rank', inplace=True) + # sort by num_samples, reset index, rename index to total_obs + df_otus = df_otus.sort_values('num_samples', ascending=False).reset_index( + drop=False) + df_otus.index.rename('num_samples_rank', inplace=True) + # keep sorted by num_samples, reset index + df_otus = df_otus.reset_index(drop=False) + + # Add columns for total_obs_percent and num_samples_percent + df_otus['total_obs_frac'] = df_otus['total_obs']/df_otus['total_obs'].sum() + df_otus['num_samples_frac'] = df_otus['num_samples'] / num_samples + + # Add 1 to the rank so they are true rank and not python-style + df_otus['num_samples_rank'] = df_otus['num_samples_rank'] + 1 + df_otus['total_obs_rank'] = df_otus['total_obs_rank'] + 1 + + # Reorder columns + df_otus = df_otus[['sequence', + 'num_samples', + 'num_samples_frac', + 'num_samples_rank', + 'total_obs', + 'total_obs_rank', + 'total_obs_frac', + 'taxonomy', + 'list_samples']] + + # Write to tsv + df_otus.to_csv(output_summary, sep='\t') + +if __name__ == '__main__': + make_otu_summary() From 8ba14beef2298116ea62a796708139c77449f539 Mon Sep 17 00:00:00 2001 From: Luke Thompson Date: Tue, 6 Dec 2016 08:34:31 -0800 Subject: [PATCH 2/7] using more standard variable names --- scripts/summarize_otu_distributions.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/summarize_otu_distributions.py b/scripts/summarize_otu_distributions.py index a3b8b9a..16acc88 100755 --- a/scripts/summarize_otu_distributions.py +++ b/scripts/summarize_otu_distributions.py @@ -14,23 +14,23 @@ import biom @click.command() -@click.option('--input_table', '-i', required=True, +@click.option('--input_biom_fp', '-i', required=True, type=click.Path(resolve_path=True, readable=True, exists=True, file_okay=True), help="Input rarefied OTU table (.biom)") -@click.option('--output_summary', '-o', required=True, +@click.option('--output_summary_fp', '-o', required=True, type=click.Path(resolve_path=True, readable=True, exists=False, file_okay=True), help="Output OTU summary (.tsv)") -def make_otu_summary(input_table, output_summary): +def make_otu_summary(input_biom_fp, output_summary_fp): """Summarize distribution information about each OTU (sequnece) in a Deblur biom table. Input biom table must be rarefied for results to be meaningful.""" # Read OTU table (must be rarefied) - table = biom.load_table(input_table) + table = biom.load_table(input_biom_fp) num_samples = len(table.ids(axis='sample')) # Get arrays of sample IDs and OTUs (sequences), dicts per OTU of total @@ -89,7 +89,7 @@ def make_otu_summary(input_table, output_summary): 'list_samples']] # Write to tsv - df_otus.to_csv(output_summary, sep='\t') + df_otus.to_csv(output_summary_fp, sep='\t') if __name__ == '__main__': make_otu_summary() From 54b3c91250bb8c426eba101a375fa6faf4022a39 Mon Sep 17 00:00:00 2001 From: Luke Thompson Date: Tue, 6 Dec 2016 08:35:09 -0800 Subject: [PATCH 3/7] script to check amplicon type using deblur fasta --- scripts/verify_amplicon_type.py | 83 +++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100755 scripts/verify_amplicon_type.py diff --git a/scripts/verify_amplicon_type.py b/scripts/verify_amplicon_type.py new file mode 100755 index 0000000..eea9158 --- /dev/null +++ b/scripts/verify_amplicon_type.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python + +# ---------------------------------------------------------------------------- +# Copyright (c) 2016, The Deblur Development Team. +# +# Distributed under the terms of the BSD 3-clause License. +# +# The full license is in the file LICENSE, distributed with this software. +# ---------------------------------------------------------------------------- + +import click +import numpy as np +import pandas as pd + +def count_starting_kmers(input_fasta_fp, num_seqs, seed): + """Generate value_counts dataframe of starting k-mers for random subsample + of a fasta file""" + kmer_length = 4 + if seed: + np.random.seed(seed) + starting_kmers = [] + with open(input_fasta_fp) as handle: + lines = pd.Series(handle.readlines()) + num_lines = len(lines) + if num_lines/2 < num_seqs: + rand_line_nos = np.random.choice(np.arange(1,num_lines,2), + size=num_seqs, replace=True) + else: + rand_line_nos = np.random.choice(np.arange(1,num_lines,2), + size=num_seqs, replace=False) + rand_lines = lines[rand_line_nos] + for sequence in rand_lines: + starting_kmers.append(sequence[:kmer_length]) + starting_kmer_value_counts = pd.Series(starting_kmers).value_counts() + return(starting_kmer_value_counts) + +@click.command() +@click.option('--input_fasta_fp', '-f', required=True, + type=click.Path(resolve_path=True, readable=True, exists=True, + file_okay=True), + help="Input fasta file from Deblur (.fa, .fna, .fasta)") +@click.option('--num_seqs', '-n', required=False, type=int, default=10000, + help="Number of sequences to randomly subsample [default: 10000]") +@click.option('--cutoff', '-c', required=False, type=float, default=0.5, + help="Minimum fraction of sequences required to match " + "a diagnostic k-mer [default: 0.5]") +@click.option('--seed', '-s', required=False, type=int, + help="Random number seed [default: None]") + +def verify_amplicon_type(input_fasta_fp, num_seqs, cutoff, seed): + """Determine the most likely amplicon type of a fasta file based on the + first few nucleotides. + + The most frequent starting k-mer in a random subsample of sequences must + match, above a given cutoff fraction of sequences, one of the following + diagnostic k-mers: + + k-mer\tAmplicon\tForward primer + TACG\t16S rRNA\t515f + NNNN\t18S rRNA\tEuk1391f + NNNN\tITS rRNA\tITS1f + """ + starting_kmer_value_counts = count_starting_kmers(input_fasta_fp, num_seqs, + seed) + top_kmer = starting_kmer_value_counts.index[0] + top_kmer_count = starting_kmer_value_counts[0] + top_kmer_frac = top_kmer_count/num_seqs + if (top_kmer == 'TACG') & (top_kmer_frac > cutoff): + print('Amplicon type: 16S/515f (%s%% of sequences start with %s)' % + (round(top_kmer_frac*100, 1), top_kmer)) + elif (top_kmer == 'NNNN') & (top_kmer_frac > cutoff): + print('Amplicon type: 18S/Euk1391f (%s%% of sequences start with %s)' % + (round(top_kmer_frac*100, 1), top_kmer)) + elif (top_kmer == 'NNNN') & (top_kmer_frac > cutoff): + print('Amplicon type: ITS/ITS1f (%s%% of sequences start with %s)' % + (round(top_kmer_frac*100, 1), top_kmer)) + else: + print('Could not determine amplicon type'), + print('(most frequent starting k-mer was %s with %s%%)' % + (top_kmer, round(top_kmer_frac*100, 1))) + +if __name__ == '__main__': + verify_amplicon_type() From 58ec12fdfc7893e1fbe8f08c6fe2dabd3bff3bf4 Mon Sep 17 00:00:00 2001 From: Luke Thompson Date: Tue, 6 Dec 2016 13:09:48 -0800 Subject: [PATCH 4/7] added support for 18S and ITS, now explicitly calling k-mers tetramers --- scripts/verify_amplicon_type.py | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/scripts/verify_amplicon_type.py b/scripts/verify_amplicon_type.py index eea9158..9949d98 100755 --- a/scripts/verify_amplicon_type.py +++ b/scripts/verify_amplicon_type.py @@ -13,7 +13,7 @@ import pandas as pd def count_starting_kmers(input_fasta_fp, num_seqs, seed): - """Generate value_counts dataframe of starting k-mers for random subsample + """Generate value_counts dataframe of 5' tetramers for random subsample of a fasta file""" kmer_length = 4 if seed: @@ -43,40 +43,43 @@ def count_starting_kmers(input_fasta_fp, num_seqs, seed): help="Number of sequences to randomly subsample [default: 10000]") @click.option('--cutoff', '-c', required=False, type=float, default=0.5, help="Minimum fraction of sequences required to match " - "a diagnostic k-mer [default: 0.5]") + "a diagnostic 5' tetramer [default: 0.5]") @click.option('--seed', '-s', required=False, type=int, help="Random number seed [default: None]") def verify_amplicon_type(input_fasta_fp, num_seqs, cutoff, seed): """Determine the most likely amplicon type of a fasta file based on the - first few nucleotides. + first four nucleotides. - The most frequent starting k-mer in a random subsample of sequences must + The most frequent 5' tetramer in a random subsample of sequences must match, above a given cutoff fraction of sequences, one of the following - diagnostic k-mers: + diagnostic tetramers: - k-mer\tAmplicon\tForward primer + Tetramer\tAmplicon\tForward primer TACG\t16S rRNA\t515f - NNNN\t18S rRNA\tEuk1391f - NNNN\tITS rRNA\tITS1f + GTAG\tITS rRNA\tITS1f + GCT[AC]\t18S rRNA\tEuk1391f """ starting_kmer_value_counts = count_starting_kmers(input_fasta_fp, num_seqs, seed) top_kmer = starting_kmer_value_counts.index[0] top_kmer_count = starting_kmer_value_counts[0] top_kmer_frac = top_kmer_count/num_seqs + second_kmer = starting_kmer_value_counts.index[1] + second_kmer_count = starting_kmer_value_counts[1] + top2_kmer_frac = (top_kmer_count+second_kmer_count)/num_seqs if (top_kmer == 'TACG') & (top_kmer_frac > cutoff): print('Amplicon type: 16S/515f (%s%% of sequences start with %s)' % (round(top_kmer_frac*100, 1), top_kmer)) - elif (top_kmer == 'NNNN') & (top_kmer_frac > cutoff): - print('Amplicon type: 18S/Euk1391f (%s%% of sequences start with %s)' % - (round(top_kmer_frac*100, 1), top_kmer)) - elif (top_kmer == 'NNNN') & (top_kmer_frac > cutoff): + elif (top_kmer == 'GTAG') & (top_kmer_frac > cutoff): print('Amplicon type: ITS/ITS1f (%s%% of sequences start with %s)' % (round(top_kmer_frac*100, 1), top_kmer)) + elif (top_kmer in ['GCTA', 'GCTC']) & (second_kmer in ['GCTA', 'GCTC']) & (top2_kmer_frac > cutoff): + print('Amplicon type: 18S/Euk1391f (%s%% of sequences start with %s or %s)' % + (round(top2_kmer_frac*100, 1), top_kmer, second_kmer)) else: print('Could not determine amplicon type'), - print('(most frequent starting k-mer was %s with %s%%)' % + print('(most frequent starting tetramer was %s with %s%%)' % (top_kmer, round(top_kmer_frac*100, 1))) if __name__ == '__main__': From 887ec13a8d1509359d790d8fbbc396bb131afda3 Mon Sep 17 00:00:00 2001 From: Luke Thompson Date: Tue, 6 Dec 2016 13:27:52 -0800 Subject: [PATCH 5/7] fixed typo --- scripts/summarize_otu_distributions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/summarize_otu_distributions.py b/scripts/summarize_otu_distributions.py index 16acc88..0176e94 100755 --- a/scripts/summarize_otu_distributions.py +++ b/scripts/summarize_otu_distributions.py @@ -24,7 +24,7 @@ help="Output OTU summary (.tsv)") def make_otu_summary(input_biom_fp, output_summary_fp): - """Summarize distribution information about each OTU (sequnece) in a Deblur + """Summarize distribution information about each OTU (sequence) in a Deblur biom table. Input biom table must be rarefied for results to be meaningful.""" From 47e5f807bee8aa0d45042334442d2bdac55d879a Mon Sep 17 00:00:00 2001 From: Luke Thompson Date: Wed, 7 Dec 2016 10:25:42 -0800 Subject: [PATCH 6/7] deleting summarize_otu_distributions.py and adding to biom-format --- scripts/summarize_otu_distributions.py | 95 -------------------------- 1 file changed, 95 deletions(-) delete mode 100755 scripts/summarize_otu_distributions.py diff --git a/scripts/summarize_otu_distributions.py b/scripts/summarize_otu_distributions.py deleted file mode 100755 index 0176e94..0000000 --- a/scripts/summarize_otu_distributions.py +++ /dev/null @@ -1,95 +0,0 @@ -#!/usr/bin/env python - -# ---------------------------------------------------------------------------- -# Copyright (c) 2016, The Deblur Development Team. -# -# Distributed under the terms of the BSD 3-clause License. -# -# The full license is in the file LICENSE, distributed with this software. -# ---------------------------------------------------------------------------- - -import click -import pandas as pd -import numpy as np -import biom - -@click.command() -@click.option('--input_biom_fp', '-i', required=True, - type=click.Path(resolve_path=True, readable=True, exists=True, - file_okay=True), - help="Input rarefied OTU table (.biom)") -@click.option('--output_summary_fp', '-o', required=True, - type=click.Path(resolve_path=True, readable=True, exists=False, - file_okay=True), - help="Output OTU summary (.tsv)") - -def make_otu_summary(input_biom_fp, output_summary_fp): - """Summarize distribution information about each OTU (sequence) in a Deblur - biom table. - - Input biom table must be rarefied for results to be meaningful.""" - - # Read OTU table (must be rarefied) - table = biom.load_table(input_biom_fp) - num_samples = len(table.ids(axis='sample')) - - # Get arrays of sample IDs and OTUs (sequences), dicts per OTU of total - # observations, number of samples, list of samples, and taxonomy - otu_total_obs = {} - otu_num_samples = {} - otu_list_samples = {} - samples = table.ids(axis='sample') - otus = table.ids(axis='observation') - for idx, cdat in enumerate(table.iter_data(axis='observation')): - otu_total_obs[otus[idx]] = np.sum(cdat) - otu_num_samples[otus[idx]] = np.sum(cdat > 0) - otu_list_samples[otus[idx]] = samples[np.where(cdat > 0)[0]] - otu_tax = {i: '; '.join(md['taxonomy']) for v, i, md in table.iter( - axis='observation')} - - # Create Pandas DataFrame of index, sequence, total_obs, num_samples, - # list_samples - df_otus = pd.DataFrame(index=np.arange(len(otus))) - df_otus['sequence'] = [otus[i] for i in df_otus.index] - df_otus['total_obs'] = [otu_total_obs[seq] for seq in df_otus.sequence] - df_otus['num_samples'] = [otu_num_samples[seq] for seq in df_otus.sequence] - df_otus['list_samples'] = \ - [','.join(otu_list_samples[seq]) for seq in df_otus.sequence] - df_otus['taxonomy'] = [otu_tax[seq] for seq in df_otus.sequence] - - # Add columns for total_obs_rank and num_samples_rank - # sort by total_obs, reset index, rename index to total_obs - df_otus = df_otus.sort_values('total_obs', ascending=False).reset_index( - drop=True) - df_otus.index.rename('total_obs_rank', inplace=True) - # sort by num_samples, reset index, rename index to total_obs - df_otus = df_otus.sort_values('num_samples', ascending=False).reset_index( - drop=False) - df_otus.index.rename('num_samples_rank', inplace=True) - # keep sorted by num_samples, reset index - df_otus = df_otus.reset_index(drop=False) - - # Add columns for total_obs_percent and num_samples_percent - df_otus['total_obs_frac'] = df_otus['total_obs']/df_otus['total_obs'].sum() - df_otus['num_samples_frac'] = df_otus['num_samples'] / num_samples - - # Add 1 to the rank so they are true rank and not python-style - df_otus['num_samples_rank'] = df_otus['num_samples_rank'] + 1 - df_otus['total_obs_rank'] = df_otus['total_obs_rank'] + 1 - - # Reorder columns - df_otus = df_otus[['sequence', - 'num_samples', - 'num_samples_frac', - 'num_samples_rank', - 'total_obs', - 'total_obs_rank', - 'total_obs_frac', - 'taxonomy', - 'list_samples']] - - # Write to tsv - df_otus.to_csv(output_summary_fp, sep='\t') - -if __name__ == '__main__': - make_otu_summary() From 3ba33bffe235d787864d0a1ee43657e79faf0200 Mon Sep 17 00:00:00 2001 From: Luke Thompson Date: Wed, 14 Dec 2016 17:21:54 -0800 Subject: [PATCH 7/7] updated 18S tetramers after positive filtering with Silva 18S --- scripts/verify_amplicon_type.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/scripts/verify_amplicon_type.py b/scripts/verify_amplicon_type.py index 9949d98..6bfa174 100755 --- a/scripts/verify_amplicon_type.py +++ b/scripts/verify_amplicon_type.py @@ -64,19 +64,24 @@ def verify_amplicon_type(input_fasta_fp, num_seqs, cutoff, seed): seed) top_kmer = starting_kmer_value_counts.index[0] top_kmer_count = starting_kmer_value_counts[0] - top_kmer_frac = top_kmer_count/num_seqs second_kmer = starting_kmer_value_counts.index[1] second_kmer_count = starting_kmer_value_counts[1] + third_kmer = starting_kmer_value_counts.index[2] + third_kmer_count = starting_kmer_value_counts[2] + top_kmer_frac = top_kmer_count/num_seqs top2_kmer_frac = (top_kmer_count+second_kmer_count)/num_seqs + top3_kmer_frac = (top_kmer_count+second_kmer_count+third_kmer_count)/num_seqs if (top_kmer == 'TACG') & (top_kmer_frac > cutoff): print('Amplicon type: 16S/515f (%s%% of sequences start with %s)' % (round(top_kmer_frac*100, 1), top_kmer)) elif (top_kmer == 'GTAG') & (top_kmer_frac > cutoff): print('Amplicon type: ITS/ITS1f (%s%% of sequences start with %s)' % (round(top_kmer_frac*100, 1), top_kmer)) - elif (top_kmer in ['GCTA', 'GCTC']) & (second_kmer in ['GCTA', 'GCTC']) & (top2_kmer_frac > cutoff): - print('Amplicon type: 18S/Euk1391f (%s%% of sequences start with %s or %s)' % - (round(top2_kmer_frac*100, 1), top_kmer, second_kmer)) + elif (top_kmer in ['GCTA', 'GCTC', 'ACAC']) & (second_kmer in ['GCTA', 'GCTC', + 'ACAC']) & (third_kmer in ['GCTA', 'GCTC', 'ACAC']) & ( + top3_kmer_frac > cutoff): + print('Amplicon type: 18S/Euk1391f (%s%% of sequences start with %s, %s, or %s)' % + (round(top3_kmer_frac*100, 1), top_kmer, second_kmer, third_kmer)) else: print('Could not determine amplicon type'), print('(most frequent starting tetramer was %s with %s%%)' %