biocore · cuttlefishh · Dec 6, 2016 · Dec 6, 2016 · Dec 6, 2016 · Dec 6, 2016
diff --git a/scripts/summarize_otu_distributions.py b/scripts/summarize_otu_distributions.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python
+
+# ----------------------------------------------------------------------------
+# Copyright (c) 2016, The Deblur Development Team.
+#
+# Distributed under the terms of the BSD 3-clause License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+
+import click
+import pandas as pd
+import numpy as np
+import biom
+
+@click.command()
+@click.option('--input_biom_fp', '-i', required=True,
+              type=click.Path(resolve_path=True, readable=True, exists=True,
+              file_okay=True), 
+              help="Input rarefied OTU table (.biom)")
+@click.option('--output_summary_fp', '-o', required=True,
+              type=click.Path(resolve_path=True, readable=True, exists=False,
+              file_okay=True), 
+              help="Output OTU summary (.tsv)")
+
+def make_otu_summary(input_biom_fp, output_summary_fp):
+    """Summarize distribution information about each OTU (sequence) in a Deblur
+    biom table.
+
+    Input biom table must be rarefied for results to be meaningful."""
+
+    # Read OTU table (must be rarefied)
+    table = biom.load_table(input_biom_fp)
+    num_samples = len(table.ids(axis='sample'))
+
+    # Get arrays of sample IDs and OTUs (sequences), dicts per OTU of total 
+    # observations, number of samples, list of samples, and taxonomy
+    otu_total_obs = {}
+    otu_num_samples = {}
+    otu_list_samples = {}
+    samples = table.ids(axis='sample')
+    otus = table.ids(axis='observation')
+    for idx, cdat in enumerate(table.iter_data(axis='observation')):
+        otu_total_obs[otus[idx]] = np.sum(cdat)
+        otu_num_samples[otus[idx]] = np.sum(cdat > 0)
+        otu_list_samples[otus[idx]] = samples[np.where(cdat > 0)[0]]
+    otu_tax = {i: '; '.join(md['taxonomy']) for v, i, md in table.iter(
+      axis='observation')}
+
+    # Create Pandas DataFrame of index, sequence, total_obs, num_samples,
+    # list_samples
+    df_otus = pd.DataFrame(index=np.arange(len(otus)))
+    df_otus['sequence'] = [otus[i] for i in df_otus.index]
+    df_otus['total_obs'] = [otu_total_obs[seq] for seq in df_otus.sequence]
+    df_otus['num_samples'] = [otu_num_samples[seq] for seq in df_otus.sequence]
+    df_otus['list_samples'] = \
+      [','.join(otu_list_samples[seq]) for seq in df_otus.sequence]
+    df_otus['taxonomy'] = [otu_tax[seq] for seq in df_otus.sequence]
+
+    # Add columns for total_obs_rank and num_samples_rank
+    # sort by total_obs, reset index, rename index to total_obs
+    df_otus = df_otus.sort_values('total_obs', ascending=False).reset_index(
+      drop=True)
+    df_otus.index.rename('total_obs_rank', inplace=True)
+    # sort by num_samples, reset index, rename index to total_obs
+    df_otus = df_otus.sort_values('num_samples', ascending=False).reset_index(
+      drop=False)
+    df_otus.index.rename('num_samples_rank', inplace=True)
+    # keep sorted by num_samples, reset index
+    df_otus = df_otus.reset_index(drop=False)
+
+    # Add columns for total_obs_percent and num_samples_percent
+    df_otus['total_obs_frac'] = df_otus['total_obs']/df_otus['total_obs'].sum()
+    df_otus['num_samples_frac'] = df_otus['num_samples'] / num_samples
+
+    # Add 1 to the rank so they are true rank and not python-style
+    df_otus['num_samples_rank'] = df_otus['num_samples_rank'] + 1
+    df_otus['total_obs_rank'] = df_otus['total_obs_rank'] + 1
+
+    # Reorder columns
+    df_otus = df_otus[['sequence', 
+                       'num_samples',
+                       'num_samples_frac',
+                       'num_samples_rank',
+                       'total_obs', 
+                       'total_obs_rank', 
+                       'total_obs_frac',
+                       'taxonomy',
+                       'list_samples']]
+
+    # Write to tsv
+    df_otus.to_csv(output_summary_fp, sep='\t')
+
+if __name__ == '__main__':
+    make_otu_summary()
diff --git a/scripts/verify_amplicon_type.py b/scripts/verify_amplicon_type.py
@@ -0,0 +1,86 @@
+#!/usr/bin/env python
+
+# ----------------------------------------------------------------------------
+# Copyright (c) 2016, The Deblur Development Team.
+#
+# Distributed under the terms of the BSD 3-clause License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+
+import click
+import numpy as np
+import pandas as pd
+
+def count_starting_kmers(input_fasta_fp, num_seqs, seed):
+    """Generate value_counts dataframe of 5' tetramers for random subsample
+    of a fasta file"""
+    kmer_length = 4
+    if seed:
+        np.random.seed(seed)
+    starting_kmers = []
+    with open(input_fasta_fp) as handle:
+        lines = pd.Series(handle.readlines())
+        num_lines = len(lines)
+        if num_lines/2 < num_seqs:
+            rand_line_nos = np.random.choice(np.arange(1,num_lines,2), 
+                                             size=num_seqs, replace=True)
+        else:
+            rand_line_nos = np.random.choice(np.arange(1,num_lines,2), 
+                                             size=num_seqs, replace=False)
+        rand_lines = lines[rand_line_nos]
+    for sequence in rand_lines:
+        starting_kmers.append(sequence[:kmer_length])
+    starting_kmer_value_counts = pd.Series(starting_kmers).value_counts()
+    return(starting_kmer_value_counts)
+
+@click.command()
+@click.option('--input_fasta_fp', '-f', required=True,
+              type=click.Path(resolve_path=True, readable=True, exists=True,
+              file_okay=True), 
+              help="Input fasta file from Deblur (.fa, .fna, .fasta)")
+@click.option('--num_seqs', '-n', required=False, type=int, default=10000,
+              help="Number of sequences to randomly subsample [default: 10000]")
+@click.option('--cutoff', '-c', required=False, type=float, default=0.5,
+              help="Minimum fraction of sequences required to match "
+                   "a diagnostic 5' tetramer [default: 0.5]")
+@click.option('--seed', '-s', required=False, type=int, 
+              help="Random number seed [default: None]")
+
+def verify_amplicon_type(input_fasta_fp, num_seqs, cutoff, seed):
+    """Determine the most likely amplicon type of a fasta file based on the
+    first four nucleotides.
+
+    The most frequent 5' tetramer in a random subsample of sequences must 
+    match, above a given cutoff fraction of sequences, one of the following  
+    diagnostic tetramers:
+
+      Tetramer\tAmplicon\tForward primer                             
+      TACG\t16S rRNA\t515f                                       
+      GTAG\tITS rRNA\tITS1f                                       
+      GCT[AC]\t18S rRNA\tEuk1391f                                    
+    """
+    starting_kmer_value_counts = count_starting_kmers(input_fasta_fp, num_seqs, 
+      seed)
+    top_kmer = starting_kmer_value_counts.index[0]
+    top_kmer_count = starting_kmer_value_counts[0]
+    top_kmer_frac = top_kmer_count/num_seqs
+    second_kmer = starting_kmer_value_counts.index[1]
+    second_kmer_count = starting_kmer_value_counts[1]
+    top2_kmer_frac = (top_kmer_count+second_kmer_count)/num_seqs
+    if (top_kmer == 'TACG') & (top_kmer_frac > cutoff):
+        print('Amplicon type: 16S/515f (%s%% of sequences start with %s)' %
+              (round(top_kmer_frac*100, 1), top_kmer))
+    elif (top_kmer == 'GTAG') & (top_kmer_frac > cutoff):
+        print('Amplicon type: ITS/ITS1f (%s%% of sequences start with %s)' %
+              (round(top_kmer_frac*100, 1), top_kmer))
+    elif (top_kmer in ['GCTA', 'GCTC']) & (second_kmer in ['GCTA', 'GCTC']) & (top2_kmer_frac > cutoff):
+        print('Amplicon type: 18S/Euk1391f (%s%% of sequences start with %s or %s)' %
+              (round(top2_kmer_frac*100, 1), top_kmer, second_kmer))
+    else:
+        print('Could not determine amplicon type'),
+        print('(most frequent starting tetramer was %s with %s%%)' %
+              (top_kmer, round(top_kmer_frac*100, 1)))
+
+if __name__ == '__main__':
+    verify_amplicon_type()