Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Utility scripts for Deblur #119

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 95 additions & 0 deletions scripts/summarize_otu_distributions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#!/usr/bin/env python

# ----------------------------------------------------------------------------
# Copyright (c) 2016, The Deblur Development Team.
#
# Distributed under the terms of the BSD 3-clause License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

import click
import pandas as pd
import numpy as np
import biom

@click.command()
@click.option('--input_biom_fp', '-i', required=True,
type=click.Path(resolve_path=True, readable=True, exists=True,
file_okay=True),
help="Input rarefied OTU table (.biom)")
@click.option('--output_summary_fp', '-o', required=True,
type=click.Path(resolve_path=True, readable=True, exists=False,
file_okay=True),
help="Output OTU summary (.tsv)")

def make_otu_summary(input_biom_fp, output_summary_fp):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

wouldn't it make sense for this to be part of biom?

"""Summarize distribution information about each OTU (sequence) in a Deblur
biom table.

Input biom table must be rarefied for results to be meaningful."""

# Read OTU table (must be rarefied)
table = biom.load_table(input_biom_fp)
num_samples = len(table.ids(axis='sample'))

# Get arrays of sample IDs and OTUs (sequences), dicts per OTU of total
# observations, number of samples, list of samples, and taxonomy
otu_total_obs = {}
otu_num_samples = {}
otu_list_samples = {}
samples = table.ids(axis='sample')
otus = table.ids(axis='observation')
for idx, cdat in enumerate(table.iter_data(axis='observation')):
otu_total_obs[otus[idx]] = np.sum(cdat)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this forloop can be replaced with calls to Table.sum

otu_num_samples[otus[idx]] = np.sum(cdat > 0)
otu_list_samples[otus[idx]] = samples[np.where(cdat > 0)[0]]
otu_tax = {i: '; '.join(md['taxonomy']) for v, i, md in table.iter(
axis='observation')}

# Create Pandas DataFrame of index, sequence, total_obs, num_samples,
# list_samples
df_otus = pd.DataFrame(index=np.arange(len(otus)))
df_otus['sequence'] = [otus[i] for i in df_otus.index]
df_otus['total_obs'] = [otu_total_obs[seq] for seq in df_otus.sequence]
df_otus['num_samples'] = [otu_num_samples[seq] for seq in df_otus.sequence]
df_otus['list_samples'] = \
[','.join(otu_list_samples[seq]) for seq in df_otus.sequence]
df_otus['taxonomy'] = [otu_tax[seq] for seq in df_otus.sequence]

# Add columns for total_obs_rank and num_samples_rank
# sort by total_obs, reset index, rename index to total_obs
df_otus = df_otus.sort_values('total_obs', ascending=False).reset_index(
drop=True)
df_otus.index.rename('total_obs_rank', inplace=True)
# sort by num_samples, reset index, rename index to total_obs
df_otus = df_otus.sort_values('num_samples', ascending=False).reset_index(
drop=False)
df_otus.index.rename('num_samples_rank', inplace=True)
# keep sorted by num_samples, reset index
df_otus = df_otus.reset_index(drop=False)

# Add columns for total_obs_percent and num_samples_percent
df_otus['total_obs_frac'] = df_otus['total_obs']/df_otus['total_obs'].sum()
df_otus['num_samples_frac'] = df_otus['num_samples'] / num_samples

# Add 1 to the rank so they are true rank and not python-style
df_otus['num_samples_rank'] = df_otus['num_samples_rank'] + 1
df_otus['total_obs_rank'] = df_otus['total_obs_rank'] + 1

# Reorder columns
df_otus = df_otus[['sequence',
'num_samples',
'num_samples_frac',
'num_samples_rank',
'total_obs',
'total_obs_rank',
'total_obs_frac',
'taxonomy',
'list_samples']]

# Write to tsv
df_otus.to_csv(output_summary_fp, sep='\t')

if __name__ == '__main__':
make_otu_summary()
86 changes: 86 additions & 0 deletions scripts/verify_amplicon_type.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#!/usr/bin/env python

# ----------------------------------------------------------------------------
# Copyright (c) 2016, The Deblur Development Team.
#
# Distributed under the terms of the BSD 3-clause License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

import click
import numpy as np
import pandas as pd

def count_starting_kmers(input_fasta_fp, num_seqs, seed):
"""Generate value_counts dataframe of 5' tetramers for random subsample
of a fasta file"""
kmer_length = 4
if seed:
np.random.seed(seed)
starting_kmers = []
with open(input_fasta_fp) as handle:
lines = pd.Series(handle.readlines())
num_lines = len(lines)
if num_lines/2 < num_seqs:
rand_line_nos = np.random.choice(np.arange(1,num_lines,2),
size=num_seqs, replace=True)
else:
rand_line_nos = np.random.choice(np.arange(1,num_lines,2),
size=num_seqs, replace=False)
rand_lines = lines[rand_line_nos]
for sequence in rand_lines:
starting_kmers.append(sequence[:kmer_length])
starting_kmer_value_counts = pd.Series(starting_kmers).value_counts()
return(starting_kmer_value_counts)

@click.command()
@click.option('--input_fasta_fp', '-f', required=True,
type=click.Path(resolve_path=True, readable=True, exists=True,
file_okay=True),
help="Input fasta file from Deblur (.fa, .fna, .fasta)")
@click.option('--num_seqs', '-n', required=False, type=int, default=10000,
help="Number of sequences to randomly subsample [default: 10000]")
@click.option('--cutoff', '-c', required=False, type=float, default=0.5,
help="Minimum fraction of sequences required to match "
"a diagnostic 5' tetramer [default: 0.5]")
@click.option('--seed', '-s', required=False, type=int,
help="Random number seed [default: None]")

def verify_amplicon_type(input_fasta_fp, num_seqs, cutoff, seed):
"""Determine the most likely amplicon type of a fasta file based on the
first four nucleotides.

The most frequent 5' tetramer in a random subsample of sequences must
match, above a given cutoff fraction of sequences, one of the following
diagnostic tetramers:

Tetramer\tAmplicon\tForward primer
TACG\t16S rRNA\t515f
GTAG\tITS rRNA\tITS1f
GCT[AC]\t18S rRNA\tEuk1391f
"""
starting_kmer_value_counts = count_starting_kmers(input_fasta_fp, num_seqs,
seed)
top_kmer = starting_kmer_value_counts.index[0]
top_kmer_count = starting_kmer_value_counts[0]
top_kmer_frac = top_kmer_count/num_seqs
second_kmer = starting_kmer_value_counts.index[1]
second_kmer_count = starting_kmer_value_counts[1]
top2_kmer_frac = (top_kmer_count+second_kmer_count)/num_seqs
if (top_kmer == 'TACG') & (top_kmer_frac > cutoff):
print('Amplicon type: 16S/515f (%s%% of sequences start with %s)' %
(round(top_kmer_frac*100, 1), top_kmer))
elif (top_kmer == 'GTAG') & (top_kmer_frac > cutoff):
print('Amplicon type: ITS/ITS1f (%s%% of sequences start with %s)' %
(round(top_kmer_frac*100, 1), top_kmer))
elif (top_kmer in ['GCTA', 'GCTC']) & (second_kmer in ['GCTA', 'GCTC']) & (top2_kmer_frac > cutoff):
print('Amplicon type: 18S/Euk1391f (%s%% of sequences start with %s or %s)' %
(round(top2_kmer_frac*100, 1), top_kmer, second_kmer))
else:
print('Could not determine amplicon type'),
print('(most frequent starting tetramer was %s with %s%%)' %
(top_kmer, round(top_kmer_frac*100, 1)))

if __name__ == '__main__':
verify_amplicon_type()