Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updated diversity.py and added network_plots_gephi.py #53

Merged
merged 6 commits into from
Jun 28, 2016
82 changes: 58 additions & 24 deletions bin/diversity.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@
Calculate and plot alpha diversity for two or more sample categories.
"""
import sys
import argparse
import csv
import argparse
import os.path as osp
from itertools import izip_longest
from collections import defaultdict
from phylotoast import graph_util as gu, util as putil
importerrors = []
try:
Expand All @@ -21,7 +23,6 @@
except ImportError as ie:
importerrors.append(ie)
try:
# matplotlib.use("Agg") # for use on headless server
from matplotlib import pyplot as plt, gridspec
except ImportError as ie:
importerrors.append(ie)
Expand All @@ -42,17 +43,24 @@ def calc_diversity(method, parsed_mapf, biom, cats, cats_index):
for sid, sample_counts in gather_samples(biom).items():
sample_ids.append(sid)
if sid in parsed_mapf:
counts[parsed_mapf[sid][cats_index]].append(sample_counts)
counts[parsed_mapf[sid][cats_index]].append((sid, sample_counts))

div_calc = {cat: [method(count) for count in counts] for cat, counts in counts.items()}
div_calc = {cat: {count[0]: method(count[1]) for count in counts}
for cat, counts in counts.items()}

return div_calc, sample_ids


def print_MannWhitneyU(x, y=None):
def print_MannWhitneyU(div_calc):
"""
Compute the Mann-Whitney U test for unequal group sample sizes.
"""
try:
x = div_calc.values()[0].values()
y = div_calc.values()[1].values()
except:
return "Error setting up input arrays for Mann-Whitney U Test. Skipping "\
"significance testing."
T, p = stats.mannwhitneyu(x, y)
print "\nMann-Whitney U test statistic:", T
print "Two-tailed p-value: {}".format(2 * p)
Expand All @@ -63,7 +71,16 @@ def print_KruskalWallisH(div_calc):
Compute the Kruskal-Wallis H-test for independent samples. A typical rule is that
each group must have at least 5 measurements.
"""
h, p = stats.kruskal(*div_calc)
calc = defaultdict(list)
try:
for k1, v1 in div_calc.iteritems():
for k2, v2 in v1.iteritems():
calc[k1].append(v2)
print calc
except:
return "Error setting up input arrays for Kruskal-Wallis H-Test. Skipping "\
"significance testing."
h, p = stats.kruskal(*calc.values())
print "\nKruskal-Wallis H-test statistic for {} groups: {}".format(str(len(div_calc)), h)
print "p-value: {}".format(p)

Expand All @@ -76,9 +93,9 @@ def plot_group_diversity(diversities, grp_colors, title, diversity_type, out_dir
ax_div = fig_div.add_subplot(grid[0, 0])

for i, grp in enumerate(diversities):
gu.plot_kde(diversities[grp], ax_div, title, grp_colors[grp])
gu.plot_kde(diversities[grp].values(), ax_div, title, grp_colors[grp])

ax_div.set_xlabel(diversity_type)
ax_div.set_xlabel(diversity_type.title())
ax_div.set_ylabel("Density")
ax_div.legend([plt.Rectangle((0, 0), 1, 1, fc=color) for color in grp_colors.values()],
grp_colors.keys(), loc="best")
Expand All @@ -97,10 +114,10 @@ def write_diversity_metrics(data, sample_ids, fp=None):

with open(fp, "w") as outf:
out = csv.writer(outf, delimiter="\t")
out.writerow(["calculation", "group"])
for group in data:
for entry in data[group]:
out.writerow([entry, group])
out.writerow(["SampleID", "Group", "Calculation"])
for group, d in data.iteritems():
for sid, value in d.iteritems():
out.writerow([sid, group, value])


def handle_program_options():
Expand All @@ -123,6 +140,10 @@ def handle_program_options():
The full list of metrics is available at:\
http://scikit-bio.org/docs/latest/generated/skbio.diversity.alpha.html.\
Beta diversity metrics will be supported in the future.")
parser.add_argument("--x_label", default=[None], nargs="+",
help="The name of the diversity metric to be displayed on the\
plot as the X-axis label. If multiple metrics are specified,\
then multiple entries for the X-axis label should be given.")
parser.add_argument("--color_by",
help="A column name in the mapping file containing\
hexadecimal (#FF0000) color values that will\
Expand All @@ -132,26 +153,35 @@ def handle_program_options():
help="A descriptive title that will appear at the top \
of the output plot. Surround with quotes if there are\
spaces in the title.")
parser.add_argument("--x_label", default="diversity", nargs="+",
help="The name of the diversity metric to be displayed on the\
plot as the X-axis label. If multiple metrics are specified,\
then multiple entries for the X-axis label should be given.")
parser.add_argument("-o", "--out_dir", default=".",
help="The directory plots will be saved to.")
parser.add_argument("--image_type", default="png",
help="The type of image to save: PNG, SVG, PDF, EPS, etc...")
help="The type of image to save: png, svg, pdf, eps, etc...")
parser.add_argument("--save_calculations",
help="Path and name of text file to store the calculated "
"diversity metrics.")
parser.add_argument("--show_significance", action="store_false", help="Display "
"significance testing results. The results will be shown by "
"default.")
parser.add_argument("--show_available_metrics", action="store_true",
help="Supply this parameter to see which alpha diversity metrics "
" are available for usage. No calculations will be performed"
" if this parameter is provided.")
return parser.parse_args()


def main():
args = handle_program_options()

metrics = [m for m in alpha.__all__ if "_ci" not in m]
try:
metrics.remove("faith_pd")
except ValueError:
pass
if args.show_available_metrics:
print "\nAvailable alpha diversity metrics:"
return "\n".join(metrics)

try:
with open(args.map_file):
pass
Expand Down Expand Up @@ -179,23 +209,27 @@ def main():
colors = putil.color_mapping(sample_map, header, args.category, args.color_by)

# Perform diversity calculations and density plotting
for method, x_label in zip(args.diversity, args.x_label):
for method, x_label in izip_longest(args.diversity, args.x_label):
if x_label is None:
x_label = method.title()
if method not in alpha.__all__:
sys.exit("ERROR: Diversity metric not found: " + method)
sys.exit("ERROR: Diversity metric not found: {}.".format(method))
elif method in alpha.__all__ and method not in metrics:
sys.exit("Currently, PhyloToAST does not support {} metric.".format(method))
metric = eval("alpha."+method)
div_calc, sample_ids = calc_diversity(metric, sample_map, biom_tbl,
cat_vals, cat_idx)

plot_group_diversity(div_calc, colors, plot_title, x_label,
args.out_dir, args.image_type)
plot_group_diversity(div_calc, colors, plot_title, x_label, args.out_dir,
args.image_type)

# calculate and print significance testing results
if args.show_significance:
print "Diversity significance testing: {}".format(x_label)
if len(cat_vals) == 2:
print_MannWhitneyU(*div_calc.values())
print_MannWhitneyU(div_calc)
elif len(cat_vals) > 2:
print_KruskalWallisH(div_calc.values())
print_KruskalWallisH(div_calc)
print
else:
continue
Expand All @@ -205,4 +239,4 @@ def main():


if __name__ == "__main__":
main()
sys.exit(main())
155 changes: 155 additions & 0 deletions bin/network_plots_gephi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
#!/usr/bin/env python
"""
Abstract: Generate network plots using NetworkX and Gephi.
Author: Akshay Paropkari
Date: 02/10/2016
"""
import sys
import argparse
from collections import defaultdict
importerrors = []
try:
import biom
except ImportError as ie:
importerrors.append(ie)
try:
import pandas as pd
except ImportError as ie:
importerrors.append(ie)
try:
import networkx as nx
except ImportError:
importerrors.append(ie)
try:
from phylotoast import biom_calc as bc, otu_calc as oc
except ImportError:
importerrors.append(ie)
if len(importerrors) > 0:
for err in importerrors:
print "Import Error: {}".format(err)
sys.exit()


def get_relative_abundance(biomfile):
"""
Return arcsine transformed relative abundance from a BIOM format file.

:type biomfile: BIOM format file
:param biomfile: BIOM format file used to obtain relative abundances for each OTU in
a SampleID, which are used as node sizes in network plots.

:type return: Dictionary of dictionaries.
:return: Dictionary keyed on SampleID whose value is a dictionarykeyed on OTU Name
whose value is the arc sine tranfsormed relative abundance value for that
SampleID-OTU Name pair.
"""
biomf = biom.load_table(biomfile)
norm_biomf = biomf.norm(inplace=False)
rel_abd = {}
for sid in norm_biomf.ids():
rel_abd[sid] = {}
for otuid in norm_biomf.ids("observation"):
otuname = oc.otu_name(norm_biomf.metadata(otuid, axis="observation")["taxonomy"])
otuname = " ".join(otuname.split("_"))
abd = norm_biomf.get_value_by_ids(otuid, sid)
rel_abd[sid][otuname] = abd
ast_rel_abd = bc.arcsine_sqrt_transform(rel_abd)
return ast_rel_abd


def handle_program_options():
"""Parses the given options passed in at the command line."""
parser = argparse.ArgumentParser(description="Create network plots based "
"on correlation matrix.")
parser.add_argument("biom_file", help="Biom file OTU table.")
parser.add_argument("mapping_file", help="Mapping file for reading "
"sampleIDs and their groups.")
parser.add_argument("condition_column", help="Column name in mapping file "
"denoting the categories.")
parser.add_argument("in_corr_mat", help="Correlation matrix file. The "
"format for the tab-separated file should be: "
"Category -> Variable -> by Variable -> Correlation")
parser.add_argument("cat_name", help="Category to be plotted.")
parser.add_argument("-go", "--gexf_out",
help="Graph information written to this Graph Exchange"
" XML Format file. This file can be input to Gephi.")
parser.add_argument("-fp", "--fil_pct", type=float, default=0.75,
help="Specify the minimum value of correlation "
"strength to display. By default, all correlations "
">=0.75 will be shown.")
parser.add_argument("-w", "--stats_out_fnh",
help="Write out graph statistics.")
return parser.parse_args()


def main():
args = handle_program_options()

# Error handling input files
try:
relative_abundance = get_relative_abundance(args.biom_file)
except OSError as ose:
sys.exit("\nError in BIOM file path: {}\n".format(ose))
try:
# Read in the mapping file
mapf = pd.read_csv(args.mapping_file, sep="\t")
except IOError as ioe:
sys.exit("\nError in mapping file path: {}\n".format(ioe))
try:
# Read the correlation data
corr_data = pd.read_csv(args.in_corr_mat, sep="\t")
except IOError as ioe:
sys.exit("\nError in correlation matrix file path: {}\n".format(ioe))

# get category-wise nodes
cat_sids = defaultdict(list)
for rows in mapf.iterrows():
row = rows[1]
cat_sids[row[args.condition_column]].append(row["#SampleID"])

# initialize graph
G = nx.Graph()

# get category-wise graph edge data and edge colors
for rows in corr_data.iterrows():
row = rows[1]
if row["Category"] == args.cat_name and abs(row["Correlation"]) > args.fil_pct:
if row["Correlation"] > 0:
G.add_weighted_edges_from(
[(" ".join(row["Variable"].split("_")),
" ".join(row["by Variable"].split("_")),
row["Correlation"])], color="#00CC00") # Green
else:
G.add_weighted_edges_from(
[(" ".join(row["Variable"].split("_")),
" ".join(row["by Variable"].split("_")),
(row["Correlation"])*-1)], color="#FF0000") # Red

# add node attributes to graph node object
for otu, attr in G.node.iteritems():
total_abd = 0
for sid in cat_sids[args.cat_name]:
total_abd += relative_abundance[sid][otu]
mean_otu_cat_abd = total_abd/len(cat_sids[args.cat_name])
G.node[otu]["node_size"] = mean_otu_cat_abd

# convert node labels to integers
H = nx.convert_node_labels_to_integers(G, first_label=1,
ordering="decreasing degree",
label_attribute="id")

# Write out GEXF file for using with Gephi
if args.gexf_out:
nx.write_gexf(H, args.gexf_out, version="1.2draft")

# Write out betweenness centrality measure to file
if args.stats_out_fnh:
with open(args.stats_out_fnh, "w")as poi:
poi.write("OTU Node\tDegree Centrality\tBetweenness Centrality\n")
dc = nx.degree_centrality(G)
bc = nx.betweenness_centrality(G, weight="weight")
for key in sorted(bc.keys()):
poi.write("{}\t{}\t{}\n".format(key, dc[key], bc[key]))

if __name__ == "__main__":
sys.exit(main())
Loading