smdabdoub · smdabdoub · Jun 28, 2016 · Jun 27, 2016 · Jun 28, 2016 · Jun 28, 2016
diff --git a/bin/diversity.py b/bin/diversity.py
@@ -3,9 +3,11 @@
 Calculate and plot alpha diversity for two or more sample categories.
 """
 import sys
-import argparse
 import csv
+import argparse
 import os.path as osp
+from itertools import izip_longest
+from collections import defaultdict
 from phylotoast import graph_util as gu, util as putil
 importerrors = []
 try:
@@ -21,7 +23,6 @@
 except ImportError as ie:
     importerrors.append(ie)
 try:
-    # matplotlib.use("Agg")  # for use on headless server
     from matplotlib import pyplot as plt, gridspec
 except ImportError as ie:
     importerrors.append(ie)
@@ -42,17 +43,24 @@ def calc_diversity(method, parsed_mapf, biom, cats, cats_index):
     for sid, sample_counts in gather_samples(biom).items():
         sample_ids.append(sid)
         if sid in parsed_mapf:
-            counts[parsed_mapf[sid][cats_index]].append(sample_counts)
+            counts[parsed_mapf[sid][cats_index]].append((sid, sample_counts))
 
-    div_calc = {cat: [method(count) for count in counts] for cat, counts in counts.items()}
+    div_calc = {cat: {count[0]: method(count[1]) for count in counts}
+                for cat, counts in counts.items()}
 
     return div_calc, sample_ids
 
 
-def print_MannWhitneyU(x, y=None):
+def print_MannWhitneyU(div_calc):
     """
     Compute the Mann-Whitney U test for unequal group sample sizes.
     """
+    try:
+        x = div_calc.values()[0].values()
+        y = div_calc.values()[1].values()
+    except:
+        return "Error setting up input arrays for Mann-Whitney U Test. Skipping "\
+               "significance testing."
     T, p = stats.mannwhitneyu(x, y)
     print "\nMann-Whitney U test statistic:", T
     print "Two-tailed p-value: {}".format(2 * p)
@@ -63,7 +71,16 @@ def print_KruskalWallisH(div_calc):
     Compute the Kruskal-Wallis H-test for independent samples. A typical rule is that
     each group must have at least 5 measurements.
     """
-    h, p = stats.kruskal(*div_calc)
+    calc = defaultdict(list)
+    try:
+        for k1, v1 in div_calc.iteritems():
+            for k2, v2 in v1.iteritems():
+                calc[k1].append(v2)
+        print calc
+    except:
+        return "Error setting up input arrays for Kruskal-Wallis H-Test. Skipping "\
+               "significance testing."
+    h, p = stats.kruskal(*calc.values())
     print "\nKruskal-Wallis H-test statistic for {} groups: {}".format(str(len(div_calc)), h)
     print "p-value: {}".format(p)
 
@@ -76,9 +93,9 @@ def plot_group_diversity(diversities, grp_colors, title, diversity_type, out_dir
     ax_div = fig_div.add_subplot(grid[0, 0])
 
     for i, grp in enumerate(diversities):
-        gu.plot_kde(diversities[grp], ax_div, title, grp_colors[grp])
+        gu.plot_kde(diversities[grp].values(), ax_div, title, grp_colors[grp])
 
-    ax_div.set_xlabel(diversity_type)
+    ax_div.set_xlabel(diversity_type.title())
     ax_div.set_ylabel("Density")
     ax_div.legend([plt.Rectangle((0, 0), 1, 1, fc=color) for color in grp_colors.values()],
                   grp_colors.keys(), loc="best")
@@ -97,10 +114,10 @@ def write_diversity_metrics(data, sample_ids, fp=None):
 
     with open(fp, "w") as outf:
         out = csv.writer(outf, delimiter="\t")
-        out.writerow(["calculation", "group"])
-        for group in data:
-            for entry in data[group]:
-                out.writerow([entry, group])
+        out.writerow(["SampleID", "Group", "Calculation"])
+        for group, d in data.iteritems():
+            for sid, value in d.iteritems():
+                out.writerow([sid, group, value])
 
 
 def handle_program_options():
@@ -123,6 +140,10 @@ def handle_program_options():
                              The full list of metrics is available at:\
                              http://scikit-bio.org/docs/latest/generated/skbio.diversity.alpha.html.\
                              Beta diversity metrics will be supported in the future.")
+    parser.add_argument("--x_label", default=[None], nargs="+",
+                        help="The name of the diversity metric to be displayed on the\
+                        plot as the X-axis label. If multiple metrics are specified,\
+                        then multiple entries for the X-axis label should be given.")
     parser.add_argument("--color_by",
                         help="A column name in the mapping file containing\
                               hexadecimal (#FF0000) color values that will\
@@ -132,26 +153,35 @@ def handle_program_options():
                         help="A descriptive title that will appear at the top \
                         of the output plot. Surround with quotes if there are\
                         spaces in the title.")
-    parser.add_argument("--x_label", default="diversity", nargs="+",
-                        help="The name of the diversity metric to be displayed on the\
-                        plot as the X-axis label. If multiple metrics are specified,\
-                        then multiple entries for the X-axis label should be given.")
     parser.add_argument("-o", "--out_dir", default=".",
                         help="The directory plots will be saved to.")
     parser.add_argument("--image_type", default="png",
-                        help="The type of image to save: PNG, SVG, PDF, EPS, etc...")
+                        help="The type of image to save: png, svg, pdf, eps, etc...")
     parser.add_argument("--save_calculations",
                         help="Path and name of text file to store the calculated "
                         "diversity metrics.")
     parser.add_argument("--show_significance", action="store_false", help="Display "
                         "significance testing results. The results will be shown by "
                         "default.")
+    parser.add_argument("--show_available_metrics", action="store_true",
+                        help="Supply this parameter to see which alpha diversity metrics "
+                             " are available for usage. No calculations will be performed"
+                             " if this parameter is provided.")
     return parser.parse_args()
 
 
 def main():
     args = handle_program_options()
 
+    metrics = [m for m in alpha.__all__ if "_ci" not in m]
+    try:
+        metrics.remove("faith_pd")
+    except ValueError:
+        pass
+    if args.show_available_metrics:
+        print "\nAvailable alpha diversity metrics:"
+        return "\n".join(metrics)
+
     try:
         with open(args.map_file):
             pass
@@ -179,23 +209,27 @@ def main():
     colors = putil.color_mapping(sample_map, header, args.category, args.color_by)
 
     # Perform diversity calculations and density plotting
-    for method, x_label in zip(args.diversity, args.x_label):
+    for method, x_label in izip_longest(args.diversity, args.x_label):
+        if x_label is None:
+            x_label = method.title()
         if method not in alpha.__all__:
-            sys.exit("ERROR: Diversity metric not found: " + method)
+            sys.exit("ERROR: Diversity metric not found: {}.".format(method))
+        elif method in alpha.__all__ and method not in metrics:
+            sys.exit("Currently, PhyloToAST does not support {} metric.".format(method))
         metric = eval("alpha."+method)
         div_calc, sample_ids = calc_diversity(metric, sample_map, biom_tbl,
                                               cat_vals, cat_idx)
 
-        plot_group_diversity(div_calc, colors, plot_title, x_label,
-                             args.out_dir, args.image_type)
+        plot_group_diversity(div_calc, colors, plot_title, x_label, args.out_dir,
+                             args.image_type)
 
         # calculate and print significance testing results
         if args.show_significance:
             print "Diversity significance testing: {}".format(x_label)
             if len(cat_vals) == 2:
-                print_MannWhitneyU(*div_calc.values())
+                print_MannWhitneyU(div_calc)
             elif len(cat_vals) > 2:
-                print_KruskalWallisH(div_calc.values())
+                print_KruskalWallisH(div_calc)
             print
         else:
             continue
@@ -205,4 +239,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
+    sys.exit(main())
diff --git a/bin/network_plots_gephi.py b/bin/network_plots_gephi.py
@@ -0,0 +1,155 @@
+#!/usr/bin/env python
+"""
+Abstract: Generate network plots using NetworkX and Gephi.
+Author: Akshay Paropkari
+Date: 02/10/2016
+"""
+import sys
+import argparse
+from collections import defaultdict
+importerrors = []
+try:
+    import biom
+except ImportError as ie:
+    importerrors.append(ie)
+try:
+    import pandas as pd
+except ImportError as ie:
+    importerrors.append(ie)
+try:
+    import networkx as nx
+except ImportError:
+    importerrors.append(ie)
+try:
+    from phylotoast import biom_calc as bc, otu_calc as oc
+except ImportError:
+    importerrors.append(ie)
+if len(importerrors) > 0:
+    for err in importerrors:
+        print "Import Error: {}".format(err)
+    sys.exit()
+
+
+def get_relative_abundance(biomfile):
+    """
+    Return arcsine transformed relative abundance from a BIOM format file.
+
+    :type biomfile: BIOM format file
+    :param biomfile: BIOM format file used to obtain relative abundances for each OTU in
+                     a SampleID, which are used as node sizes in network plots.
+
+    :type return: Dictionary of dictionaries.
+    :return: Dictionary keyed on SampleID whose value is a dictionarykeyed on OTU Name
+             whose value is the arc sine tranfsormed relative abundance value for that
+             SampleID-OTU Name pair.
+    """
+    biomf = biom.load_table(biomfile)
+    norm_biomf = biomf.norm(inplace=False)
+    rel_abd = {}
+    for sid in norm_biomf.ids():
+        rel_abd[sid] = {}
+        for otuid in norm_biomf.ids("observation"):
+            otuname = oc.otu_name(norm_biomf.metadata(otuid, axis="observation")["taxonomy"])
+            otuname = " ".join(otuname.split("_"))
+            abd = norm_biomf.get_value_by_ids(otuid, sid)
+            rel_abd[sid][otuname] = abd
+    ast_rel_abd = bc.arcsine_sqrt_transform(rel_abd)
+    return ast_rel_abd
+
+
+def handle_program_options():
+    """Parses the given options passed in at the command line."""
+    parser = argparse.ArgumentParser(description="Create network plots based "
+                                     "on correlation matrix.")
+    parser.add_argument("biom_file", help="Biom file OTU table.")
+    parser.add_argument("mapping_file", help="Mapping file for reading "
+                        "sampleIDs and their groups.")
+    parser.add_argument("condition_column", help="Column name in mapping file "
+                        "denoting the categories.")
+    parser.add_argument("in_corr_mat", help="Correlation matrix file. The "
+                        "format for the tab-separated file should be: "
+                        "Category -> Variable -> by Variable -> Correlation")
+    parser.add_argument("cat_name", help="Category to be plotted.")
+    parser.add_argument("-go", "--gexf_out",
+                        help="Graph information written to this Graph Exchange"
+                        " XML Format file. This file can be input to Gephi.")
+    parser.add_argument("-fp", "--fil_pct", type=float, default=0.75,
+                        help="Specify the minimum value of correlation "
+                        "strength to display. By default, all correlations "
+                        ">=0.75 will be shown.")
+    parser.add_argument("-w", "--stats_out_fnh",
+                        help="Write out graph statistics.")
+    return parser.parse_args()
+
+
+def main():
+    args = handle_program_options()
+
+    # Error handling input files
+    try:
+        relative_abundance = get_relative_abundance(args.biom_file)
+    except OSError as ose:
+        sys.exit("\nError in BIOM file path: {}\n".format(ose))
+    try:
+        # Read in the mapping file
+        mapf = pd.read_csv(args.mapping_file, sep="\t")
+    except IOError as ioe:
+        sys.exit("\nError in mapping file path: {}\n".format(ioe))
+    try:
+        # Read the correlation data
+        corr_data = pd.read_csv(args.in_corr_mat, sep="\t")
+    except IOError as ioe:
+        sys.exit("\nError in correlation matrix file path: {}\n".format(ioe))
+
+    # get category-wise nodes
+    cat_sids = defaultdict(list)
+    for rows in mapf.iterrows():
+        row = rows[1]
+        cat_sids[row[args.condition_column]].append(row["#SampleID"])
+
+    # initialize graph
+    G = nx.Graph()
+
+    # get category-wise graph edge data and edge colors
+    for rows in corr_data.iterrows():
+        row = rows[1]
+        if row["Category"] == args.cat_name and abs(row["Correlation"]) > args.fil_pct:
+            if row["Correlation"] > 0:
+                G.add_weighted_edges_from(
+                    [(" ".join(row["Variable"].split("_")),
+                      " ".join(row["by Variable"].split("_")),
+                      row["Correlation"])], color="#00CC00")  # Green
+            else:
+                G.add_weighted_edges_from(
+                    [(" ".join(row["Variable"].split("_")),
+                      " ".join(row["by Variable"].split("_")),
+                      (row["Correlation"])*-1)], color="#FF0000")  # Red
+
+    # add node attributes to graph node object
+    for otu, attr in G.node.iteritems():
+        total_abd = 0
+        for sid in cat_sids[args.cat_name]:
+            total_abd += relative_abundance[sid][otu]
+        mean_otu_cat_abd = total_abd/len(cat_sids[args.cat_name])
+        G.node[otu]["node_size"] = mean_otu_cat_abd
+
+    # convert node labels to integers
+    H = nx.convert_node_labels_to_integers(G, first_label=1,
+                                           ordering="decreasing degree",
+                                           label_attribute="id")
+
+    # Write out GEXF file for using with Gephi
+    if args.gexf_out:
+        nx.write_gexf(H, args.gexf_out, version="1.2draft")
+
+    # Write out betweenness centrality measure to file
+    if args.stats_out_fnh:
+        with open(args.stats_out_fnh, "w")as poi:
+            poi.write("OTU Node\tDegree Centrality\tBetweenness Centrality\n")
+            dc = nx.degree_centrality(G)
+            bc = nx.betweenness_centrality(G, weight="weight")
+            for key in sorted(bc.keys()):
+                poi.write("{}\t{}\t{}\n".format(key, dc[key], bc[key]))
+
+if __name__ == "__main__":
+    sys.exit(main())