From b20b467f6454f1a4c0d1a93bde536146064f3172 Mon Sep 17 00:00:00 2001
From: Akshay Paropkari <paropkari.1@gmail.com>
Date: Mon, 27 Jun 2016 16:02:22 -0400
Subject: [PATCH 1/6] Updated diversity.py. Added ability to write out sample
 IDs when user provides '--save_calculations' parameter. Updated both
 significance testing functions. Removed '--x_label' parameter, improving the
 diversity nomenclature in figures as well as significance test outputs.
 Updated imports.

Updated '--image_type' parameter for usage clarity while specifying image type for saving figures.
---
 bin/diversity.py | 56 +++++++++++++++++++++++++++++-------------------
 1 file changed, 34 insertions(+), 22 deletions(-)

diff --git a/bin/diversity.py b/bin/diversity.py
index 0b206e9..06b40d5 100755
--- a/bin/diversity.py
+++ b/bin/diversity.py
@@ -3,9 +3,10 @@
 Calculate and plot alpha diversity for two or more sample categories.
 """
 import sys
-import argparse
 import csv
+import argparse
 import os.path as osp
+from collections import defaultdict
 from phylotoast import graph_util as gu, util as putil
 importerrors = []
 try:
@@ -21,7 +22,6 @@
 except ImportError as ie:
     importerrors.append(ie)
 try:
-    # matplotlib.use("Agg")  # for use on headless server
     from matplotlib import pyplot as plt, gridspec
 except ImportError as ie:
     importerrors.append(ie)
@@ -42,17 +42,24 @@ def calc_diversity(method, parsed_mapf, biom, cats, cats_index):
     for sid, sample_counts in gather_samples(biom).items():
         sample_ids.append(sid)
         if sid in parsed_mapf:
-            counts[parsed_mapf[sid][cats_index]].append(sample_counts)
+            counts[parsed_mapf[sid][cats_index]].append((sid, sample_counts))
 
-    div_calc = {cat: [method(count) for count in counts] for cat, counts in counts.items()}
+    div_calc = {cat: {count[0]: method(count[1]) for count in counts}
+                for cat, counts in counts.items()}
 
     return div_calc, sample_ids
 
 
-def print_MannWhitneyU(x, y=None):
+def print_MannWhitneyU(div_calc):
     """
     Compute the Mann-Whitney U test for unequal group sample sizes.
     """
+    try:
+        x = div_calc.values()[0].values()
+        y = div_calc.values()[1].values()
+    except:
+        return "Error setting up input arrays for Mann-Whitney U Test. Skipping "\
+               "significance testing."
     T, p = stats.mannwhitneyu(x, y)
     print "\nMann-Whitney U test statistic:", T
     print "Two-tailed p-value: {}".format(2 * p)
@@ -63,7 +70,16 @@ def print_KruskalWallisH(div_calc):
     Compute the Kruskal-Wallis H-test for independent samples. A typical rule is that
     each group must have at least 5 measurements.
     """
-    h, p = stats.kruskal(*div_calc)
+    calc = defaultdict(list)
+    try:
+        for k1, v1 in div_calc.iteritems():
+            for k2, v2 in v1.iteritems():
+                calc[k1].append(v2)
+        print calc
+    except:
+        return "Error setting up input arrays for Kruskal-Wallis H-Test. Skipping "\
+               "significance testing."
+    h, p = stats.kruskal(*calc.values())
     print "\nKruskal-Wallis H-test statistic for {} groups: {}".format(str(len(div_calc)), h)
     print "p-value: {}".format(p)
 
@@ -76,9 +92,9 @@ def plot_group_diversity(diversities, grp_colors, title, diversity_type, out_dir
     ax_div = fig_div.add_subplot(grid[0, 0])
 
     for i, grp in enumerate(diversities):
-        gu.plot_kde(diversities[grp], ax_div, title, grp_colors[grp])
+        gu.plot_kde(diversities[grp].values(), ax_div, title, grp_colors[grp])
 
-    ax_div.set_xlabel(diversity_type)
+    ax_div.set_xlabel(diversity_type.title())
     ax_div.set_ylabel("Density")
     ax_div.legend([plt.Rectangle((0, 0), 1, 1, fc=color) for color in grp_colors.values()],
                   grp_colors.keys(), loc="best")
@@ -97,10 +113,10 @@ def write_diversity_metrics(data, sample_ids, fp=None):
 
     with open(fp, "w") as outf:
         out = csv.writer(outf, delimiter="\t")
-        out.writerow(["calculation", "group"])
-        for group in data:
-            for entry in data[group]:
-                out.writerow([entry, group])
+        out.writerow(["SampleID", "Group", "Calculation"])
+        for group, d in data.iteritems():
+            for sid, value in d.iteritems():
+                out.writerow([sid, group, value])
 
 
 def handle_program_options():
@@ -132,14 +148,10 @@ def handle_program_options():
                         help="A descriptive title that will appear at the top \
                         of the output plot. Surround with quotes if there are\
                         spaces in the title.")
-    parser.add_argument("--x_label", default="diversity", nargs="+",
-                        help="The name of the diversity metric to be displayed on the\
-                        plot as the X-axis label. If multiple metrics are specified,\
-                        then multiple entries for the X-axis label should be given.")
     parser.add_argument("-o", "--out_dir", default=".",
                         help="The directory plots will be saved to.")
     parser.add_argument("--image_type", default="png",
-                        help="The type of image to save: PNG, SVG, PDF, EPS, etc...")
+                        help="The type of image to save: png, svg, pdf, eps, etc...")
     parser.add_argument("--save_calculations",
                         help="Path and name of text file to store the calculated "
                         "diversity metrics.")
@@ -179,23 +191,23 @@ def main():
     colors = putil.color_mapping(sample_map, header, args.category, args.color_by)
 
     # Perform diversity calculations and density plotting
-    for method, x_label in zip(args.diversity, args.x_label):
+    for method in args.diversity:
         if method not in alpha.__all__:
             sys.exit("ERROR: Diversity metric not found: " + method)
         metric = eval("alpha."+method)
         div_calc, sample_ids = calc_diversity(metric, sample_map, biom_tbl,
                                               cat_vals, cat_idx)
 
-        plot_group_diversity(div_calc, colors, plot_title, x_label,
+        plot_group_diversity(div_calc, colors, plot_title, method,
                              args.out_dir, args.image_type)
 
         # calculate and print significance testing results
         if args.show_significance:
-            print "Diversity significance testing: {}".format(x_label)
+            print "Diversity significance testing: {}".format(method.title())
             if len(cat_vals) == 2:
-                print_MannWhitneyU(*div_calc.values())
+                print_MannWhitneyU(div_calc)
             elif len(cat_vals) > 2:
-                print_KruskalWallisH(div_calc.values())
+                print_KruskalWallisH(div_calc)
             print
         else:
             continue

From 02865333d70d9183302a7fc8245ce0492c583280 Mon Sep 17 00:00:00 2001
From: Akshay Paropkari <paropkari.1@gmail.com>
Date: Tue, 28 Jun 2016 10:02:16 -0400
Subject: [PATCH 2/6] Added new script - 'network_plots_gephi.py'. The script
 creates network plots based on correlation matrix.

---
 bin/network_plots_gephi.py | 155 +++++++++++++++++++++++++++++++++++++
 1 file changed, 155 insertions(+)
 create mode 100755 bin/network_plots_gephi.py

diff --git a/bin/network_plots_gephi.py b/bin/network_plots_gephi.py
new file mode 100755
index 0000000..1c28bf0
--- /dev/null
+++ b/bin/network_plots_gephi.py
@@ -0,0 +1,155 @@
+#!/usr/bin/env python
+"""
+Abstract: Generate network plots using NetworkX and Gephi.
+Author: Akshay Paropkari
+Date: 02/10/2016
+"""
+import sys
+import argparse
+from collections import defaultdict
+importerrors = []
+try:
+    import biom
+except ImportError as ie:
+    importerrors.append(ie)
+try:
+    import pandas as pd
+except ImportError as ie:
+    importerrors.append(ie)
+try:
+    import networkx as nx
+except ImportError:
+    importerrors.append(ie)
+try:
+    from phylotoast import biom_calc as bc, otu_calc as oc
+except ImportError:
+    importerrors.append(ie)
+if len(importerrors) > 0:
+    for err in importerrors:
+        print "Import Error: {}".format(err)
+    sys.exit()
+
+
+def get_relative_abundance(biomfile):
+    """
+    Return arcsine transformed relative abundance from a BIOM format file.
+
+    :type biomfile: BIOM format file
+    :param biomfile: BIOM format file used to obtain relative abundances for each OTU in
+                     a SampleID, which are used as node sizes in network plots.
+
+    :type return: Dictionary of dictionaries.
+    :return: Dictionary keyed on SampleID whose value is a dictionarykeyed on OTU Name
+             whose value is the arc sine tranfsormed relative abundance value for that
+             SampleID-OTU Name pair.
+    """
+    biomf = biom.load_table(biomfile)
+    norm_biomf = biomf.norm(inplace=False)
+    rel_abd = {}
+    for sid in norm_biomf.ids():
+        rel_abd[sid] = {}
+        for otuid in norm_biomf.ids("observation"):
+            otuname = oc.otu_name(norm_biomf.metadata(otuid, axis="observation")["taxonomy"])
+            otuname = " ".join(otuname.split("_"))
+            abd = norm_biomf.get_value_by_ids(otuid, sid)
+            rel_abd[sid][otuname] = abd
+    ast_rel_abd = bc.arcsine_sqrt_transform(rel_abd)
+    return ast_rel_abd
+
+
+def handle_program_options():
+    """Parses the given options passed in at the command line."""
+    parser = argparse.ArgumentParser(description="Create network plots based "
+                                     "on correlation matrix.")
+    parser.add_argument("biom_file", help="Biom file OTU table.")
+    parser.add_argument("mapping_file", help="Mapping file for reading "
+                        "sampleIDs and their groups.")
+    parser.add_argument("condition_column", help="Column name in mapping file "
+                        "denoting the categories.")
+    parser.add_argument("in_corr_mat", help="Correlation matrix file. The "
+                        "format for the tab-separated file should be: "
+                        "Category -> Variable -> by Variable -> Correlation")
+    parser.add_argument("cat_name", help="Category to be plotted.")
+    parser.add_argument("-go", "--gexf_out",
+                        help="Graph information written to this Graph Exchange"
+                        " XML Format file. This file can be input to Gephi.")
+    parser.add_argument("-fp", "--fil_pct", type=float, default=0.75,
+                        help="Specify the minimum value of correlation "
+                        "strength to display. By default, all correlations "
+                        ">=0.75 will be shown.")
+    parser.add_argument("-w", "--stats_out_fnh",
+                        help="Write out graph statistics.")
+    return parser.parse_args()
+
+
+def main():
+    args = handle_program_options()
+
+    # Error handling input files
+    try:
+        relative_abundance = get_relative_abundance(args.biom_file)
+    except OSError as ose:
+        sys.exit("\nError in BIOM file path: {}\n".format(ose))
+    try:
+        # Read in the mapping file
+        mapf = pd.read_csv(args.mapping_file, sep="\t")
+    except IOError as ioe:
+        sys.exit("\nError in mapping file path: {}\n".format(ioe))
+    try:
+        # Read the correlation data
+        corr_data = pd.read_csv(args.in_corr_mat, sep="\t")
+    except IOError as ioe:
+        sys.exit("\nError in correlation matrix file path: {}\n".format(ioe))
+
+    # get category-wise nodes
+    cat_sids = defaultdict(list)
+    for rows in mapf.iterrows():
+        row = rows[1]
+        cat_sids[row[args.condition_column]].append(row["#SampleID"])
+
+    # initialize graph
+    G = nx.Graph()
+
+    # get category-wise graph edge data and edge colors
+    for rows in corr_data.iterrows():
+        row = rows[1]
+        if row["Category"] == args.cat_name and abs(row["Correlation"]) > args.fil_pct:
+            if row["Correlation"] > 0:
+                G.add_weighted_edges_from(
+                    [(" ".join(row["Variable"].split("_")),
+                      " ".join(row["by Variable"].split("_")),
+                      row["Correlation"])], color="#00CC00")  # Green
+            else:
+                G.add_weighted_edges_from(
+                    [(" ".join(row["Variable"].split("_")),
+                      " ".join(row["by Variable"].split("_")),
+                      (row["Correlation"])*-1)], color="#FF0000")  # Red
+
+    # add node attributes to graph node object
+    for otu, attr in G.node.iteritems():
+        total_abd = 0
+        for sid in cat_sids[args.cat_name]:
+            total_abd += relative_abundance[sid][otu]
+        mean_otu_cat_abd = total_abd/len(cat_sids[args.cat_name])
+        G.node[otu]["node_size"] = mean_otu_cat_abd
+
+    # convert node labels to integers
+    H = nx.convert_node_labels_to_integers(G, first_label=1,
+                                           ordering="decreasing degree",
+                                           label_attribute="id")
+
+    # Write out GEXF file for using with Gephi
+    if args.gexf_out:
+        nx.write_gexf(H, args.gexf_out, version="1.2draft")
+
+    # Write out betweenness centrality measure to file
+    if args.stats_out_fnh:
+        with open(args.stats_out_fnh, "w")as poi:
+            poi.write("OTU Node\tDegree Centrality\tBetweenness Centrality\n")
+            dc = nx.degree_centrality(G)
+            bc = nx.betweenness_centrality(G, weight="weight")
+            for key in sorted(bc.keys()):
+                poi.write("{}\t{}\t{}\n".format(key, dc[key], bc[key]))
+
+if __name__ == "__main__":
+    sys.exit(main())

From 0cf33dd8ba80751e2ac187bc572361f4acd01484 Mon Sep 17 00:00:00 2001
From: Akshay Paropkari <paropkari.1@gmail.com>
Date: Tue, 28 Jun 2016 10:28:34 -0400
Subject: [PATCH 3/6] Updated diversity.py. Added --show_available_metrics
 parameter for displaying alpha diversity metrics available for users to use.
 Addresses #48.

---
 bin/diversity.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/bin/diversity.py b/bin/diversity.py
index 06b40d5..3b7cee7 100755
--- a/bin/diversity.py
+++ b/bin/diversity.py
@@ -158,12 +158,22 @@ def handle_program_options():
     parser.add_argument("--show_significance", action="store_false", help="Display "
                         "significance testing results. The results will be shown by "
                         "default.")
+    parser.add_argument("--show_available_metrics", action="store_true",
+                        help="Supply this parameter to see which alpha diversity metrics "
+                             " are available for usage. No calculations will be performed"
+                             " if this parameter is provided.")
     return parser.parse_args()
 
 
 def main():
     args = handle_program_options()
 
+    if args.show_available_metrics:
+        print "\nAvailable alpha diversity metrics:"
+        return "\n".join(alpha.__all__)
+    else:
+        return "ERROR! Cannot print out alpha diversity metrics list."
+
     try:
         with open(args.map_file):
             pass
@@ -217,4 +227,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
+    sys.exit(main())

From ee8affe1a4b36699a2a74b7c6c8664506fa5afc9 Mon Sep 17 00:00:00 2001
From: Akshay Paropkari <paropkari.1@gmail.com>
Date: Tue, 28 Jun 2016 12:02:33 -0400
Subject: [PATCH 4/6] Added documentation for network_plots_gephi.py script.

---
 docs/scripts/network_plots_gephi.txt | 54 ++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)
 create mode 100644 docs/scripts/network_plots_gephi.txt

diff --git a/docs/scripts/network_plots_gephi.txt b/docs/scripts/network_plots_gephi.txt
new file mode 100644
index 0000000..3201a0f
--- /dev/null
+++ b/docs/scripts/network_plots_gephi.txt
@@ -0,0 +1,54 @@
+network_plots_gephi.py
+======================
+
+Create network plots based on correlation matrix.
+
+.. code-block:: bash
+
+    usage: network_plots_gephi.py [-h] [-go GEXF_OUT] [-fp FIL_PCT] [-w STATS_OUT_FNH] biom_file mapping_file condition_column in_corr_mat cat_name
+
+Required Arguments
+-------------------
+
+.. cmdoption:: biom_file
+
+    The biom-format file.
+
+.. cmdoption:: mapping_file
+
+    Mapping file for reading sampleIDs and their groups.
+
+.. cmdoption:: condition_column
+
+    Column name in mapping file denoting the categories.
+
+.. cmdoption:: in_corr_mat
+
+    Correlation matrix file. The format for the tab-separated file should be: Category -> Variable -> by Variable -> Correlation
+
+.. cmdoption:: cat_name
+
+    Category to be plotted.
+
+Optional Arguments
+------------------
+
+.. cmdoption:: -go GEXF_OUT, --gexf_out GEXF_OUT
+
+    The directory to output the PCoA plots to.
+
+.. cmdoption:: --scaling_factor SCALING_FACTOR
+
+    Graph information written to this Graph Exchange XML Format file. This file can be input to Gephi.
+
+.. cmdoption:: -fp FIL_PCT, --fil_pct FIL_PCT
+
+    Specify the minimum value of correlation strength to display. By default, all correlations greater than or equal to 0.75 will be shown.
+
+.. cmdoption:: -w STATS_OUT_FNH, --stats_out_fnh STATS_OUT_FNH
+
+    Write out graph statistics - degree and betweenness centrality calculations for each node.
+
+.. cmdoption:: -h, --help
+
+    Show this help message and exit

From ec1cd8849777b8b32ca44e8899e005a5c0e067ca Mon Sep 17 00:00:00 2001
From: Akshay Paropkari <paropkari.1@gmail.com>
Date: Tue, 28 Jun 2016 12:04:51 -0400
Subject: [PATCH 5/6] Updated documentation for diversity.py.

---
 docs/scripts/diversity.txt | 36 ++++++++++++++++++++++++++----------
 1 file changed, 26 insertions(+), 10 deletions(-)

diff --git a/docs/scripts/diversity.txt b/docs/scripts/diversity.txt
index 5cb9880..5d8c08c 100755
--- a/docs/scripts/diversity.txt
+++ b/docs/scripts/diversity.txt
@@ -1,31 +1,31 @@
 ============
 diversity.py
 ============
-Calculate and plot for two sample categories: Shannon diversity, Chao1 diversity, and a Jaccard similiarity distance matrix heatmap.
+Calculate the alpha diversity of a set of samples using one or more metrics and output a kernal density estimator-smoothed histogram of the results.
 
 .. code-block:: bash
 
-    usage: usage: diversity.py [-h][-c {shannon,chao1,jaccard} [{shannon,chao1,jaccard} ...]] [-p IMAGE_TYPE] map_file biom_file category plot_title out_dir
+    usage: diversity.py [-h] [-d DIVERSITY [DIVERSITY ...]] [--plot_title PLOT_TITLE] [--image_type IMAGE_TYPE] [--save_calculations SAVE_CALCULATIONS] [--show_significance] [--show_available_metrics] -m MAP_FILE -i BIOM_FP -c CATEGORY --color_by COLOR_BY -o OUT_DIR
 
 Required arguments
 -------------------
-.. cmdoption:: map_file
+.. cmdoption:: -m MAP_FILE, --map_file MAP_FILE
 
     QIIME mapping file.
 
-.. cmdoption:: biom_file
+.. cmdoption:: -i BIOM_FP, --biom_fp BIOM_FP
 
     BIOM table file name
 
-.. cmdoption:: category
+.. cmdoption:: -c CATEGORY, --category CATEGORY
 
     Specific category from the mapping file.
 
-.. cmdoption:: plot_title
+.. cmdoption:: --color_by COLOR_BY
 
-    The name of a PDF file the pathway map will be written to.
+    A column name in the mapping file containing hexadecimal (#FF0000) color values that will be used to color the groups. Each sample ID must have a color entry.
 
-.. cmdoption:: out_dir
+.. cmdoption:: -o OUT_DIR, --out_dir OUT_DIR
 
     The directory all plots will be saved to.
 
@@ -35,10 +35,26 @@ Optional arguments
 
     show this help message and exit
 
-.. cmdoption:: -c {shannon,chao1,jaccard} [{shannon,chao1,jaccard} ...]
+.. cmdoption:: -d DIVERSITY [DIVERSITY ...], --diversity DIVERSITY [DIVERSITY ...]
+
+    The alpha diversity metric. Default value is 'shannon', which will calculate the Shannon entropy. Multiple metrics can be specified (space separated). The full list of metrics is available at: http://scikit-bio.org/docs/latest/generated/skbio.diversity.alpha.html.
 
-    Choose the type of calculation needed. Default value is "None", which will output all 3 types of calculations.
+.. cmdoption:: --plot_title PLOT_TITLE
+
+    The name of a PDF file the pathway map will be written to.
 
 .. cmdoption:: -p IMAGE_TYPE, --image_type IMAGE_TYPE
 
     The type of image to save: PNG, SVG, etc.
+
+.. cmdoption:: --save_calculations SAVE_CALCULATIONS
+
+    Path and name of text file to store the calculated diversity metrics.
+
+.. cmdoption:: --show_significance
+
+    Display significance testing results. The results will be shown by default.
+
+.. cmdoption:: --show_available_metrics
+
+    Supply this parameter to see which alpha diversity metrics are available for usage. No calculations will be performed if this parameter is provided.

From 0db546f8e42b524b1e6ba0021b5c6b7b8a9360e8 Mon Sep 17 00:00:00 2001
From: Akshay Paropkari <paropkari.1@gmail.com>
Date: Tue, 28 Jun 2016 14:02:36 -0400
Subject: [PATCH 6/6] Updated diversity.py. Added checks valid, but not yet
 supported by PhyloToAST diversity metrics. Brought back --x_label parameter.

---
 bin/diversity.py | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/bin/diversity.py b/bin/diversity.py
index 3b7cee7..b9c4212 100755
--- a/bin/diversity.py
+++ b/bin/diversity.py
@@ -6,6 +6,7 @@
 import csv
 import argparse
 import os.path as osp
+from itertools import izip_longest
 from collections import defaultdict
 from phylotoast import graph_util as gu, util as putil
 importerrors = []
@@ -139,6 +140,10 @@ def handle_program_options():
                              The full list of metrics is available at:\
                              http://scikit-bio.org/docs/latest/generated/skbio.diversity.alpha.html.\
                              Beta diversity metrics will be supported in the future.")
+    parser.add_argument("--x_label", default=[None], nargs="+",
+                        help="The name of the diversity metric to be displayed on the\
+                        plot as the X-axis label. If multiple metrics are specified,\
+                        then multiple entries for the X-axis label should be given.")
     parser.add_argument("--color_by",
                         help="A column name in the mapping file containing\
                               hexadecimal (#FF0000) color values that will\
@@ -168,11 +173,14 @@ def handle_program_options():
 def main():
     args = handle_program_options()
 
+    metrics = [m for m in alpha.__all__ if "_ci" not in m]
+    try:
+        metrics.remove("faith_pd")
+    except ValueError:
+        pass
     if args.show_available_metrics:
         print "\nAvailable alpha diversity metrics:"
-        return "\n".join(alpha.__all__)
-    else:
-        return "ERROR! Cannot print out alpha diversity metrics list."
+        return "\n".join(metrics)
 
     try:
         with open(args.map_file):
@@ -201,19 +209,23 @@ def main():
     colors = putil.color_mapping(sample_map, header, args.category, args.color_by)
 
     # Perform diversity calculations and density plotting
-    for method in args.diversity:
+    for method, x_label in izip_longest(args.diversity, args.x_label):
+        if x_label is None:
+            x_label = method.title()
         if method not in alpha.__all__:
-            sys.exit("ERROR: Diversity metric not found: " + method)
+            sys.exit("ERROR: Diversity metric not found: {}.".format(method))
+        elif method in alpha.__all__ and method not in metrics:
+            sys.exit("Currently, PhyloToAST does not support {} metric.".format(method))
         metric = eval("alpha."+method)
         div_calc, sample_ids = calc_diversity(metric, sample_map, biom_tbl,
                                               cat_vals, cat_idx)
 
-        plot_group_diversity(div_calc, colors, plot_title, method,
-                             args.out_dir, args.image_type)
+        plot_group_diversity(div_calc, colors, plot_title, x_label, args.out_dir,
+                             args.image_type)
 
         # calculate and print significance testing results
         if args.show_significance:
-            print "Diversity significance testing: {}".format(method.title())
+            print "Diversity significance testing: {}".format(x_label)
             if len(cat_vals) == 2:
                 print_MannWhitneyU(div_calc)
             elif len(cat_vals) > 2: