From b20b467f6454f1a4c0d1a93bde536146064f3172 Mon Sep 17 00:00:00 2001 From: Akshay Paropkari Date: Mon, 27 Jun 2016 16:02:22 -0400 Subject: [PATCH 1/6] Updated diversity.py. Added ability to write out sample IDs when user provides '--save_calculations' parameter. Updated both significance testing functions. Removed '--x_label' parameter, improving the diversity nomenclature in figures as well as significance test outputs. Updated imports. Updated '--image_type' parameter for usage clarity while specifying image type for saving figures. --- bin/diversity.py | 56 +++++++++++++++++++++++++++++------------------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/bin/diversity.py b/bin/diversity.py index 0b206e9..06b40d5 100755 --- a/bin/diversity.py +++ b/bin/diversity.py @@ -3,9 +3,10 @@ Calculate and plot alpha diversity for two or more sample categories. """ import sys -import argparse import csv +import argparse import os.path as osp +from collections import defaultdict from phylotoast import graph_util as gu, util as putil importerrors = [] try: @@ -21,7 +22,6 @@ except ImportError as ie: importerrors.append(ie) try: - # matplotlib.use("Agg") # for use on headless server from matplotlib import pyplot as plt, gridspec except ImportError as ie: importerrors.append(ie) @@ -42,17 +42,24 @@ def calc_diversity(method, parsed_mapf, biom, cats, cats_index): for sid, sample_counts in gather_samples(biom).items(): sample_ids.append(sid) if sid in parsed_mapf: - counts[parsed_mapf[sid][cats_index]].append(sample_counts) + counts[parsed_mapf[sid][cats_index]].append((sid, sample_counts)) - div_calc = {cat: [method(count) for count in counts] for cat, counts in counts.items()} + div_calc = {cat: {count[0]: method(count[1]) for count in counts} + for cat, counts in counts.items()} return div_calc, sample_ids -def print_MannWhitneyU(x, y=None): +def print_MannWhitneyU(div_calc): """ Compute the Mann-Whitney U test for unequal group sample sizes. """ + try: + x = div_calc.values()[0].values() + y = div_calc.values()[1].values() + except: + return "Error setting up input arrays for Mann-Whitney U Test. Skipping "\ + "significance testing." T, p = stats.mannwhitneyu(x, y) print "\nMann-Whitney U test statistic:", T print "Two-tailed p-value: {}".format(2 * p) @@ -63,7 +70,16 @@ def print_KruskalWallisH(div_calc): Compute the Kruskal-Wallis H-test for independent samples. A typical rule is that each group must have at least 5 measurements. """ - h, p = stats.kruskal(*div_calc) + calc = defaultdict(list) + try: + for k1, v1 in div_calc.iteritems(): + for k2, v2 in v1.iteritems(): + calc[k1].append(v2) + print calc + except: + return "Error setting up input arrays for Kruskal-Wallis H-Test. Skipping "\ + "significance testing." + h, p = stats.kruskal(*calc.values()) print "\nKruskal-Wallis H-test statistic for {} groups: {}".format(str(len(div_calc)), h) print "p-value: {}".format(p) @@ -76,9 +92,9 @@ def plot_group_diversity(diversities, grp_colors, title, diversity_type, out_dir ax_div = fig_div.add_subplot(grid[0, 0]) for i, grp in enumerate(diversities): - gu.plot_kde(diversities[grp], ax_div, title, grp_colors[grp]) + gu.plot_kde(diversities[grp].values(), ax_div, title, grp_colors[grp]) - ax_div.set_xlabel(diversity_type) + ax_div.set_xlabel(diversity_type.title()) ax_div.set_ylabel("Density") ax_div.legend([plt.Rectangle((0, 0), 1, 1, fc=color) for color in grp_colors.values()], grp_colors.keys(), loc="best") @@ -97,10 +113,10 @@ def write_diversity_metrics(data, sample_ids, fp=None): with open(fp, "w") as outf: out = csv.writer(outf, delimiter="\t") - out.writerow(["calculation", "group"]) - for group in data: - for entry in data[group]: - out.writerow([entry, group]) + out.writerow(["SampleID", "Group", "Calculation"]) + for group, d in data.iteritems(): + for sid, value in d.iteritems(): + out.writerow([sid, group, value]) def handle_program_options(): @@ -132,14 +148,10 @@ def handle_program_options(): help="A descriptive title that will appear at the top \ of the output plot. Surround with quotes if there are\ spaces in the title.") - parser.add_argument("--x_label", default="diversity", nargs="+", - help="The name of the diversity metric to be displayed on the\ - plot as the X-axis label. If multiple metrics are specified,\ - then multiple entries for the X-axis label should be given.") parser.add_argument("-o", "--out_dir", default=".", help="The directory plots will be saved to.") parser.add_argument("--image_type", default="png", - help="The type of image to save: PNG, SVG, PDF, EPS, etc...") + help="The type of image to save: png, svg, pdf, eps, etc...") parser.add_argument("--save_calculations", help="Path and name of text file to store the calculated " "diversity metrics.") @@ -179,23 +191,23 @@ def main(): colors = putil.color_mapping(sample_map, header, args.category, args.color_by) # Perform diversity calculations and density plotting - for method, x_label in zip(args.diversity, args.x_label): + for method in args.diversity: if method not in alpha.__all__: sys.exit("ERROR: Diversity metric not found: " + method) metric = eval("alpha."+method) div_calc, sample_ids = calc_diversity(metric, sample_map, biom_tbl, cat_vals, cat_idx) - plot_group_diversity(div_calc, colors, plot_title, x_label, + plot_group_diversity(div_calc, colors, plot_title, method, args.out_dir, args.image_type) # calculate and print significance testing results if args.show_significance: - print "Diversity significance testing: {}".format(x_label) + print "Diversity significance testing: {}".format(method.title()) if len(cat_vals) == 2: - print_MannWhitneyU(*div_calc.values()) + print_MannWhitneyU(div_calc) elif len(cat_vals) > 2: - print_KruskalWallisH(div_calc.values()) + print_KruskalWallisH(div_calc) print else: continue From 02865333d70d9183302a7fc8245ce0492c583280 Mon Sep 17 00:00:00 2001 From: Akshay Paropkari Date: Tue, 28 Jun 2016 10:02:16 -0400 Subject: [PATCH 2/6] Added new script - 'network_plots_gephi.py'. The script creates network plots based on correlation matrix. --- bin/network_plots_gephi.py | 155 +++++++++++++++++++++++++++++++++++++ 1 file changed, 155 insertions(+) create mode 100755 bin/network_plots_gephi.py diff --git a/bin/network_plots_gephi.py b/bin/network_plots_gephi.py new file mode 100755 index 0000000..1c28bf0 --- /dev/null +++ b/bin/network_plots_gephi.py @@ -0,0 +1,155 @@ +#!/usr/bin/env python +""" +Abstract: Generate network plots using NetworkX and Gephi. +Author: Akshay Paropkari +Date: 02/10/2016 +""" +import sys +import argparse +from collections import defaultdict +importerrors = [] +try: + import biom +except ImportError as ie: + importerrors.append(ie) +try: + import pandas as pd +except ImportError as ie: + importerrors.append(ie) +try: + import networkx as nx +except ImportError: + importerrors.append(ie) +try: + from phylotoast import biom_calc as bc, otu_calc as oc +except ImportError: + importerrors.append(ie) +if len(importerrors) > 0: + for err in importerrors: + print "Import Error: {}".format(err) + sys.exit() + + +def get_relative_abundance(biomfile): + """ + Return arcsine transformed relative abundance from a BIOM format file. + + :type biomfile: BIOM format file + :param biomfile: BIOM format file used to obtain relative abundances for each OTU in + a SampleID, which are used as node sizes in network plots. + + :type return: Dictionary of dictionaries. + :return: Dictionary keyed on SampleID whose value is a dictionarykeyed on OTU Name + whose value is the arc sine tranfsormed relative abundance value for that + SampleID-OTU Name pair. + """ + biomf = biom.load_table(biomfile) + norm_biomf = biomf.norm(inplace=False) + rel_abd = {} + for sid in norm_biomf.ids(): + rel_abd[sid] = {} + for otuid in norm_biomf.ids("observation"): + otuname = oc.otu_name(norm_biomf.metadata(otuid, axis="observation")["taxonomy"]) + otuname = " ".join(otuname.split("_")) + abd = norm_biomf.get_value_by_ids(otuid, sid) + rel_abd[sid][otuname] = abd + ast_rel_abd = bc.arcsine_sqrt_transform(rel_abd) + return ast_rel_abd + + +def handle_program_options(): + """Parses the given options passed in at the command line.""" + parser = argparse.ArgumentParser(description="Create network plots based " + "on correlation matrix.") + parser.add_argument("biom_file", help="Biom file OTU table.") + parser.add_argument("mapping_file", help="Mapping file for reading " + "sampleIDs and their groups.") + parser.add_argument("condition_column", help="Column name in mapping file " + "denoting the categories.") + parser.add_argument("in_corr_mat", help="Correlation matrix file. The " + "format for the tab-separated file should be: " + "Category -> Variable -> by Variable -> Correlation") + parser.add_argument("cat_name", help="Category to be plotted.") + parser.add_argument("-go", "--gexf_out", + help="Graph information written to this Graph Exchange" + " XML Format file. This file can be input to Gephi.") + parser.add_argument("-fp", "--fil_pct", type=float, default=0.75, + help="Specify the minimum value of correlation " + "strength to display. By default, all correlations " + ">=0.75 will be shown.") + parser.add_argument("-w", "--stats_out_fnh", + help="Write out graph statistics.") + return parser.parse_args() + + +def main(): + args = handle_program_options() + + # Error handling input files + try: + relative_abundance = get_relative_abundance(args.biom_file) + except OSError as ose: + sys.exit("\nError in BIOM file path: {}\n".format(ose)) + try: + # Read in the mapping file + mapf = pd.read_csv(args.mapping_file, sep="\t") + except IOError as ioe: + sys.exit("\nError in mapping file path: {}\n".format(ioe)) + try: + # Read the correlation data + corr_data = pd.read_csv(args.in_corr_mat, sep="\t") + except IOError as ioe: + sys.exit("\nError in correlation matrix file path: {}\n".format(ioe)) + + # get category-wise nodes + cat_sids = defaultdict(list) + for rows in mapf.iterrows(): + row = rows[1] + cat_sids[row[args.condition_column]].append(row["#SampleID"]) + + # initialize graph + G = nx.Graph() + + # get category-wise graph edge data and edge colors + for rows in corr_data.iterrows(): + row = rows[1] + if row["Category"] == args.cat_name and abs(row["Correlation"]) > args.fil_pct: + if row["Correlation"] > 0: + G.add_weighted_edges_from( + [(" ".join(row["Variable"].split("_")), + " ".join(row["by Variable"].split("_")), + row["Correlation"])], color="#00CC00") # Green + else: + G.add_weighted_edges_from( + [(" ".join(row["Variable"].split("_")), + " ".join(row["by Variable"].split("_")), + (row["Correlation"])*-1)], color="#FF0000") # Red + + # add node attributes to graph node object + for otu, attr in G.node.iteritems(): + total_abd = 0 + for sid in cat_sids[args.cat_name]: + total_abd += relative_abundance[sid][otu] + mean_otu_cat_abd = total_abd/len(cat_sids[args.cat_name]) + G.node[otu]["node_size"] = mean_otu_cat_abd + + # convert node labels to integers + H = nx.convert_node_labels_to_integers(G, first_label=1, + ordering="decreasing degree", + label_attribute="id") + + # Write out GEXF file for using with Gephi + if args.gexf_out: + nx.write_gexf(H, args.gexf_out, version="1.2draft") + + # Write out betweenness centrality measure to file + if args.stats_out_fnh: + with open(args.stats_out_fnh, "w")as poi: + poi.write("OTU Node\tDegree Centrality\tBetweenness Centrality\n") + dc = nx.degree_centrality(G) + bc = nx.betweenness_centrality(G, weight="weight") + for key in sorted(bc.keys()): + poi.write("{}\t{}\t{}\n".format(key, dc[key], bc[key])) + +if __name__ == "__main__": + sys.exit(main()) From 0cf33dd8ba80751e2ac187bc572361f4acd01484 Mon Sep 17 00:00:00 2001 From: Akshay Paropkari Date: Tue, 28 Jun 2016 10:28:34 -0400 Subject: [PATCH 3/6] Updated diversity.py. Added --show_available_metrics parameter for displaying alpha diversity metrics available for users to use. Addresses #48. --- bin/diversity.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/bin/diversity.py b/bin/diversity.py index 06b40d5..3b7cee7 100755 --- a/bin/diversity.py +++ b/bin/diversity.py @@ -158,12 +158,22 @@ def handle_program_options(): parser.add_argument("--show_significance", action="store_false", help="Display " "significance testing results. The results will be shown by " "default.") + parser.add_argument("--show_available_metrics", action="store_true", + help="Supply this parameter to see which alpha diversity metrics " + " are available for usage. No calculations will be performed" + " if this parameter is provided.") return parser.parse_args() def main(): args = handle_program_options() + if args.show_available_metrics: + print "\nAvailable alpha diversity metrics:" + return "\n".join(alpha.__all__) + else: + return "ERROR! Cannot print out alpha diversity metrics list." + try: with open(args.map_file): pass @@ -217,4 +227,4 @@ def main(): if __name__ == "__main__": - main() + sys.exit(main()) From ee8affe1a4b36699a2a74b7c6c8664506fa5afc9 Mon Sep 17 00:00:00 2001 From: Akshay Paropkari Date: Tue, 28 Jun 2016 12:02:33 -0400 Subject: [PATCH 4/6] Added documentation for network_plots_gephi.py script. --- docs/scripts/network_plots_gephi.txt | 54 ++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 docs/scripts/network_plots_gephi.txt diff --git a/docs/scripts/network_plots_gephi.txt b/docs/scripts/network_plots_gephi.txt new file mode 100644 index 0000000..3201a0f --- /dev/null +++ b/docs/scripts/network_plots_gephi.txt @@ -0,0 +1,54 @@ +network_plots_gephi.py +====================== + +Create network plots based on correlation matrix. + +.. code-block:: bash + + usage: network_plots_gephi.py [-h] [-go GEXF_OUT] [-fp FIL_PCT] [-w STATS_OUT_FNH] biom_file mapping_file condition_column in_corr_mat cat_name + +Required Arguments +------------------- + +.. cmdoption:: biom_file + + The biom-format file. + +.. cmdoption:: mapping_file + + Mapping file for reading sampleIDs and their groups. + +.. cmdoption:: condition_column + + Column name in mapping file denoting the categories. + +.. cmdoption:: in_corr_mat + + Correlation matrix file. The format for the tab-separated file should be: Category -> Variable -> by Variable -> Correlation + +.. cmdoption:: cat_name + + Category to be plotted. + +Optional Arguments +------------------ + +.. cmdoption:: -go GEXF_OUT, --gexf_out GEXF_OUT + + The directory to output the PCoA plots to. + +.. cmdoption:: --scaling_factor SCALING_FACTOR + + Graph information written to this Graph Exchange XML Format file. This file can be input to Gephi. + +.. cmdoption:: -fp FIL_PCT, --fil_pct FIL_PCT + + Specify the minimum value of correlation strength to display. By default, all correlations greater than or equal to 0.75 will be shown. + +.. cmdoption:: -w STATS_OUT_FNH, --stats_out_fnh STATS_OUT_FNH + + Write out graph statistics - degree and betweenness centrality calculations for each node. + +.. cmdoption:: -h, --help + + Show this help message and exit From ec1cd8849777b8b32ca44e8899e005a5c0e067ca Mon Sep 17 00:00:00 2001 From: Akshay Paropkari Date: Tue, 28 Jun 2016 12:04:51 -0400 Subject: [PATCH 5/6] Updated documentation for diversity.py. --- docs/scripts/diversity.txt | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/docs/scripts/diversity.txt b/docs/scripts/diversity.txt index 5cb9880..5d8c08c 100755 --- a/docs/scripts/diversity.txt +++ b/docs/scripts/diversity.txt @@ -1,31 +1,31 @@ ============ diversity.py ============ -Calculate and plot for two sample categories: Shannon diversity, Chao1 diversity, and a Jaccard similiarity distance matrix heatmap. +Calculate the alpha diversity of a set of samples using one or more metrics and output a kernal density estimator-smoothed histogram of the results. .. code-block:: bash - usage: usage: diversity.py [-h][-c {shannon,chao1,jaccard} [{shannon,chao1,jaccard} ...]] [-p IMAGE_TYPE] map_file biom_file category plot_title out_dir + usage: diversity.py [-h] [-d DIVERSITY [DIVERSITY ...]] [--plot_title PLOT_TITLE] [--image_type IMAGE_TYPE] [--save_calculations SAVE_CALCULATIONS] [--show_significance] [--show_available_metrics] -m MAP_FILE -i BIOM_FP -c CATEGORY --color_by COLOR_BY -o OUT_DIR Required arguments ------------------- -.. cmdoption:: map_file +.. cmdoption:: -m MAP_FILE, --map_file MAP_FILE QIIME mapping file. -.. cmdoption:: biom_file +.. cmdoption:: -i BIOM_FP, --biom_fp BIOM_FP BIOM table file name -.. cmdoption:: category +.. cmdoption:: -c CATEGORY, --category CATEGORY Specific category from the mapping file. -.. cmdoption:: plot_title +.. cmdoption:: --color_by COLOR_BY - The name of a PDF file the pathway map will be written to. + A column name in the mapping file containing hexadecimal (#FF0000) color values that will be used to color the groups. Each sample ID must have a color entry. -.. cmdoption:: out_dir +.. cmdoption:: -o OUT_DIR, --out_dir OUT_DIR The directory all plots will be saved to. @@ -35,10 +35,26 @@ Optional arguments show this help message and exit -.. cmdoption:: -c {shannon,chao1,jaccard} [{shannon,chao1,jaccard} ...] +.. cmdoption:: -d DIVERSITY [DIVERSITY ...], --diversity DIVERSITY [DIVERSITY ...] + + The alpha diversity metric. Default value is 'shannon', which will calculate the Shannon entropy. Multiple metrics can be specified (space separated). The full list of metrics is available at: http://scikit-bio.org/docs/latest/generated/skbio.diversity.alpha.html. - Choose the type of calculation needed. Default value is "None", which will output all 3 types of calculations. +.. cmdoption:: --plot_title PLOT_TITLE + + The name of a PDF file the pathway map will be written to. .. cmdoption:: -p IMAGE_TYPE, --image_type IMAGE_TYPE The type of image to save: PNG, SVG, etc. + +.. cmdoption:: --save_calculations SAVE_CALCULATIONS + + Path and name of text file to store the calculated diversity metrics. + +.. cmdoption:: --show_significance + + Display significance testing results. The results will be shown by default. + +.. cmdoption:: --show_available_metrics + + Supply this parameter to see which alpha diversity metrics are available for usage. No calculations will be performed if this parameter is provided. From 0db546f8e42b524b1e6ba0021b5c6b7b8a9360e8 Mon Sep 17 00:00:00 2001 From: Akshay Paropkari Date: Tue, 28 Jun 2016 14:02:36 -0400 Subject: [PATCH 6/6] Updated diversity.py. Added checks valid, but not yet supported by PhyloToAST diversity metrics. Brought back --x_label parameter. --- bin/diversity.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/bin/diversity.py b/bin/diversity.py index 3b7cee7..b9c4212 100755 --- a/bin/diversity.py +++ b/bin/diversity.py @@ -6,6 +6,7 @@ import csv import argparse import os.path as osp +from itertools import izip_longest from collections import defaultdict from phylotoast import graph_util as gu, util as putil importerrors = [] @@ -139,6 +140,10 @@ def handle_program_options(): The full list of metrics is available at:\ http://scikit-bio.org/docs/latest/generated/skbio.diversity.alpha.html.\ Beta diversity metrics will be supported in the future.") + parser.add_argument("--x_label", default=[None], nargs="+", + help="The name of the diversity metric to be displayed on the\ + plot as the X-axis label. If multiple metrics are specified,\ + then multiple entries for the X-axis label should be given.") parser.add_argument("--color_by", help="A column name in the mapping file containing\ hexadecimal (#FF0000) color values that will\ @@ -168,11 +173,14 @@ def handle_program_options(): def main(): args = handle_program_options() + metrics = [m for m in alpha.__all__ if "_ci" not in m] + try: + metrics.remove("faith_pd") + except ValueError: + pass if args.show_available_metrics: print "\nAvailable alpha diversity metrics:" - return "\n".join(alpha.__all__) - else: - return "ERROR! Cannot print out alpha diversity metrics list." + return "\n".join(metrics) try: with open(args.map_file): @@ -201,19 +209,23 @@ def main(): colors = putil.color_mapping(sample_map, header, args.category, args.color_by) # Perform diversity calculations and density plotting - for method in args.diversity: + for method, x_label in izip_longest(args.diversity, args.x_label): + if x_label is None: + x_label = method.title() if method not in alpha.__all__: - sys.exit("ERROR: Diversity metric not found: " + method) + sys.exit("ERROR: Diversity metric not found: {}.".format(method)) + elif method in alpha.__all__ and method not in metrics: + sys.exit("Currently, PhyloToAST does not support {} metric.".format(method)) metric = eval("alpha."+method) div_calc, sample_ids = calc_diversity(metric, sample_map, biom_tbl, cat_vals, cat_idx) - plot_group_diversity(div_calc, colors, plot_title, method, - args.out_dir, args.image_type) + plot_group_diversity(div_calc, colors, plot_title, x_label, args.out_dir, + args.image_type) # calculate and print significance testing results if args.show_significance: - print "Diversity significance testing: {}".format(method.title()) + print "Diversity significance testing: {}".format(x_label) if len(cat_vals) == 2: print_MannWhitneyU(div_calc) elif len(cat_vals) > 2: