Merge pull request #34 from pravs3683/stat_for_multiple_samples

Stat for multiple samples
galaxyproteomics · Sep 21, 2020 · 28f471b · 28f471b
2 parents a09c920 + 44f4377
commit 28f471b
Show file tree

Hide file tree

Showing 12 changed files with 141 additions and 61 deletions.
diff --git a/metaquantome/classes/SampleGroups.py b/metaquantome/classes/SampleGroups.py
@@ -32,7 +32,8 @@ def __init__(self, sinfo):
 
         # name of experimental groups
         # sort alphabetically, so it's deterministic
-        self.grp_names = sorted(list(sample_names.keys()))
+        #self.grp_names = sorted(list(sample_names.keys()))
+        self.grp_names = list(sample_names.keys())
 
         # when calculating means, column names for means
         # same order as grp names

diff --git a/metaquantome/cli.py b/metaquantome/cli.py
@@ -37,7 +37,7 @@ def cli():
                    min_pep_nsamp=args.min_pep_nsamp, outfile=args.outfile)
     elif args.command == "stat":
         stat(infile=args.file, sinfo=args.samps, paired=args.paired, parametric=args.parametric, ontology=args.ontology,
-             mode=args.mode, outfile=args.outfile)
+             mode=args.mode, outfile=args.outfile, control_group=args.control_group)
     elif args.command == "viz":
         run_viz(plottype=args.plottype,
                 img=args.img,
@@ -51,6 +51,7 @@ def cli():
                 textannot=args.textannot,
                 calculate_sep=args.calculate_sep,
                 fc_name=args.fc_name,
+                fc_corr_p=args.fc_corr_p,
                 flip_fc=args.flip_fc,
                 gosplit=args.gosplit,
                 sinfo=args.samps,
@@ -62,7 +63,9 @@ def cli():
                 target_onto=args.target_onto,
                 width=args.width,
                 height=args.height,
-                tabfile=args.tabfile)
+                tabfile=args.tabfile,
+                feature_cluster_size=args.feature_cluster_size,
+                sample_cluster_size=args.sample_cluster_size)
     else:
         ValueError('incorrect mode. please provide one of "db", "expand", "filter", "stat", or "viz".')
     sys.exit(0)
@@ -214,7 +217,9 @@ def parse_args_cli():
                                   'then a Wilcoxon test is performed.')
     parser_stat.add_argument('--paired', action='store_true',
                              help='Perform paired tests.')
-
+    parser_stat.add_argument('--control_group', required=True,
+                             help='Sample group name of control samples (will be used as denominator for fold change).')
+
     # ---- METAQUANTOME VIZ ---- #
     parser_viz.add_argument('--plottype', '-p', required=True, choices=['bar', 'volcano', 'heatmap', 'pca', 'ft_dist', 'stacked_bar'],
                             help="Select the type of plot to generate.")
@@ -226,11 +231,13 @@ def parse_args_cli():
                             help="Height of the image in inches. Defaults vary by plot type.")
     parser_viz.add_argument('--infile', '-i', required=True,
                             help="Input file from stat or filter.")
-    parser_viz.add_argument('--strip',
+    parser_viz.add_argument('--strip', default=None,
                             help="Text to remove from column names for plotting.")
     parser_viz.add_argument('--tabfile', default=None,
                             help="Optional. File to write plot table to.")
-
+    parser_viz.add_argument('--fc_corr_p', default=None,
+                            help="Name of the corrected p-value column in the stat dataframe. Used while generating volcano plot and while using filter_to_sig in heatmap")
+
     bar = parser_viz.add_argument_group('Arguments for barplots - including total taxonomy peptide intensity ("bar"), function-taxonomy ' +
                                         'interaction distributions ("ft_dist"), and stacked taxonomy bar plots ("stacked_bar")')
     bar.add_argument('--meancol',
@@ -274,6 +281,10 @@ def parse_args_cli():
                       help="Flag. Only plot significant terms? Necessitates use of results from `test`.")
     heat.add_argument('--alpha', default='0.05',
                       help="If filter_to_sig, the q-value significance level.")
+    heat.add_argument('--feature_cluster_size', default='2',
+                      help="Number of clusters 'k' to cut the feature dendrogram tree. Default = 2")
+    heat.add_argument('--sample_cluster_size', default='2',
+                      help="Number of clusters 'k' to cut the sample dendrogram tree. Default = 2")
 
     pca = parser_viz.add_argument_group('Principal Components Analysis')
     pca.add_argument("--calculate_sep", action="store_true",

diff --git a/metaquantome/data/test/cli_mult_test_out.tab b/metaquantome/data/test/cli_mult_test_out.tab
@@ -1,4 +1,4 @@
-id	description	s1_mean	s2_mean	int1	int2	int3	int4	int5	int6	log2fc_s1_over_s2	p	corrected_p
+id	description	s1_mean	s2_mean	int1	int2	int3	int4	int5	int6	log2fc_s1_over_s2	p_s1_over_s2	corrected_p_s1_over_s2
 C	Energy production and conversion	3.9696263509564815	3.841302253980942	3.584962500721156	4.3219280948873635	3.906890595608519	3.584962500721156	4.392317422778762	3.3219280948873617	0.1283240969755397	0.6832561051460733	0.6832561051460733
 D	Cell cycle control, cell division, chromosome partitioning	10.013089999440444	3.5443205162238103	9.965784284662089	10.228818690495881	9.813781191217037	3.584962500721156	3.700439718141092	3.3219280948873617	6.468769483216634	2.70286875006428e-06	8.10860625019284e-06
 N	Cell motility	4.544320516223809	11.468284625191268	4.3219280948873635	4.906890595608519	4.3219280948873635	11.773139206719689	10.965784284662087	11.550746785383243	-6.923964108967459	3.349508567404191e-05	5.024262851106286e-05
diff --git a/metaquantome/data/test/ec_ttest_tested.tab b/metaquantome/data/test/ec_ttest_tested.tab
@@ -1,4 +1,4 @@
-id	description	s1_mean	s2_mean	int1	int2	int3	int4	int5	int6	int1_n_peptide	int2_n_peptide	int3_n_peptide	int4_n_peptide	int5_n_peptide	int6_n_peptide	int1_n_samp_children	int2_n_samp_children	int3_n_samp_children	int4_n_samp_children	int5_n_samp_children	int6_n_samp_children	log2fc_s1_over_s2	p	corrected_p
+id	description	s1_mean	s2_mean	int1	int2	int3	int4	int5	int6	int1_n_peptide	int2_n_peptide	int3_n_peptide	int4_n_peptide	int5_n_peptide	int6_n_peptide	int1_n_samp_children	int2_n_samp_children	int3_n_samp_children	int4_n_samp_children	int5_n_samp_children	int6_n_samp_children	log2fc_s1_over_s2	p_s1_over_s2	corrected_p_s1_over_s2
 1.-.-.-	Oxidoreductases.	10.013089999440444	3.5443205162238103	9.965784284662089	10.228818690495881	9.813781191217037	3.584962500721156	3.700439718141092	3.3219280948873617	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	6.468769483216634	2.70286875006428e-06	9.46004062522498e-06
 1.2.-.-	Acting on the aldehyde or oxo group of donors.	10.013089999440444	3.5443205162238103	9.965784284662089	10.228818690495881	9.813781191217037	3.584962500721156	3.700439718141092	3.3219280948873617	1.0	1.0	1.0	1.0	1.0	1.0	NA	NA	NA	NA	NA	NA	6.468769483216634	2.70286875006428e-06	9.46004062522498e-06
 3.-.-.-	Hydrolases.	5.28540221886225	11.475564566327488	5.0	5.6438561897747235	5.1292830169449655	11.778077129535358	10.980853606379736	11.555547771647065	2.0	2.0	2.0	2.0	2.0	2.0	1.0	1.0	1.0	1.0	1.0	1.0	-6.190162347465239	4.742887028677606e-05	5.533368200123874e-05

diff --git a/metaquantome/data/test/go_tested.tab b/metaquantome/data/test/go_tested.tab
@@ -1,4 +1,4 @@
-id	name	namespace	s1_mean	s2_mean	int1	int2	int3	int4	int5	int6	int1_n_peptide	int2_n_peptide	int3_n_peptide	int4_n_peptide	int5_n_peptide	int6_n_peptide	int1_n_samp_children	int2_n_samp_children	int3_n_samp_children	int4_n_samp_children	int5_n_samp_children	int6_n_samp_children	log2fc_s1_over_s2	p	corrected_p
+id	name	namespace	s1_mean	s2_mean	int1	int2	int3	int4	int5	int6	int1_n_peptide	int2_n_peptide	int3_n_peptide	int4_n_peptide	int5_n_peptide	int6_n_peptide	int1_n_samp_children	int2_n_samp_children	int3_n_samp_children	int4_n_samp_children	int5_n_samp_children	int6_n_samp_children	log2fc_s1_over_s2	p_s1_over_s2	corrected_p_s1_over_s2
 GO:0000003	reproduction	biological_process	10.013089999440444	3.5443205162238103	9.965784284662089	10.228818690495881	9.813781191217037	3.584962500721156	3.700439718141092	3.3219280948873617	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	6.468769483216634	2.70286875006428e-06	6.7571718751607e-06
 GO:0008150	biological_process	biological_process	10.066537719931583	11.4814630999144	10.011227255423254	10.287712379549447	9.868822554774999	11.782998208920414	10.9901039638575	11.560332834212444	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	-1.414925379982817	0.013580739592325305	0.01697592449040663
 GO:0008152	metabolic process	biological_process	3.9696263509564815	3.841302253980942	3.584962500721156	4.3219280948873635	3.906890595608519	3.584962500721156	4.392317422778762	3.3219280948873617	1.0	1.0	1.0	1.0	1.0	1.0	NA	NA	NA	NA	NA	NA	0.1283240969755397	0.6832561051460733	0.6832561051460733

diff --git a/metaquantome/modules/run_viz.py b/metaquantome/modules/run_viz.py
@@ -8,11 +8,11 @@
 
 def run_viz(plottype, img, infile, strip=None,
             mode=None, meancol=None, nterms='5', target_rank=None, barcol=6,  # barplot, stacked_bar
-            textannot=None, fc_name=None, flip_fc=False, gosplit=False,  # volcano
+            textannot=None, fc_name=None, fc_corr_p=None, flip_fc=False, gosplit=False,  # volcano
             sinfo=None, filter_to_sig=False, alpha='0.05',  # heatmap
             calculate_sep=False,  # pca
             whichway=None, name=None, id=None, target_onto=None, # ft_dist
-            width='5', height='5', tabfile=None):
+            width='5', height='5', tabfile=None, feature_cluster_size=2, sample_cluster_size=2):
     """
     Wrapper script for the command-line R visualizations
     The documentation for each of the arguments is in cli.py
@@ -24,12 +24,12 @@ def run_viz(plottype, img, infile, strip=None,
     if plottype == "bar":
         cmd += [mode, meancol, nterms, width, height, target_rank, target_onto, barcol, tabfile]
     elif plottype == "volcano":
-        cmd += [str(textannot), fc_name, flip_fc, gosplit, width, height, tabfile]
+        cmd += [str(textannot), fc_name, fc_corr_p, flip_fc, gosplit, width, height, tabfile]
     elif plottype == "heatmap":
         samp_grps = SampleGroups(sinfo)
         all_intcols_str = ','.join(samp_grps.all_intcols)
         json_dump = json.dumps(samp_grps.sample_names)
-        cmd += [all_intcols_str, json_dump, filter_to_sig, alpha, width, height, strip]
+        cmd += [all_intcols_str, json_dump, filter_to_sig, alpha, width, height, strip, feature_cluster_size, sample_cluster_size, fc_corr_p]
     elif plottype == "pca":
         samp_grps = SampleGroups(sinfo)
         all_intcols_str = ','.join(samp_grps.all_intcols)

diff --git a/metaquantome/modules/stat.py b/metaquantome/modules/stat.py
@@ -7,7 +7,7 @@
 from metaquantome.util.stat_io import read_expanded_table, write_stat
 
 
-def stat(infile, sinfo, paired, parametric, ontology, mode, outfile):
+def stat(infile, sinfo, paired, parametric, ontology, mode, outfile, control_group):
     """
     Module function that tests differential expression between 2 experimental conditions
 
@@ -27,13 +27,44 @@ def stat(infile, sinfo, paired, parametric, ontology, mode, outfile):
     # read in
     df = read_expanded_table(infile, samp_grps)
 
-    if samp_grps.ngrps != 2:
-        ValueError('testing is only available for 2 experimental conditions.')
+    #if samp_grps.ngrps != 2:
+    #    raise ValueError('testing is only available for 2 experimental conditions.')
+
+    ###################
+    # Addition by Praveen to enable pairwise stat while inputting multiple sample groups
+    ###################
+    ctrl_grp = control_group
+    if ctrl_grp not in samp_grps.grp_names:
+        raise ValueError('Control sample group incorrect/missing.')
+    #    print(samp_grps.grp_names)
+
+    df_test = df.copy()
+
+    case_grps = samp_grps.grp_names
+    ctrl_grp = ctrl_grp
+    case_grps.remove(ctrl_grp)
+    fc_new_cols = []
+    for each_case in case_grps:
+        tmp_samp_json = '{"'+str(each_case.strip())+'":'+str(samp_grps.sample_names[each_case]).replace("'","\"")+', "'+ str(ctrl_grp.strip()) +'":'+str(samp_grps.sample_names[str(ctrl_grp.strip())]).replace("'","\"")+'}'
+        tmp_samp_grps = SampleGroups(tmp_samp_json)
+
+        # run test for each pair of groups
+        df_tmp = test_norm_intensity(df, tmp_samp_grps, paired, parametric)
+
+        df_test[tmp_samp_grps.fc_name] = df_tmp[tmp_samp_grps.fc_name].tolist()
+        df_test[P_COLNAME+"_"+tmp_samp_grps.fc_name.replace("log2fc_","")] = df_tmp[P_COLNAME+"_"+tmp_samp_grps.fc_name.replace("log2fc_","")].tolist()
+        df_test[P_CORR_COLNAME+"_"+tmp_samp_grps.fc_name.replace("log2fc_","")] = df_tmp[P_CORR_COLNAME+"_"+tmp_samp_grps.fc_name.replace("log2fc_","")].tolist()
+
+        fc_new_cols = fc_new_cols + [tmp_samp_grps.fc_name, P_COLNAME+"_"+tmp_samp_grps.fc_name.replace("log2fc_",""), P_CORR_COLNAME+"_"+tmp_samp_grps.fc_name.replace("log2fc_","")]
+    ###################
+
     # run test
-    df_test = test_norm_intensity(df, samp_grps, paired, parametric)
+    #df_test = test_norm_intensity(df, samp_grps, paired, parametric)
+
     # write out
     if outfile:
-        write_stat(df_test, samp_grps=samp_grps, ontology=ontology, mode=mode, outfile=outfile)
+        #write_stat(df_test, samp_grps=samp_grps, ontology=ontology, mode=mode, outfile=outfile)
+        write_stat(df_tmp, samp_grps=samp_grps, ontology=ontology, mode=mode, outfile=outfile, fc_new_cols=fc_new_cols)
     # return
     return df_test
 
@@ -80,10 +111,12 @@ def test_norm_intensity(df, samp_grps, paired, parametric):
     df_means = log2_fold_change(df, samp_grps)
 
     # p values, uncorrected for multiple comparisons
-    df_means[P_COLNAME] = test_results
+    #df_means[P_COLNAME] = test_results
+    df_means[P_COLNAME+"_"+samp_grps.fc_name.replace("log2fc_","")] = test_results
 
     # fdr correction
-    df_means[P_CORR_COLNAME] = mc.fdrcorrection0(test_results, method='indep')[1]
+    #df_means[P_CORR_COLNAME] = mc.fdrcorrection0(test_results, method='indep')[1]
+    df_means[P_CORR_COLNAME+"_"+samp_grps.fc_name.replace("log2fc_","")] = mc.fdrcorrection0(test_results, method='indep')[1]
 
     # reset the index to be 'id' - this is mostly for testing, and doesn't affect the output file
     df_means.set_index('id', drop=False, inplace=True)