Skip to content

Commit

Permalink
Merge pull request #34 from pravs3683/stat_for_multiple_samples
Browse files Browse the repository at this point in the history
Stat for multiple samples
  • Loading branch information
jj-umn authored Sep 21, 2020
2 parents a09c920 + 44f4377 commit 28f471b
Show file tree
Hide file tree
Showing 12 changed files with 141 additions and 61 deletions.
3 changes: 2 additions & 1 deletion metaquantome/classes/SampleGroups.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ def __init__(self, sinfo):

# name of experimental groups
# sort alphabetically, so it's deterministic
self.grp_names = sorted(list(sample_names.keys()))
#self.grp_names = sorted(list(sample_names.keys()))
self.grp_names = list(sample_names.keys())

# when calculating means, column names for means
# same order as grp names
Expand Down
21 changes: 16 additions & 5 deletions metaquantome/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def cli():
min_pep_nsamp=args.min_pep_nsamp, outfile=args.outfile)
elif args.command == "stat":
stat(infile=args.file, sinfo=args.samps, paired=args.paired, parametric=args.parametric, ontology=args.ontology,
mode=args.mode, outfile=args.outfile)
mode=args.mode, outfile=args.outfile, control_group=args.control_group)
elif args.command == "viz":
run_viz(plottype=args.plottype,
img=args.img,
Expand All @@ -51,6 +51,7 @@ def cli():
textannot=args.textannot,
calculate_sep=args.calculate_sep,
fc_name=args.fc_name,
fc_corr_p=args.fc_corr_p,
flip_fc=args.flip_fc,
gosplit=args.gosplit,
sinfo=args.samps,
Expand All @@ -62,7 +63,9 @@ def cli():
target_onto=args.target_onto,
width=args.width,
height=args.height,
tabfile=args.tabfile)
tabfile=args.tabfile,
feature_cluster_size=args.feature_cluster_size,
sample_cluster_size=args.sample_cluster_size)
else:
ValueError('incorrect mode. please provide one of "db", "expand", "filter", "stat", or "viz".')
sys.exit(0)
Expand Down Expand Up @@ -214,7 +217,9 @@ def parse_args_cli():
'then a Wilcoxon test is performed.')
parser_stat.add_argument('--paired', action='store_true',
help='Perform paired tests.')

parser_stat.add_argument('--control_group', required=True,
help='Sample group name of control samples (will be used as denominator for fold change).')

# ---- METAQUANTOME VIZ ---- #
parser_viz.add_argument('--plottype', '-p', required=True, choices=['bar', 'volcano', 'heatmap', 'pca', 'ft_dist', 'stacked_bar'],
help="Select the type of plot to generate.")
Expand All @@ -226,11 +231,13 @@ def parse_args_cli():
help="Height of the image in inches. Defaults vary by plot type.")
parser_viz.add_argument('--infile', '-i', required=True,
help="Input file from stat or filter.")
parser_viz.add_argument('--strip',
parser_viz.add_argument('--strip', default=None,
help="Text to remove from column names for plotting.")
parser_viz.add_argument('--tabfile', default=None,
help="Optional. File to write plot table to.")

parser_viz.add_argument('--fc_corr_p', default=None,
help="Name of the corrected p-value column in the stat dataframe. Used while generating volcano plot and while using filter_to_sig in heatmap")

bar = parser_viz.add_argument_group('Arguments for barplots - including total taxonomy peptide intensity ("bar"), function-taxonomy ' +
'interaction distributions ("ft_dist"), and stacked taxonomy bar plots ("stacked_bar")')
bar.add_argument('--meancol',
Expand Down Expand Up @@ -274,6 +281,10 @@ def parse_args_cli():
help="Flag. Only plot significant terms? Necessitates use of results from `test`.")
heat.add_argument('--alpha', default='0.05',
help="If filter_to_sig, the q-value significance level.")
heat.add_argument('--feature_cluster_size', default='2',
help="Number of clusters 'k' to cut the feature dendrogram tree. Default = 2")
heat.add_argument('--sample_cluster_size', default='2',
help="Number of clusters 'k' to cut the sample dendrogram tree. Default = 2")

pca = parser_viz.add_argument_group('Principal Components Analysis')
pca.add_argument("--calculate_sep", action="store_true",
Expand Down
2 changes: 1 addition & 1 deletion metaquantome/data/test/cli_mult_test_out.tab
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
id description s1_mean s2_mean int1 int2 int3 int4 int5 int6 log2fc_s1_over_s2 p corrected_p
id description s1_mean s2_mean int1 int2 int3 int4 int5 int6 log2fc_s1_over_s2 p_s1_over_s2 corrected_p_s1_over_s2
C Energy production and conversion 3.9696263509564815 3.841302253980942 3.584962500721156 4.3219280948873635 3.906890595608519 3.584962500721156 4.392317422778762 3.3219280948873617 0.1283240969755397 0.6832561051460733 0.6832561051460733
D Cell cycle control, cell division, chromosome partitioning 10.013089999440444 3.5443205162238103 9.965784284662089 10.228818690495881 9.813781191217037 3.584962500721156 3.700439718141092 3.3219280948873617 6.468769483216634 2.70286875006428e-06 8.10860625019284e-06
N Cell motility 4.544320516223809 11.468284625191268 4.3219280948873635 4.906890595608519 4.3219280948873635 11.773139206719689 10.965784284662087 11.550746785383243 -6.923964108967459 3.349508567404191e-05 5.024262851106286e-05
2 changes: 1 addition & 1 deletion metaquantome/data/test/ec_ttest_tested.tab
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
id description s1_mean s2_mean int1 int2 int3 int4 int5 int6 int1_n_peptide int2_n_peptide int3_n_peptide int4_n_peptide int5_n_peptide int6_n_peptide int1_n_samp_children int2_n_samp_children int3_n_samp_children int4_n_samp_children int5_n_samp_children int6_n_samp_children log2fc_s1_over_s2 p corrected_p
id description s1_mean s2_mean int1 int2 int3 int4 int5 int6 int1_n_peptide int2_n_peptide int3_n_peptide int4_n_peptide int5_n_peptide int6_n_peptide int1_n_samp_children int2_n_samp_children int3_n_samp_children int4_n_samp_children int5_n_samp_children int6_n_samp_children log2fc_s1_over_s2 p_s1_over_s2 corrected_p_s1_over_s2
1.-.-.- Oxidoreductases. 10.013089999440444 3.5443205162238103 9.965784284662089 10.228818690495881 9.813781191217037 3.584962500721156 3.700439718141092 3.3219280948873617 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 6.468769483216634 2.70286875006428e-06 9.46004062522498e-06
1.2.-.- Acting on the aldehyde or oxo group of donors. 10.013089999440444 3.5443205162238103 9.965784284662089 10.228818690495881 9.813781191217037 3.584962500721156 3.700439718141092 3.3219280948873617 1.0 1.0 1.0 1.0 1.0 1.0 NA NA NA NA NA NA 6.468769483216634 2.70286875006428e-06 9.46004062522498e-06
3.-.-.- Hydrolases. 5.28540221886225 11.475564566327488 5.0 5.6438561897747235 5.1292830169449655 11.778077129535358 10.980853606379736 11.555547771647065 2.0 2.0 2.0 2.0 2.0 2.0 1.0 1.0 1.0 1.0 1.0 1.0 -6.190162347465239 4.742887028677606e-05 5.533368200123874e-05
Expand Down
2 changes: 1 addition & 1 deletion metaquantome/data/test/go_tested.tab
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
id name namespace s1_mean s2_mean int1 int2 int3 int4 int5 int6 int1_n_peptide int2_n_peptide int3_n_peptide int4_n_peptide int5_n_peptide int6_n_peptide int1_n_samp_children int2_n_samp_children int3_n_samp_children int4_n_samp_children int5_n_samp_children int6_n_samp_children log2fc_s1_over_s2 p corrected_p
id name namespace s1_mean s2_mean int1 int2 int3 int4 int5 int6 int1_n_peptide int2_n_peptide int3_n_peptide int4_n_peptide int5_n_peptide int6_n_peptide int1_n_samp_children int2_n_samp_children int3_n_samp_children int4_n_samp_children int5_n_samp_children int6_n_samp_children log2fc_s1_over_s2 p_s1_over_s2 corrected_p_s1_over_s2
GO:0000003 reproduction biological_process 10.013089999440444 3.5443205162238103 9.965784284662089 10.228818690495881 9.813781191217037 3.584962500721156 3.700439718141092 3.3219280948873617 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 6.468769483216634 2.70286875006428e-06 6.7571718751607e-06
GO:0008150 biological_process biological_process 10.066537719931583 11.4814630999144 10.011227255423254 10.287712379549447 9.868822554774999 11.782998208920414 10.9901039638575 11.560332834212444 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 -1.414925379982817 0.013580739592325305 0.01697592449040663
GO:0008152 metabolic process biological_process 3.9696263509564815 3.841302253980942 3.584962500721156 4.3219280948873635 3.906890595608519 3.584962500721156 4.392317422778762 3.3219280948873617 1.0 1.0 1.0 1.0 1.0 1.0 NA NA NA NA NA NA 0.1283240969755397 0.6832561051460733 0.6832561051460733
Expand Down
8 changes: 4 additions & 4 deletions metaquantome/modules/run_viz.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@

def run_viz(plottype, img, infile, strip=None,
mode=None, meancol=None, nterms='5', target_rank=None, barcol=6, # barplot, stacked_bar
textannot=None, fc_name=None, flip_fc=False, gosplit=False, # volcano
textannot=None, fc_name=None, fc_corr_p=None, flip_fc=False, gosplit=False, # volcano
sinfo=None, filter_to_sig=False, alpha='0.05', # heatmap
calculate_sep=False, # pca
whichway=None, name=None, id=None, target_onto=None, # ft_dist
width='5', height='5', tabfile=None):
width='5', height='5', tabfile=None, feature_cluster_size=2, sample_cluster_size=2):
"""
Wrapper script for the command-line R visualizations
The documentation for each of the arguments is in cli.py
Expand All @@ -24,12 +24,12 @@ def run_viz(plottype, img, infile, strip=None,
if plottype == "bar":
cmd += [mode, meancol, nterms, width, height, target_rank, target_onto, barcol, tabfile]
elif plottype == "volcano":
cmd += [str(textannot), fc_name, flip_fc, gosplit, width, height, tabfile]
cmd += [str(textannot), fc_name, fc_corr_p, flip_fc, gosplit, width, height, tabfile]
elif plottype == "heatmap":
samp_grps = SampleGroups(sinfo)
all_intcols_str = ','.join(samp_grps.all_intcols)
json_dump = json.dumps(samp_grps.sample_names)
cmd += [all_intcols_str, json_dump, filter_to_sig, alpha, width, height, strip]
cmd += [all_intcols_str, json_dump, filter_to_sig, alpha, width, height, strip, feature_cluster_size, sample_cluster_size, fc_corr_p]
elif plottype == "pca":
samp_grps = SampleGroups(sinfo)
all_intcols_str = ','.join(samp_grps.all_intcols)
Expand Down
47 changes: 40 additions & 7 deletions metaquantome/modules/stat.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from metaquantome.util.stat_io import read_expanded_table, write_stat


def stat(infile, sinfo, paired, parametric, ontology, mode, outfile):
def stat(infile, sinfo, paired, parametric, ontology, mode, outfile, control_group):
"""
Module function that tests differential expression between 2 experimental conditions
Expand All @@ -27,13 +27,44 @@ def stat(infile, sinfo, paired, parametric, ontology, mode, outfile):
# read in
df = read_expanded_table(infile, samp_grps)

if samp_grps.ngrps != 2:
ValueError('testing is only available for 2 experimental conditions.')
#if samp_grps.ngrps != 2:
# raise ValueError('testing is only available for 2 experimental conditions.')

###################
# Addition by Praveen to enable pairwise stat while inputting multiple sample groups
###################
ctrl_grp = control_group
if ctrl_grp not in samp_grps.grp_names:
raise ValueError('Control sample group incorrect/missing.')
# print(samp_grps.grp_names)

df_test = df.copy()

case_grps = samp_grps.grp_names
ctrl_grp = ctrl_grp
case_grps.remove(ctrl_grp)
fc_new_cols = []
for each_case in case_grps:
tmp_samp_json = '{"'+str(each_case.strip())+'":'+str(samp_grps.sample_names[each_case]).replace("'","\"")+', "'+ str(ctrl_grp.strip()) +'":'+str(samp_grps.sample_names[str(ctrl_grp.strip())]).replace("'","\"")+'}'
tmp_samp_grps = SampleGroups(tmp_samp_json)

# run test for each pair of groups
df_tmp = test_norm_intensity(df, tmp_samp_grps, paired, parametric)

df_test[tmp_samp_grps.fc_name] = df_tmp[tmp_samp_grps.fc_name].tolist()
df_test[P_COLNAME+"_"+tmp_samp_grps.fc_name.replace("log2fc_","")] = df_tmp[P_COLNAME+"_"+tmp_samp_grps.fc_name.replace("log2fc_","")].tolist()
df_test[P_CORR_COLNAME+"_"+tmp_samp_grps.fc_name.replace("log2fc_","")] = df_tmp[P_CORR_COLNAME+"_"+tmp_samp_grps.fc_name.replace("log2fc_","")].tolist()

fc_new_cols = fc_new_cols + [tmp_samp_grps.fc_name, P_COLNAME+"_"+tmp_samp_grps.fc_name.replace("log2fc_",""), P_CORR_COLNAME+"_"+tmp_samp_grps.fc_name.replace("log2fc_","")]
###################

# run test
df_test = test_norm_intensity(df, samp_grps, paired, parametric)
#df_test = test_norm_intensity(df, samp_grps, paired, parametric)

# write out
if outfile:
write_stat(df_test, samp_grps=samp_grps, ontology=ontology, mode=mode, outfile=outfile)
#write_stat(df_test, samp_grps=samp_grps, ontology=ontology, mode=mode, outfile=outfile)
write_stat(df_tmp, samp_grps=samp_grps, ontology=ontology, mode=mode, outfile=outfile, fc_new_cols=fc_new_cols)
# return
return df_test

Expand Down Expand Up @@ -80,10 +111,12 @@ def test_norm_intensity(df, samp_grps, paired, parametric):
df_means = log2_fold_change(df, samp_grps)

# p values, uncorrected for multiple comparisons
df_means[P_COLNAME] = test_results
#df_means[P_COLNAME] = test_results
df_means[P_COLNAME+"_"+samp_grps.fc_name.replace("log2fc_","")] = test_results

# fdr correction
df_means[P_CORR_COLNAME] = mc.fdrcorrection0(test_results, method='indep')[1]
#df_means[P_CORR_COLNAME] = mc.fdrcorrection0(test_results, method='indep')[1]
df_means[P_CORR_COLNAME+"_"+samp_grps.fc_name.replace("log2fc_","")] = mc.fdrcorrection0(test_results, method='indep')[1]

# reset the index to be 'id' - this is mostly for testing, and doesn't affect the output file
df_means.set_index('id', drop=False, inplace=True)
Expand Down
Loading

0 comments on commit 28f471b

Please sign in to comment.