forked from HimesGroup/taffeta
-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathrnaseq_gene_subset_de_report.py
87 lines (76 loc) · 4.36 KB
/
rnaseq_gene_subset_de_report.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/usr/bin/python
import sys
import subprocess
import os
from rnaseq_align_and_qc import *
from rnaseq_de import *
def make_set_rnw_html(rnw_template, project_name, path_start, gtf, ref_genome, gene_list, gene_set_name):
"""
Creates Rnw report. The top of report is below and the rest concatenated from a separate text document (rnw_template).
Runs Sweave to create html document with R output (R2HTML library necessary for this)
"""
out_dir = path_start+project_name+"/"+project_name+"_"+gene_set_name+"_DE_Report/"
if not os.path.exists(out_dir):
os.makedirs(out_dir)
outp = open(out_dir+project_name+"_"+gene_set_name+"_DE_RnaSeqReport.Rnw", "w")
outp.write("<HTML>\n")
outp.write("<Head>\n")
outp.write("<Title>\n")
outp.write("RNA-seq Differential Expression Report for "+gene_set_name+" Genes of "+project_name+"\n")
outp.write("</Title>\n")
outp.write("</Head>\n")
outp.write("<Body>\n\n")
outp.write("<<input.data,echo=FALSE,results=hide>>=\n")
outp.write("library(cummeRbund)\n")
outp.write("curr.batch=\""+project_name+"\"\n")
outp.write("path.start=\""+path_start+project_name+"\"\n")
outp.write("gene.set.name=\""+gene_set_name+"\"\n")
outp.write("gene.set=c("+str(gene_list)[1:-1]+")\n")
#cummerbund database can includes a reference genome and gtf file although these options are not necessary for report
outp.write("cuff_data=readCufflinks(dir=path.start, genome = \""+ref_genome+"\", gtfFile = \""+gtf+"\", rebuild=FALSE) \n")
outp.write("conditions=samples(genes(cuff_data)) \n")
#reports currently focus on selecting top DE results based on genes and plots are made for genes and isoforms.
#it is possible to make reports that include TSS and CDS results, but these are not routinely given to customers
outp.write("features=c(\"genes\") \n")
outp.write("all.features=c(\"genes\", \"isoforms\") \n")
#outp.write("all_features=c(\"genes\", \"isoforms\", \"TSS\", \"CDS\") \n")
outp.write("ref.genome=\""+ref_genome+"\"\n")
outp.write("@\n")
outp.write(rnw_template)
outp.close()
subprocess.call("cd "+out_dir+"; echo \"library(R2HTML); Sweave('"+project_name+"_"+gene_set_name+"_DE_RnaSeqReport.Rnw', driver=RweaveHTML)\" | R --no-save --no-restore", shell=True)
def main(project_name, sample_info_file, path_start, gene_set_name, gene_list_file):
if path_start == "./":
path_start = os.getcwd()
if path_start[-1] != "/":
path_start = path_start+"/"
#Get list of sample info. Fields: [customer_id, gigpad_id, lane, index, ercc_mix, top_dir, batch, label, ref_genome, library_type]
runs = get_sample_info(sample_info_file)
ref_genome_list = map(lambda x: x[8], runs)
#Check whether all samples are of same reference genome
if False in map(lambda y: y==ref_genome_list[0], ref_genome_list):
print "Make sure all samples in project are of the same reference genome"
sys.exit()
else:
ref_genome = ref_genome_list[0]
ref_index, fa, gtf, ref, ERCC_gtf = get_genome_ref_files(ref_genome)
gene_list_o = open(gene_list_file, 'r')
gene_list = gene_list_o.read().split('\n')
if '' in gene_list:
gene_list.remove('')
#Create the report
if not os.path.exists("rnaseq_gene_subset_de_report_Rnw_template.txt"):
print "Cannot find rnaseq_gene_subset_de_report_Rnw_template.txt"
sys.exit()
rnw_in = open("rnaseq_gene_subset_de_report_Rnw_template.txt", "r")
rnw_template = rnw_in.read()
make_set_rnw_html(rnw_template, project_name, path_start, gtf, ref_genome, gene_list, gene_set_name)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Create HTML report of differential expression results for RNA-seq samples associated with a project.")
parser.add_argument("--path_start", default="./", type=str, help="Directory path to where project directory created by rnaseq_de.py is located (default=./)")
parser.add_argument("project_name", type=str, help="Name of project that all samples correspond to. Often a PCPGM batch, but it could correspond to a mixture of batches.")
parser.add_argument("samples_in", help="A tab-delimited txt file containing sample information. See example file: sample_info_file.txt")
parser.add_argument("gene_set_name", type=str, help="Name of gene set.")
parser.add_argument("gene_list_file", help="A text file containing a single official gene_ID per line, corresponding to the gene set of interest.")
args = parser.parse_args()
main(args.project_name, args.samples_in, args.path_start, args.gene_set_name, args.gene_list_file)