forked from HimesGroup/taffeta
-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathmake_igv_script_file.py
148 lines (133 loc) · 7.36 KB
/
make_igv_script_file.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#!/usr/bin/python
import sys
import subprocess
import os
import math
import string
from rnaseq_align_and_qc import *
def get_sorted_bam_files(runs, path_start):
"""
Gets a list of all sorted bam files from the original list of samples derived from get_sample_info()
Used to create file list for igv script
"""
sample_bam_list = []
for k in runs:
curr_sample, gigpad, lane, index, ercc_mix, top_dir, batch, label, ref_genome, library_type = k
sample_bam_list.append(path_start+batch+"/"+curr_sample+"/tophat_out/"+curr_sample+"_accepted_hits.sorted.bam")
return ",".join(sample_bam_list)
def get_snapshot_by_coordinates(runs, path_start, out_dir, transcript_name, transcript_position, ref_genome, gtf):
"""
For a given transcript identified by name and coordinates, load in a set of sorted bams files (from list of runs of loaded sample_info_file) and take PDF snapshots of raw reads
Tried several iterations of this as a loop, and the only one that didn't make IGV crash was to go to position first and then reload the bam files each time
Max number of samples tried in one window is ~95. Many more samples than this would likely not render properly.
"""
curr_chr = transcript_position.split(":")[0]
pos_range = map(lambda x: float(x), transcript_position.split(":")[1].split("-"))
delta = pos_range[1]-pos_range[0]
if "," in transcript_name:
transcript_name = transcript_name.replace(",", "_")
#Choose correct ref_genome name for igv
if ref_genome == "hg19":
igv_ref_genome = "hg19"
elif ref_genome == "mm10":
igv_ref_genome = "mm10"
elif ref_genome == "Zv9":
igv_ref_genome = "danRer7"
#Included a loop to make several plots when span of "pos" is greater than 25kb. Otherwise, nothing gets plotted and IGV "Zoom-In" notice appears. Likely needs adjustment depending on number of samples loaded.
max_span = 25000.
if delta < max_span:
l = "new\ngenome "+igv_ref_genome+" \ngoto "+transcript_position+"\nload "+gtf+","+get_sorted_bam_files(runs, path_start)+"\nsnapshotDirectory "+out_dir+" \nexpand \nsnapshot "+transcript_name+".png \n"
else:
i = int(math.ceil(delta/max_span))
l = ""
for j in range(i):
curr_pos = curr_chr+":"+str(int(pos_range[0]+j*max_span))+"-"+str(int(pos_range[0]+(j+1)*max_span))
l = l+"new\ngenome "+igv_ref_genome+" \ngoto "+curr_pos+"\nload "+gtf+","+get_sorted_bam_files(runs, path_start)+"\nsnapshotDirectory "+out_dir+" \nexpand \nsnapshot "+transcript_name+"_"+str(j)+".png \n"
return l
def get_sig_genes(sig_gene_file, ref_genome):
f = open(sig_gene_file, "r")
c = f.read().split('\n')
if '' in c:
c.remove('')
d = map(lambda x: [x.split(" ")[1], x.split(" ")[2]], c)
if ref_genome == "hg19":
housekeeping_genes = [["Housekeeping_ACTB", "chr7:5566776-5603415"], \
["Housekeeping_RPL19", "chr17:37356350-37360980"], \
["Housekeeping_GAPDH", "chr12:6643092-6647539"], \
["Housekeeping_GABARAP", "chr17:7143332-7145772"], \
["Housekeeping_B2M", "chr15:45003674-45010359"]]
elif ref_genome == "mm10":
housekeeping_genes = [["Housekeeping_Actb", "chr5:143662795-143670403"], \
["Housekeeping_Rpl19", "chr11:97886024-97893807"], \
["Housekeeping_Gapdh", "chr6:125109870-125117601"], \
["Housekeeping_Gabarap", "chr11:69802872-69810451"], \
["Housekeeping_B2m", "chr2:121971423-121980818"]]
elif ref_genome == "Zv9":
housekeeping_genes = [["Housekeeping_bactin1", "chr1:7723188-7730705"], \
["Housekeeping_rpl19", "chr3:15847924-15854420"], \
["Housekeeping_gapdh", "chr16:19252371-19261301"], \
["Housekeeping_gabarapa", "chr5:25726588-25732854"], \
["Housekeeping_b2m", "chr4:11855920-11862017"]]
d = d+housekeeping_genes
return d
def get_genes(sig_gene_file):
f = open(sig_gene_file, "r")
c = f.read().split('\n')
if '' in c:
c.remove('')
d = map(lambda x: [x.split("\t")[0], x.split("\t")[1]], c)
return d
def main(project_name, sample_info_file, path_start, transcript_file, top_genes):
"""
Loads file containing significantly differentially expressed genes generated by rnaseq_de_report.py and
Creates a text file of igv commands that can be run within IGV and by selecting File->Run Batch Script
Generates PDF snapshots of raw reads for all top differentially expressed genes
"""
if path_start == "./":
path_start = os.getcwd()
if path_start[-1] != "/":
path_start = path_start+"/"
out_dir = path_start+project_name+"/"+project_name+"_DE_Report/IGV_Plots/"
if not os.path.exists(out_dir):
os.makedirs(out_dir)
#Get sample info. Fields: [customer_id, gigpad_id, lane, index, ercc_mix, top_dir, batch, label, ref_genome, library_type]
runs = get_sample_info(sample_info_file)
ref_genome_list = map(lambda x: x[8], runs)
#Check whether all samples are of same reference genome
if False in map(lambda y: y==ref_genome_list[0], ref_genome_list):
print "Make sure all samples in project are of the same reference genome"
sys.exit()
else:
ref_genome = ref_genome_list[0]
ref_index, fa, gtf, ref, ERCC_gtf = get_genome_ref_files(ref_genome)
if os.path.exists(transcript_file):
top_genes = get_genes(transcript_file)
outp = open(out_dir+"igv_snapshots_of_transcripts_script.txt", "w")
outp.write("".join(map(lambda x: get_snapshot_by_coordinates(runs, path_start, out_dir, x[0], x[1], ref_genome, gtf), top_genes)))
outp.close()
print "Created file "+out_dir+"igv_snapshots_of_transcripts_script.txt\nOpen IGV and run using File->Run Batch Script"
else:
if transcript_file != "":
print "Cannot find "+transcript_file
if top_genes == "yes":
sig_gene_file = path_start+project_name+"/"+project_name+"_DE_Report/"+project_name+"_Significant_DE_gene_list.txt"
if not os.path.exists(sig_gene_file):
print "Cannot find "+sig_gene_file
print "Has DE report been created for "+project_name+"?"
sys.exit()
top_genes = get_sig_genes(sig_gene_file, ref_genome)
outp = open(out_dir+"igv_snapshots_of_top_genes_script.txt", "w")
outp.write("".join(map(lambda x: get_snapshot_by_coordinates(runs, path_start, out_dir, x[0], x[1], ref_genome, gtf), top_genes)))
outp.close()
print "Created file "+out_dir+"igv_snapshots_of_top_genes_script.txt\nOpen IGV and run using File->Run Batch Script"
else:
print "Did not attempt to create igv script file for top differentially expressed genes."
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Creates a text file with IGV commands to take PDF snapshots of raw reads for top differentially expressed genes for RNA-seq samples associated with a project.")
parser.add_argument("--path_start", default="./", type=str, help="Directory path to where project directory created by rnaseq_de.py is located (default=./)")
parser.add_argument("--transcripts", default="", type=str, help="A tab-delimited txt file whose first column contains transcript names and second column contains genome coordinates in format chr7:5566776-5603415.")
parser.add_argument("--top_genes", default="yes", type=str, help="Create a script file corresponding to top differentially expressed genes created by rnaseq_de.py? (options: 'yes', 'no'; default: 'yes')")
parser.add_argument("project_name", type=str, help="Name of project that all samples correspond to. Often a PCPGM batch, but it could correspond to a mixture of batches.")
parser.add_argument("samples_in", help="A tab-delimited txt file containing sample information. See example file: sample_info_file.txt")
args = parser.parse_args()
main(args.project_name, args.samples_in, args.path_start, args.transcripts, args.top_genes)