make_igv_script_file.py

#!/usr/bin/python
import sys
import subprocess
import os
import math
import string
from rnaseq_align_and_qc import *


def get_sorted_bam_files(runs, path_start):
	"""
	Gets a list of all sorted bam files from the original list of samples derived from get_sample_info()
	Used to create file list for igv script
	"""
	sample_bam_list = []
	for k in runs:
		curr_sample, gigpad, lane, index, ercc_mix, top_dir, batch, label, ref_genome, library_type = k
		sample_bam_list.append(path_start+batch+"/"+curr_sample+"/tophat_out/"+curr_sample+"_accepted_hits.sorted.bam")
	return ",".join(sample_bam_list)


def get_snapshot_by_coordinates(runs, path_start, out_dir, transcript_name, transcript_position, ref_genome, gtf):
	"""
	For a given transcript identified by name and coordinates, load in a set of sorted bams files (from list of runs of loaded sample_info_file) and take PDF snapshots of raw reads
	Tried several iterations of this as a loop, and the only one that didn't make IGV crash was to go to position first and then reload the bam files each time
	Max number of samples tried in one window is ~95. Many more samples than this would likely not render properly.
	"""
	curr_chr = transcript_position.split(":")[0]
	pos_range = map(lambda x: float(x), transcript_position.split(":")[1].split("-"))
	delta = pos_range[1]-pos_range[0]
	if "," in transcript_name:
		transcript_name = transcript_name.replace(",", "_")
	#Choose correct ref_genome name for igv
	if ref_genome == "hg19":
		igv_ref_genome = "hg19"
	elif ref_genome == "mm10":
		igv_ref_genome = "mm10"
	elif ref_genome == "Zv9":
		igv_ref_genome = "danRer7"	
	#Included a loop to make several plots when span of "pos" is greater than 25kb. Otherwise, nothing gets plotted and IGV "Zoom-In" notice appears. Likely needs adjustment depending on number of samples loaded.
	max_span = 25000.
	if delta < max_span:
		l = "new\ngenome "+igv_ref_genome+" \ngoto "+transcript_position+"\nload "+gtf+","+get_sorted_bam_files(runs, path_start)+"\nsnapshotDirectory "+out_dir+" \nexpand \nsnapshot "+transcript_name+".png \n"
	else:
		i = int(math.ceil(delta/max_span))
		l = ""
		for j in range(i):
			curr_pos = curr_chr+":"+str(int(pos_range[0]+j*max_span))+"-"+str(int(pos_range[0]+(j+1)*max_span))
			l = l+"new\ngenome "+igv_ref_genome+" \ngoto "+curr_pos+"\nload "+gtf+","+get_sorted_bam_files(runs, path_start)+"\nsnapshotDirectory "+out_dir+" \nexpand \nsnapshot "+transcript_name+"_"+str(j)+".png \n"
	return l


def get_sig_genes(sig_gene_file, ref_genome):
	f = open(sig_gene_file, "r")
	c = f.read().split('\n')
	if '' in c:
		c.remove('')
	d = map(lambda x: [x.split(" ")[1], x.split(" ")[2]], c)
	if ref_genome == "hg19":
		housekeeping_genes = [["Housekeeping_ACTB", "chr7:5566776-5603415"], \
		["Housekeeping_RPL19", "chr17:37356350-37360980"], \
		["Housekeeping_GAPDH", "chr12:6643092-6647539"], \
		["Housekeeping_GABARAP", "chr17:7143332-7145772"], \
		["Housekeeping_B2M", "chr15:45003674-45010359"]]
	elif ref_genome == "mm10":
		housekeeping_genes = [["Housekeeping_Actb", "chr5:143662795-143670403"], \
		["Housekeeping_Rpl19", "chr11:97886024-97893807"], \
		["Housekeeping_Gapdh", "chr6:125109870-125117601"], \
		["Housekeeping_Gabarap", "chr11:69802872-69810451"], \
		["Housekeeping_B2m", "chr2:121971423-121980818"]]
	elif ref_genome == "Zv9":
		housekeeping_genes = [["Housekeeping_bactin1", "chr1:7723188-7730705"], \
		["Housekeeping_rpl19", "chr3:15847924-15854420"], \
		["Housekeeping_gapdh", "chr16:19252371-19261301"], \
		["Housekeeping_gabarapa", "chr5:25726588-25732854"], \
		["Housekeeping_b2m", "chr4:11855920-11862017"]]
	d = d+housekeeping_genes
	return d
	
def get_genes(sig_gene_file):
	f = open(sig_gene_file, "r")
	c = f.read().split('\n')
	if '' in c:
		c.remove('')
	d = map(lambda x: [x.split("\t")[0], x.split("\t")[1]], c)
	return d

def main(project_name, sample_info_file, path_start, transcript_file, top_genes):
	"""
	Loads file containing significantly differentially expressed genes generated by rnaseq_de_report.py and
	Creates a text file of igv commands that can be run within IGV and by selecting File->Run Batch Script
	Generates PDF snapshots of raw reads for all top differentially expressed genes
	"""
	if path_start == "./":
		path_start = os.getcwd()
	if path_start[-1] != "/":
		path_start = path_start+"/"
	out_dir = path_start+project_name+"/"+project_name+"_DE_Report/IGV_Plots/"
	if not os.path.exists(out_dir):
		os.makedirs(out_dir)

	#Get sample info. Fields: [customer_id, gigpad_id, lane, index, ercc_mix, top_dir, batch, label, ref_genome, library_type]
	runs = get_sample_info(sample_info_file)
	ref_genome_list = map(lambda x: x[8], runs)
	#Check whether all samples are of same reference genome
	if False in map(lambda y: y==ref_genome_list[0], ref_genome_list):
		print "Make sure all samples in project are of the same reference genome"
		sys.exit()
	else:
		ref_genome = ref_genome_list[0]
		ref_index, fa, gtf, ref, ERCC_gtf = get_genome_ref_files(ref_genome)

	if os.path.exists(transcript_file):
		top_genes = get_genes(transcript_file)
		outp = open(out_dir+"igv_snapshots_of_transcripts_script.txt", "w")
		outp.write("".join(map(lambda x: get_snapshot_by_coordinates(runs, path_start, out_dir, x[0], x[1], ref_genome, gtf), top_genes)))
		outp.close()
		print "Created file "+out_dir+"igv_snapshots_of_transcripts_script.txt\nOpen IGV and run using File->Run Batch Script"
	else:
		if transcript_file != "":
			print "Cannot find "+transcript_file
	
	if top_genes == "yes":
		sig_gene_file = path_start+project_name+"/"+project_name+"_DE_Report/"+project_name+"_Significant_DE_gene_list.txt"
		if not os.path.exists(sig_gene_file):
			print "Cannot find "+sig_gene_file
			print "Has DE report been created for "+project_name+"?"
			sys.exit()
		top_genes = get_sig_genes(sig_gene_file, ref_genome)
		outp = open(out_dir+"igv_snapshots_of_top_genes_script.txt", "w")
		outp.write("".join(map(lambda x: get_snapshot_by_coordinates(runs, path_start, out_dir, x[0], x[1], ref_genome, gtf), top_genes)))
		outp.close()
		print "Created file "+out_dir+"igv_snapshots_of_top_genes_script.txt\nOpen IGV and run using File->Run Batch Script"
	else:
		print "Did not attempt to create igv script file for top differentially expressed genes."


if __name__ == "__main__":
	parser = argparse.ArgumentParser(description="Creates a text file with IGV commands to take PDF snapshots of raw reads for top differentially expressed genes for RNA-seq samples associated with a project.")
	parser.add_argument("--path_start", default="./", type=str, help="Directory path to where project directory created by rnaseq_de.py is located (default=./)")
	parser.add_argument("--transcripts", default="", type=str, help="A tab-delimited txt file whose first column contains transcript names and second column contains genome coordinates in format chr7:5566776-5603415.")
	parser.add_argument("--top_genes", default="yes", type=str, help="Create a script file corresponding to top differentially expressed genes created by rnaseq_de.py? (options: 'yes', 'no'; default: 'yes')")
	parser.add_argument("project_name", type=str, help="Name of project that all samples correspond to. Often a PCPGM batch, but it could correspond to a mixture of batches.")
	parser.add_argument("samples_in", help="A tab-delimited txt file containing sample information. See example file: sample_info_file.txt")
	args = parser.parse_args()
	main(args.project_name, args.samples_in, args.path_start, args.transcripts, args.top_genes)