-
Notifications
You must be signed in to change notification settings - Fork 2
/
splicejunction2bed.py
122 lines (95 loc) · 4.55 KB
/
splicejunction2bed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# -*- coding: utf-8 -*-
# The input are the splice junction files from STAR will look like such:
# column 1: chromosome
# column 2: first base of the intron (1-based)
# column 3: last base of the intron (1-based)
# column 4: strand (0: undefined, 1: +, 2: -)
# column 5: intron motif: 0: non-canonical; 1: GT/AG, 2: CT/AC, 3: GC/AG, 4: CT/GC, 5:AT/AC, 6: GT/AT
# column 6: 0: unannotated, 1: annotated (only if splice junctions database is used)
# column 7: number of uniquely mapping reads crossing the junction
# column 8: number of multi-mapping reads crossing the junction
# column 9: maximum spliced alignment overhang
# input looks like.....
# 0 1 2 3 4 5 6 7 8
# chr1 11672 12009 1 1 1 0 2 67
# chr1 12228 12612 1 1 1 0 1 31
# chr1 14830 14969 2 2 1 75 162 71
#############################################
# The output will look like such so that score is the number of unique mappers
# and the name is the original 1-based star junction and if it was annotated
#############################################
# chr1 11671 12008 chr1:11672-12009|1 0 +
# chr1 12227 12611 chr1:12228-12612|1 0 +
# chr1 14829 14968 chr1:14830-14969|1 75 -
import argparse, sys, os.path
from pathlib import Path
def main():
parser = argparse.ArgumentParser()
parser.add_argument("-i","--input", help="File with the regions in bed format")
parser.add_argument("-o","--output", help="Name of the output bed you want")
parser.add_argument("-m","--motifFilter", help=" filter out splice junctions with \n\t\t non-canonical motifs",action="store_true")
parser.add_argument("-n","--name", help="File with the regions in bed format",action="store_true")
args = parser.parse_args()
infile = args.input
outfile = args.output
motifON = args.motifFilter
nameBase = args.name
if infile is not None and outfile is not None:
run(infile, outfile, motifON,nameBase)
else:
usage()
def usage():
print("\nUsage: python splicejunction2bed [options] <mandatory>")
print("Used for turning STAR's SJ.out.tab into bed files")
print("Options:")
print("\t-h, --help:\n\t\t show this help message and exit")
print("\t-mf, --motifFilter:\n\t\t filter out splice junctions with \n\t\t non-canonical motifs")
print("\t-n, --name:\n\t\t change name column to the name of the file")
print("Mandatory:")
print("\t-i, --input:\n\t\t File with the regions in bed format")
print("\t-o, --output:\n\t\t Name of the gtf file output file. Directory where the file will be created should exist!")
def run(infile, outfile, motifON, nameBase):
inf = open(infile, 'r')
outf = open(outfile,'w')
if nameBase:
baseFileName = Path(infile).stem
print(baseFileName)
for line in inf:
linea_split = line.split()
chrom = linea_split[0]
# if the intromotif filter is on
# column 5: intron motif: 0: non-canonical; 1: GT/AG, 2: CT/AC, 3: GC/AG, 4: CT/GC, 5:AT/AC, 6: GT/AT
if (motifON) & (str(linea_split[4]) == "0"):
continue
else:
# convert from one based to zero based with -1
ini_pos = str(int(linea_split[1]) - 1)
fin_pos = str(int(linea_split[2]) + 1)
# strand needs to be converted to -/+/*
strand = linea_split[3]
# # column 4: strand (0: undefined, 1: +, 2: -)
if(strand == "1"):
strand = "+"
elif(strand == "2"):
strand = "-"
elif(strand == "0"):
strand = "*"
if not nameBase:
# name here will be the original coords separated by underscore
name = str(linea_split[0]) + ":" + str(linea_split[1]) + "-" + str(linea_split[2])
# and then : and the annotation
name = name + "|" + str(linea_split[5])
else:
name = baseFileName
# score is the number of uniquemappers
score = str(linea_split[6])
# chr1 11671 12008 chr1_11672_12009:1 0 +
# chr1 12227 12611 chr1_12228_12612:1 0 +
# chr1 14829 14968 chr1_14830_14969:1 75 -
write_line = [chrom, ini_pos, fin_pos, name, score, strand]
write_line = "\t".join(write_line)
outf.write(write_line + "\n")
inf.close()
outf.close()
if __name__ == "__main__":
main()