Skip to content

Commit

Permalink
Merge pull request #105 from yhoogstrate/some_blacklists
Browse files Browse the repository at this point in the history
and fixed #106
  • Loading branch information
yhoogstrate authored Jan 9, 2018
2 parents 0d321c0 + 6bd7747 commit 1a201c3
Show file tree
Hide file tree
Showing 34 changed files with 773 additions and 75 deletions.
5 changes: 5 additions & 0 deletions Changelog
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
2018-01-09 Youri Hoogstrate v0.15.0
* Bugfix resulting in higher number of detected frame shifts
* `dr-disco integrate --fasta <fa file>` provides edit distance to
canonical splice junction motif (quick impementation)

2017-12-20 Youri Hoogstrate v0.14.6
* New improvement to entropy filter

Expand Down
12 changes: 7 additions & 5 deletions bin/dr-disco
Original file line number Diff line number Diff line change
Expand Up @@ -140,12 +140,14 @@ def CLI_classify(table_input_file, table_output_file, only_valid, blacklist_regi
@click.argument('table_input_file', type=click.Path(exists=True))
@click.argument('table_output_file')
@click.option('--gtf', help="Use gene annotation (GTF file)")
def CLI_integrate(table_input_file, table_output_file, gtf):
@click.option('--fasta', help="Use FASTA sequence file to estimate edit distances to splice junction motifs")
def CLI_integrate(table_input_file, table_output_file, gtf, fasta):
cl = DetectOutput(table_input_file)
if gtf:
cl.integrate(table_output_file, str(gtf))
else:
cl.integrate(table_output_file, None)

gtf = str(gtf) if gtf else None
fasta = str(fasta) if fasta else None

cl.integrate(table_output_file, gtf, fasta)


if __name__ == '__main__':
Expand Down
6 changes: 6 additions & 0 deletions drdisco/DetectFrameShifts.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,13 @@ def evaluate(self, _from, _to, offset):
"""
Offset may be convenient because STAR sometimes has problems aligning/clipping the first 2 bases after an exon
Values of 4 and larger do not make sense.
Args:
_from ([chr, pos, strand]): donor break position
_to ([chr, pos, strand]): acceptor position position
"""

from_l_fgd = []
to_l_fgd = []

Expand Down
150 changes: 108 additions & 42 deletions drdisco/DetectOutput.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,10 @@

from drdisco import log
from drdisco.DetectFrameShifts import DetectFrameShifts
from drdisco.utils import reverse_complement, is_gzip
import gzip
import HTSeq
from pyfaidx import Fasta


"""[License: GNU General Public License v3 (GPLv3)]
Expand Down Expand Up @@ -37,18 +39,9 @@
"""


def is_gzip(filename):
try:
f = gzip.GzipFile(filename, 'rb')
f.read()
return True
except Exception:
return False


class DetectOutputEntry:
def __init__(self, line_in_results_file):
self.line = line_in_results_file.strip().split("\t")
self.line = line_in_results_file.strip("\r\n").split("\t")
self.parse()

def parse(self):
Expand Down Expand Up @@ -130,13 +123,15 @@ def parse(self):
self.break_A_max_AS = int(self.line[44])
self.break_B_max_AS = int(self.line[45])

self.edit_dist_to_splice_motif = ""

self.structure = self.line[46]

inv = {'-': '+', '+': '-'}
if self.acceptorA > self.donorA:
self.RNAstrandA = self.strandA
self.RNAstrandB = inv[self.strandB]
elif self.donorA < self.acceptorA:
elif self.acceptorA < self.donorA:
self.RNAstrandA = inv[self.strandA]
self.RNAstrandB = self.strandB
else:
Expand All @@ -151,14 +146,15 @@ def parse(self):
def get_donors_acceptors(self, gtf_file):
idx = {}
for a in self.structure.split('&'):
for b in a.split(':', 3)[3].strip('()').split(','):
c = b.split(':')
c[0] = c[0].replace('_1', '_[12]').replace('_2', '_[12]')
if c[0] != 'discordant_mates':
if c[0] not in idx:
idx[c[0]] = 0
if a != '':
for b in a.split(':', 3)[3].strip('()').split(','):
c = b.split(':')
c[0] = c[0].replace('_1', '_[12]').replace('_2', '_[12]')
if c[0] != 'discordant_mates':
if c[0] not in idx:
idx[c[0]] = 0

idx[c[0]] += int(c[1])
idx[c[0]] += int(c[1])

def pos_to_gene_str(pos_chr, pos_pos):
if pos_chr[0:3] == 'chr':
Expand Down Expand Up @@ -188,6 +184,66 @@ def pos_to_gene_str(pos_chr, pos_pos):
else:
return genesB + '<->' + genesA

def is_on_splice_junction_motif(self, fasta_fh):
"""
motif:
5' exon:
[ ...{AC}{A}{G} ] {G}{T}{AG}{A}{G}{T} . . . {C}{A}{G} [ {G}... ]
"""

pos5_in_exon_length = 3
pos5_post_exon_length = 6

pos3_pre_exon_length = 3
pos3_in_exon_length = 1

if self.donorA > self.donorB:
pos5p = [self.chrA, self.posA, self.strandA]
pos3p = [self.chrB, self.posB, self.strandB]
elif self.donorA < self.donorB:
pos5p = [self.chrB, self.posB, self.strandB]
pos3p = [self.chrA, self.posA, self.strandA]
else:
pos5p = None

if pos5p:
if pos5p[2] == '-':
seq_in_5p_exon = str(fasta_fh[pos5p[0]][pos5p[1] - pos5_in_exon_length:pos5p[1]]).upper()
seq_post_5p_exon = str(fasta_fh[pos5p[0]][pos5p[1]:pos5p[1] + pos5_post_exon_length]).upper()
else:
seq_in_5p_exon = reverse_complement(str(fasta_fh[pos5p[0]][pos5p[1]:pos5p[1] + pos5_in_exon_length]))
seq_post_5p_exon = reverse_complement(str(fasta_fh[pos5p[0]][pos5p[1] - pos5_post_exon_length:pos5p[1]]))

if pos3p[2] == '+':
seq_pre_3p_exon = str(fasta_fh[pos3p[0]][pos3p[1] - pos3_pre_exon_length:pos3p[1]]).upper()
seq_in_3p_exon = str(fasta_fh[pos3p[0]][pos3p[1]:pos3p[1] + pos3_in_exon_length]).upper()
else:
seq_in_3p_exon = reverse_complement(str(fasta_fh[pos3p[0]][pos3p[1] - pos3_in_exon_length:pos3p[1]]))
seq_pre_3p_exon = reverse_complement(str(fasta_fh[pos3p[0]][pos3p[1]:pos3p[1] + pos3_pre_exon_length]))

def calc_dist(pat, subseq):
d = 0

if len(pat) != len(subseq):
raise Exception("invalid pattern size")
for i in range(len(pat)):
if subseq[i] not in pat[i]:
d += 1

return d

dist = calc_dist(["AC", "A", "G"], seq_in_5p_exon) + calc_dist(["G", "T", "AG", "A", "G", "T"], seq_post_5p_exon) + calc_dist(["C", "A", "G"], seq_pre_3p_exon) + calc_dist(["G"], seq_in_3p_exon)
# print "[ ... " + seq_in_5p_exon + " ] " + seq_post_5p_exon + " ... ... " + seq_pre_3p_exon + " [ " + seq_in_3p_exon + " ... ] ---> " + str(dist)
self.edit_dist_to_splice_motif = str(dist)

return dist
else:
return ""

def __str__(self):
line = self.line
line[11] = self.status
Expand Down Expand Up @@ -382,17 +438,17 @@ def classify_intronic_exonic():

log.info("Classified " + str(k) + "/" + str(n) + " as valid")

def integrate(self, output_table, gtf_file):
def insert_in_index(index, entries, score):
def integrate(self, output_table, gtf_file, fasta_file):
def insert_in_index(index, entries, score, i):
if score not in index:
index[score] = {}

key = entries[0].chrA + ':' + str(entries[0].posA) + '(' + entries[0].strandA + ')-' + entries[0].chrB + ':' + str(entries[0].posB) + '(' + entries[0].strandB + ')'
key = entries[0].chrA + ':' + str(entries[0].posA) + '(' + entries[0].strandA + ')-' + entries[0].chrB + ':' + str(entries[0].posB) + '(' + entries[0].strandB + ')_' + str(i)
index[score][key] = entries

with open(output_table, 'w') as fh_out:
header = self.header.split("\t")
header = "\t".join(header[:-5] + ['full-gene-dysregulation', 'frameshift=0', 'frameshift=+1', 'frameshift=+2'] + header[-5:])
header = "\t".join(header[:-5] + ['full-gene-dysregulation', 'frameshift=0', 'frameshift=+1', 'frameshift=+2', 'splice-motif-edit-distance'] + header[-5:])

fh_out.write("shared-id\tfusion\t" + header)

Expand All @@ -403,6 +459,8 @@ def insert_in_index(index, entries, score):
gene_annotation = GeneAnnotation(gtf_file)
dfs = DetectFrameShifts(gtf_file) if gtf_file else None

ffs = Fasta(fasta_file) if fasta_file else None

intronic_linear = []
remainder = []

Expand All @@ -425,30 +483,34 @@ def insert_in_index(index, entries, score):
frameshifts_2 = [x[0][0] + '(+' + str(x[0][1]) + ')->' + x[1][0] + '(+' + str(x[1][1]) + ')' for x in frame_shifts[2]]

for additional_breaks in e.structure.split('&'):
params = additional_breaks.split(':(')
n_split_reads = sum([int(x.split(':')[1]) for x in params[1].rstrip(')').split(',') if x.split(':')[0] != 'discordant_mates'])
if additional_breaks != '':
params = additional_breaks.split(':(')
n_split_reads = sum([int(x.split(':')[1]) for x in params[1].rstrip(')').split(',') if x.split(':')[0] != 'discordant_mates'])

posAB = params[0].split(':')
posA, posB = int(posAB[1].split('/')[0]), int(posAB[2].split('/')[0])
posAB = params[0].split(':')
posA, posB = int(posAB[1].split('/')[0]), int(posAB[2].split('/')[0])

if params[0] not in done_breaks and n_split_reads > 0:
if e.donorA > e.donorB:
frame_shifts = dfs.evaluate([e.chrA, posA, e.RNAstrandA], [e.chrB, posB, e.RNAstrandB], 2)
else:
frame_shifts = dfs.evaluate([e.chrB, posB, e.RNAstrandB], [e.chrA, posA, e.RNAstrandA], 2)
if params[0] not in done_breaks and n_split_reads > 0:
if e.donorA > e.donorB: # nice, use same thing to swap if necessary
frame_shifts = dfs.evaluate([e.chrA, posA, e.RNAstrandA], [e.chrB, posB, e.RNAstrandB], 2)
else:
frame_shifts = dfs.evaluate([e.chrB, posB, e.RNAstrandB], [e.chrA, posA, e.RNAstrandA], 2)

fgd += [x[0] + '->' + x[1] for x in frame_shifts['fgd']]
frameshifts_0 += [x[0][0] + '->' + x[1][0] for x in frame_shifts[0]]
frameshifts_1 += [x[0][0] + '(+' + str(x[0][1]) + ')->' + x[1][0] + '(+' + str(x[1][1]) + ')' for x in frame_shifts[1]]
frameshifts_2 += [x[0][0] + '(+' + str(x[0][1]) + ')->' + x[1][0] + '(+' + str(x[1][1]) + ')' for x in frame_shifts[2]]
fgd += [x[0] + '->' + x[1] for x in frame_shifts['fgd']]
frameshifts_0 += [x[0][0] + '->' + x[1][0] for x in frame_shifts[0]]
frameshifts_1 += [x[0][0] + '(+' + str(x[0][1]) + ')->' + x[1][0] + '(+' + str(x[1][1]) + ')' for x in frame_shifts[1]]
frameshifts_2 += [x[0][0] + '(+' + str(x[0][1]) + ')->' + x[1][0] + '(+' + str(x[1][1]) + ')' for x in frame_shifts[2]]

done_breaks.add(params[0])
done_breaks.add(params[0])

e.fgd = ','.join(sorted(list(set(fgd))))
e.frameshift_0 = ','.join(sorted(list(set(frameshifts_0))))
e.frameshift_1 = ','.join(sorted(list(set(frameshifts_1))))
e.frameshift_2 = ','.join(sorted(list(set(frameshifts_2))))

if ffs:
e.is_on_splice_junction_motif(ffs)

if e.x_onic == 'intronic' and e.circ_lin == 'linear':
intronic_linear.append(e)
else:
Expand All @@ -470,7 +532,7 @@ def insert(pos, e):

# Reorder
idx2 = {}

q = 0
for e in intronic_linear:
results = {}
positions = [(e.chrA, e.posA, e.strandA), (e.chrB, e.posB, e.strandB)]
Expand All @@ -497,24 +559,28 @@ def insert(pos, e):
results[e2] += 1

top_result = (None, 9999999999999)
for r in results:
for r in sorted(results.keys()):
if results[r] >= 2:
d1 = (r.posA - e.posA)
d2 = (r.posB - e.posB)
sq_d = math.sqrt(pow(d1, 2) + pow(d2, 2))

shared_score = math.sqrt((pow(e.score, 2) + pow(r.score, 2)) * 0.5)
penalty = 1.0 * sq_d / shared_score

if penalty < top_result[1]:
top_result = (r, penalty)

if top_result[0]:
insert_in_index(idx2, [e, top_result[0]], e.score + top_result[0].score)
insert_in_index(idx2, [e, top_result[0]], e.score + top_result[0].score, q)
else:
insert_in_index(idx2, [e], e.score)
insert_in_index(idx2, [e], e.score, q)

q += 1

for e in remainder:
insert_in_index(idx2, [e], e.score)
insert_in_index(idx2, [e], e.score, q)
q += 1

log.info("Determining fusion gene names and generate output")
# Generate output
Expand All @@ -526,7 +592,7 @@ def insert(pos, e):
for entry in idx2[score][key]:
if entry not in exported:
acceptors_donors = entry.get_donors_acceptors(gene_annotation)
line = entry.line[:-5] + [entry.fgd, entry.frameshift_0, entry.frameshift_1, entry.frameshift_2] + entry.line[-5:]
line = entry.line[:-5] + [entry.fgd, entry.frameshift_0, entry.frameshift_1, entry.frameshift_2, entry.edit_dist_to_splice_motif] + entry.line[-5:]

fh_out.write(str(i) + "\t" + acceptors_donors + "\t" + "\t".join(line) + "\n")
exported.add(entry)
Expand Down
2 changes: 1 addition & 1 deletion drdisco/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
import logging
import sys

__version_info__ = ('0', '14', '6')
__version_info__ = ('0', '15', '0')
__version__ = '.'.join(__version_info__) if (len(__version_info__) == 3) else '.'.join(__version_info__[0:3]) + "-" + __version_info__[3]
__author__ = 'Youri Hoogstrate'
__homepage__ = 'https://github.com/yhoogstrate/dr-disco'
Expand Down
31 changes: 31 additions & 0 deletions drdisco/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/usr/bin/env python
# *- coding: utf-8 -*-
# vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4 textwidth=79:


import gzip


alt_map = {'ins': '0'}
complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}


def reverse_complement(seq):
seq = seq.upper()
for k, v in alt_map.iteritems():
seq = seq.replace(k, v)
bases = list(seq)
bases = reversed([complement.get(base, base) for base in bases])
bases = ''.join(bases)
for k, v in alt_map.iteritems():
bases = bases.replace(v, k)
return bases


def is_gzip(filename):
try:
f = gzip.GzipFile(filename, 'rb')
f.read()
return True
except Exception:
return False
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ HTSeq==0.6.1
numpy
pysam==0.10.0
scipy
pyfaidx==0.5.1
6 changes: 6 additions & 0 deletions share/blacklist-junctions.hg38.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4050,3 +4050,9 @@ chr13 66903600 66903611 - chr13 67298227 67298238 + PCDH9 new exon
chr13 111865377 111865388 - chr13 112095469 112095481 + SOX1 new exon(s)
chr13 111865377 111865388 - chr13 112098310 112098322 + SOX1 new exon(s)
chr13 45338700 45338750 - chr13 45339500 45339560 - TPT1 weird ribosomal/LINC-rna low entropy stuff again
chr2 81548742 81548753 - chr2 81797812 81797823 + most likely new gene brca
chr2 81548742 81548753 - chr2 81822264 81822275 + most likely new gene brca
chr2 81548742 81548753 - chr2 81848747 81848758 + most likely new gene brca
chr2 81570408 81570419 - chr2 81797812 81797823 + most likely new gene brca
chr2 81570408 81570419 - chr2 81848747 81848758 + most likely new gene brca
chr2 173533339 173533340 - chr2 173761578 173761579 + most likely new gene brca
4 changes: 2 additions & 2 deletions tests/chim_overhang/test_01_integrate.out.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
shared-id fusion chr-A pos-A direction-A pos-A-acceptor pos-A-donor chr-B pos-B direction-B pos-B-acceptor pos-B-donor genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads alignment-score mismatches n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps lr-A-slope lr-A-intercept lr-A-rvalue lr-A-pvalue lr-A-stderr lr-B-slope lr-B-intercept lr-B-rvalue lr-B-pvalue lr-B-stderr disco/split clips/score nodes/edge full-gene-dysregulation frameshift=0 frameshift=+1 frameshift=+2 median-AS-A median-AS-B max-AS-A max-AS-B data-structure
1 chr2:17281790->chr11:5566704 chr2 17281790 + 0 22 chr11 5566704 - 22 0 inf entropy=0.7372<0.7382,chim_overhang=21<25 linear intronic 33 22 11 0 1530 10 1 1 1 0 0 0.7372 0.7372 0.0000 0.0000 0.3818 18.0000 0.8367 0.0013 0.0833 2.0455 38.7727 0.9652 0.0000 0.1847 0.0000 0.3333 2.0000 21 49 21 57 chr2:17281790/17281791(+)->chr11:5566704/5566705(-):(spanning_paired_1:11,spanning_paired_2:11)
shared-id fusion chr-A pos-A direction-A pos-A-acceptor pos-A-donor chr-B pos-B direction-B pos-B-acceptor pos-B-donor genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads alignment-score mismatches n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps lr-A-slope lr-A-intercept lr-A-rvalue lr-A-pvalue lr-A-stderr lr-B-slope lr-B-intercept lr-B-rvalue lr-B-pvalue lr-B-stderr disco/split clips/score nodes/edge full-gene-dysregulation frameshift=0 frameshift=+1 frameshift=+2 splice-motif-edit-distance median-AS-A median-AS-B max-AS-A max-AS-B data-structure
1 chr2:17281790->chr11:5566704 chr2 17281790 + 0 22 chr11 5566704 - 22 0 inf entropy=0.7372<0.7382,chim_overhang=21<25 linear intronic 33 22 11 0 1530 10 1 1 1 0 0 0.7372 0.7372 0.0000 0.0000 0.3818 18.0000 0.8367 0.0013 0.0833 2.0455 38.7727 0.9652 0.0000 0.1847 0.0000 0.3333 2.0000 21 49 21 57 chr2:17281790/17281791(+)->chr11:5566704/5566705(-):(spanning_paired_1:11,spanning_paired_2:11)
Loading

0 comments on commit 1a201c3

Please sign in to comment.