Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Some blacklists #105

Merged
merged 27 commits into from
Jan 9, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions Changelog
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
2018-01-09 Youri Hoogstrate v0.15.0
* Bugfix resulting in higher number of detected frame shifts
* `dr-disco integrate --fasta <fa file>` provides edit distance to
canonical splice junction motif (quick impementation)

2017-12-20 Youri Hoogstrate v0.14.6
* New improvement to entropy filter

Expand Down
12 changes: 7 additions & 5 deletions bin/dr-disco
Original file line number Diff line number Diff line change
Expand Up @@ -140,12 +140,14 @@ def CLI_classify(table_input_file, table_output_file, only_valid, blacklist_regi
@click.argument('table_input_file', type=click.Path(exists=True))
@click.argument('table_output_file')
@click.option('--gtf', help="Use gene annotation (GTF file)")
def CLI_integrate(table_input_file, table_output_file, gtf):
@click.option('--fasta', help="Use FASTA sequence file to estimate edit distances to splice junction motifs")
def CLI_integrate(table_input_file, table_output_file, gtf, fasta):
cl = DetectOutput(table_input_file)
if gtf:
cl.integrate(table_output_file, str(gtf))
else:
cl.integrate(table_output_file, None)

gtf = str(gtf) if gtf else None
fasta = str(fasta) if fasta else None

cl.integrate(table_output_file, gtf, fasta)


if __name__ == '__main__':
Expand Down
6 changes: 6 additions & 0 deletions drdisco/DetectFrameShifts.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,13 @@ def evaluate(self, _from, _to, offset):
"""
Offset may be convenient because STAR sometimes has problems aligning/clipping the first 2 bases after an exon
Values of 4 and larger do not make sense.

Args:
_from ([chr, pos, strand]): donor break position
_to ([chr, pos, strand]): acceptor position position

"""

from_l_fgd = []
to_l_fgd = []

Expand Down
150 changes: 108 additions & 42 deletions drdisco/DetectOutput.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,10 @@

from drdisco import log
from drdisco.DetectFrameShifts import DetectFrameShifts
from drdisco.utils import reverse_complement, is_gzip
import gzip
import HTSeq
from pyfaidx import Fasta


"""[License: GNU General Public License v3 (GPLv3)]
Expand Down Expand Up @@ -37,18 +39,9 @@
"""


def is_gzip(filename):
try:
f = gzip.GzipFile(filename, 'rb')
f.read()
return True
except Exception:
return False


class DetectOutputEntry:
def __init__(self, line_in_results_file):
self.line = line_in_results_file.strip().split("\t")
self.line = line_in_results_file.strip("\r\n").split("\t")
self.parse()

def parse(self):
Expand Down Expand Up @@ -130,13 +123,15 @@ def parse(self):
self.break_A_max_AS = int(self.line[44])
self.break_B_max_AS = int(self.line[45])

self.edit_dist_to_splice_motif = ""

self.structure = self.line[46]

inv = {'-': '+', '+': '-'}
if self.acceptorA > self.donorA:
self.RNAstrandA = self.strandA
self.RNAstrandB = inv[self.strandB]
elif self.donorA < self.acceptorA:
elif self.acceptorA < self.donorA:
self.RNAstrandA = inv[self.strandA]
self.RNAstrandB = self.strandB
else:
Expand All @@ -151,14 +146,15 @@ def parse(self):
def get_donors_acceptors(self, gtf_file):
idx = {}
for a in self.structure.split('&'):
for b in a.split(':', 3)[3].strip('()').split(','):
c = b.split(':')
c[0] = c[0].replace('_1', '_[12]').replace('_2', '_[12]')
if c[0] != 'discordant_mates':
if c[0] not in idx:
idx[c[0]] = 0
if a != '':
for b in a.split(':', 3)[3].strip('()').split(','):
c = b.split(':')
c[0] = c[0].replace('_1', '_[12]').replace('_2', '_[12]')
if c[0] != 'discordant_mates':
if c[0] not in idx:
idx[c[0]] = 0

idx[c[0]] += int(c[1])
idx[c[0]] += int(c[1])

def pos_to_gene_str(pos_chr, pos_pos):
if pos_chr[0:3] == 'chr':
Expand Down Expand Up @@ -188,6 +184,66 @@ def pos_to_gene_str(pos_chr, pos_pos):
else:
return genesB + '<->' + genesA

def is_on_splice_junction_motif(self, fasta_fh):
"""

motif:

5' exon:

[ ...{AC}{A}{G} ] {G}{T}{AG}{A}{G}{T} . . . {C}{A}{G} [ {G}... ]

"""

pos5_in_exon_length = 3
pos5_post_exon_length = 6

pos3_pre_exon_length = 3
pos3_in_exon_length = 1

if self.donorA > self.donorB:
pos5p = [self.chrA, self.posA, self.strandA]
pos3p = [self.chrB, self.posB, self.strandB]
elif self.donorA < self.donorB:
pos5p = [self.chrB, self.posB, self.strandB]
pos3p = [self.chrA, self.posA, self.strandA]
else:
pos5p = None

if pos5p:
if pos5p[2] == '-':
seq_in_5p_exon = str(fasta_fh[pos5p[0]][pos5p[1] - pos5_in_exon_length:pos5p[1]]).upper()
seq_post_5p_exon = str(fasta_fh[pos5p[0]][pos5p[1]:pos5p[1] + pos5_post_exon_length]).upper()
else:
seq_in_5p_exon = reverse_complement(str(fasta_fh[pos5p[0]][pos5p[1]:pos5p[1] + pos5_in_exon_length]))
seq_post_5p_exon = reverse_complement(str(fasta_fh[pos5p[0]][pos5p[1] - pos5_post_exon_length:pos5p[1]]))

if pos3p[2] == '+':
seq_pre_3p_exon = str(fasta_fh[pos3p[0]][pos3p[1] - pos3_pre_exon_length:pos3p[1]]).upper()
seq_in_3p_exon = str(fasta_fh[pos3p[0]][pos3p[1]:pos3p[1] + pos3_in_exon_length]).upper()
else:
seq_in_3p_exon = reverse_complement(str(fasta_fh[pos3p[0]][pos3p[1] - pos3_in_exon_length:pos3p[1]]))
seq_pre_3p_exon = reverse_complement(str(fasta_fh[pos3p[0]][pos3p[1]:pos3p[1] + pos3_pre_exon_length]))

def calc_dist(pat, subseq):
d = 0

if len(pat) != len(subseq):
raise Exception("invalid pattern size")
for i in range(len(pat)):
if subseq[i] not in pat[i]:
d += 1

return d

dist = calc_dist(["AC", "A", "G"], seq_in_5p_exon) + calc_dist(["G", "T", "AG", "A", "G", "T"], seq_post_5p_exon) + calc_dist(["C", "A", "G"], seq_pre_3p_exon) + calc_dist(["G"], seq_in_3p_exon)
# print "[ ... " + seq_in_5p_exon + " ] " + seq_post_5p_exon + " ... ... " + seq_pre_3p_exon + " [ " + seq_in_3p_exon + " ... ] ---> " + str(dist)
self.edit_dist_to_splice_motif = str(dist)

return dist
else:
return ""

def __str__(self):
line = self.line
line[11] = self.status
Expand Down Expand Up @@ -382,17 +438,17 @@ def classify_intronic_exonic():

log.info("Classified " + str(k) + "/" + str(n) + " as valid")

def integrate(self, output_table, gtf_file):
def insert_in_index(index, entries, score):
def integrate(self, output_table, gtf_file, fasta_file):
def insert_in_index(index, entries, score, i):
if score not in index:
index[score] = {}

key = entries[0].chrA + ':' + str(entries[0].posA) + '(' + entries[0].strandA + ')-' + entries[0].chrB + ':' + str(entries[0].posB) + '(' + entries[0].strandB + ')'
key = entries[0].chrA + ':' + str(entries[0].posA) + '(' + entries[0].strandA + ')-' + entries[0].chrB + ':' + str(entries[0].posB) + '(' + entries[0].strandB + ')_' + str(i)
index[score][key] = entries

with open(output_table, 'w') as fh_out:
header = self.header.split("\t")
header = "\t".join(header[:-5] + ['full-gene-dysregulation', 'frameshift=0', 'frameshift=+1', 'frameshift=+2'] + header[-5:])
header = "\t".join(header[:-5] + ['full-gene-dysregulation', 'frameshift=0', 'frameshift=+1', 'frameshift=+2', 'splice-motif-edit-distance'] + header[-5:])

fh_out.write("shared-id\tfusion\t" + header)

Expand All @@ -403,6 +459,8 @@ def insert_in_index(index, entries, score):
gene_annotation = GeneAnnotation(gtf_file)
dfs = DetectFrameShifts(gtf_file) if gtf_file else None

ffs = Fasta(fasta_file) if fasta_file else None

intronic_linear = []
remainder = []

Expand All @@ -425,30 +483,34 @@ def insert_in_index(index, entries, score):
frameshifts_2 = [x[0][0] + '(+' + str(x[0][1]) + ')->' + x[1][0] + '(+' + str(x[1][1]) + ')' for x in frame_shifts[2]]

for additional_breaks in e.structure.split('&'):
params = additional_breaks.split(':(')
n_split_reads = sum([int(x.split(':')[1]) for x in params[1].rstrip(')').split(',') if x.split(':')[0] != 'discordant_mates'])
if additional_breaks != '':
params = additional_breaks.split(':(')
n_split_reads = sum([int(x.split(':')[1]) for x in params[1].rstrip(')').split(',') if x.split(':')[0] != 'discordant_mates'])

posAB = params[0].split(':')
posA, posB = int(posAB[1].split('/')[0]), int(posAB[2].split('/')[0])
posAB = params[0].split(':')
posA, posB = int(posAB[1].split('/')[0]), int(posAB[2].split('/')[0])

if params[0] not in done_breaks and n_split_reads > 0:
if e.donorA > e.donorB:
frame_shifts = dfs.evaluate([e.chrA, posA, e.RNAstrandA], [e.chrB, posB, e.RNAstrandB], 2)
else:
frame_shifts = dfs.evaluate([e.chrB, posB, e.RNAstrandB], [e.chrA, posA, e.RNAstrandA], 2)
if params[0] not in done_breaks and n_split_reads > 0:
if e.donorA > e.donorB: # nice, use same thing to swap if necessary
frame_shifts = dfs.evaluate([e.chrA, posA, e.RNAstrandA], [e.chrB, posB, e.RNAstrandB], 2)
else:
frame_shifts = dfs.evaluate([e.chrB, posB, e.RNAstrandB], [e.chrA, posA, e.RNAstrandA], 2)

fgd += [x[0] + '->' + x[1] for x in frame_shifts['fgd']]
frameshifts_0 += [x[0][0] + '->' + x[1][0] for x in frame_shifts[0]]
frameshifts_1 += [x[0][0] + '(+' + str(x[0][1]) + ')->' + x[1][0] + '(+' + str(x[1][1]) + ')' for x in frame_shifts[1]]
frameshifts_2 += [x[0][0] + '(+' + str(x[0][1]) + ')->' + x[1][0] + '(+' + str(x[1][1]) + ')' for x in frame_shifts[2]]
fgd += [x[0] + '->' + x[1] for x in frame_shifts['fgd']]
frameshifts_0 += [x[0][0] + '->' + x[1][0] for x in frame_shifts[0]]
frameshifts_1 += [x[0][0] + '(+' + str(x[0][1]) + ')->' + x[1][0] + '(+' + str(x[1][1]) + ')' for x in frame_shifts[1]]
frameshifts_2 += [x[0][0] + '(+' + str(x[0][1]) + ')->' + x[1][0] + '(+' + str(x[1][1]) + ')' for x in frame_shifts[2]]

done_breaks.add(params[0])
done_breaks.add(params[0])

e.fgd = ','.join(sorted(list(set(fgd))))
e.frameshift_0 = ','.join(sorted(list(set(frameshifts_0))))
e.frameshift_1 = ','.join(sorted(list(set(frameshifts_1))))
e.frameshift_2 = ','.join(sorted(list(set(frameshifts_2))))

if ffs:
e.is_on_splice_junction_motif(ffs)

if e.x_onic == 'intronic' and e.circ_lin == 'linear':
intronic_linear.append(e)
else:
Expand All @@ -470,7 +532,7 @@ def insert(pos, e):

# Reorder
idx2 = {}

q = 0
for e in intronic_linear:
results = {}
positions = [(e.chrA, e.posA, e.strandA), (e.chrB, e.posB, e.strandB)]
Expand All @@ -497,24 +559,28 @@ def insert(pos, e):
results[e2] += 1

top_result = (None, 9999999999999)
for r in results:
for r in sorted(results.keys()):
if results[r] >= 2:
d1 = (r.posA - e.posA)
d2 = (r.posB - e.posB)
sq_d = math.sqrt(pow(d1, 2) + pow(d2, 2))

shared_score = math.sqrt((pow(e.score, 2) + pow(r.score, 2)) * 0.5)
penalty = 1.0 * sq_d / shared_score

if penalty < top_result[1]:
top_result = (r, penalty)

if top_result[0]:
insert_in_index(idx2, [e, top_result[0]], e.score + top_result[0].score)
insert_in_index(idx2, [e, top_result[0]], e.score + top_result[0].score, q)
else:
insert_in_index(idx2, [e], e.score)
insert_in_index(idx2, [e], e.score, q)

q += 1

for e in remainder:
insert_in_index(idx2, [e], e.score)
insert_in_index(idx2, [e], e.score, q)
q += 1

log.info("Determining fusion gene names and generate output")
# Generate output
Expand All @@ -526,7 +592,7 @@ def insert(pos, e):
for entry in idx2[score][key]:
if entry not in exported:
acceptors_donors = entry.get_donors_acceptors(gene_annotation)
line = entry.line[:-5] + [entry.fgd, entry.frameshift_0, entry.frameshift_1, entry.frameshift_2] + entry.line[-5:]
line = entry.line[:-5] + [entry.fgd, entry.frameshift_0, entry.frameshift_1, entry.frameshift_2, entry.edit_dist_to_splice_motif] + entry.line[-5:]

fh_out.write(str(i) + "\t" + acceptors_donors + "\t" + "\t".join(line) + "\n")
exported.add(entry)
Expand Down
2 changes: 1 addition & 1 deletion drdisco/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
import logging
import sys

__version_info__ = ('0', '14', '6')
__version_info__ = ('0', '15', '0')
__version__ = '.'.join(__version_info__) if (len(__version_info__) == 3) else '.'.join(__version_info__[0:3]) + "-" + __version_info__[3]
__author__ = 'Youri Hoogstrate'
__homepage__ = 'https://github.com/yhoogstrate/dr-disco'
Expand Down
31 changes: 31 additions & 0 deletions drdisco/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/usr/bin/env python
# *- coding: utf-8 -*-
# vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4 textwidth=79:


import gzip


alt_map = {'ins': '0'}
complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}


def reverse_complement(seq):
seq = seq.upper()
for k, v in alt_map.iteritems():
seq = seq.replace(k, v)
bases = list(seq)
bases = reversed([complement.get(base, base) for base in bases])
bases = ''.join(bases)
for k, v in alt_map.iteritems():
bases = bases.replace(v, k)
return bases


def is_gzip(filename):
try:
f = gzip.GzipFile(filename, 'rb')
f.read()
return True
except Exception:
return False
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ HTSeq==0.6.1
numpy
pysam==0.10.0
scipy
pyfaidx==0.5.1
6 changes: 6 additions & 0 deletions share/blacklist-junctions.hg38.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4050,3 +4050,9 @@ chr13 66903600 66903611 - chr13 67298227 67298238 + PCDH9 new exon
chr13 111865377 111865388 - chr13 112095469 112095481 + SOX1 new exon(s)
chr13 111865377 111865388 - chr13 112098310 112098322 + SOX1 new exon(s)
chr13 45338700 45338750 - chr13 45339500 45339560 - TPT1 weird ribosomal/LINC-rna low entropy stuff again
chr2 81548742 81548753 - chr2 81797812 81797823 + most likely new gene brca
chr2 81548742 81548753 - chr2 81822264 81822275 + most likely new gene brca
chr2 81548742 81548753 - chr2 81848747 81848758 + most likely new gene brca
chr2 81570408 81570419 - chr2 81797812 81797823 + most likely new gene brca
chr2 81570408 81570419 - chr2 81848747 81848758 + most likely new gene brca
chr2 173533339 173533340 - chr2 173761578 173761579 + most likely new gene brca
4 changes: 2 additions & 2 deletions tests/chim_overhang/test_01_integrate.out.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
shared-id fusion chr-A pos-A direction-A pos-A-acceptor pos-A-donor chr-B pos-B direction-B pos-B-acceptor pos-B-donor genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads alignment-score mismatches n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps lr-A-slope lr-A-intercept lr-A-rvalue lr-A-pvalue lr-A-stderr lr-B-slope lr-B-intercept lr-B-rvalue lr-B-pvalue lr-B-stderr disco/split clips/score nodes/edge full-gene-dysregulation frameshift=0 frameshift=+1 frameshift=+2 median-AS-A median-AS-B max-AS-A max-AS-B data-structure
1 chr2:17281790->chr11:5566704 chr2 17281790 + 0 22 chr11 5566704 - 22 0 inf entropy=0.7372<0.7382,chim_overhang=21<25 linear intronic 33 22 11 0 1530 10 1 1 1 0 0 0.7372 0.7372 0.0000 0.0000 0.3818 18.0000 0.8367 0.0013 0.0833 2.0455 38.7727 0.9652 0.0000 0.1847 0.0000 0.3333 2.0000 21 49 21 57 chr2:17281790/17281791(+)->chr11:5566704/5566705(-):(spanning_paired_1:11,spanning_paired_2:11)
shared-id fusion chr-A pos-A direction-A pos-A-acceptor pos-A-donor chr-B pos-B direction-B pos-B-acceptor pos-B-donor genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads alignment-score mismatches n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps lr-A-slope lr-A-intercept lr-A-rvalue lr-A-pvalue lr-A-stderr lr-B-slope lr-B-intercept lr-B-rvalue lr-B-pvalue lr-B-stderr disco/split clips/score nodes/edge full-gene-dysregulation frameshift=0 frameshift=+1 frameshift=+2 splice-motif-edit-distance median-AS-A median-AS-B max-AS-A max-AS-B data-structure
1 chr2:17281790->chr11:5566704 chr2 17281790 + 0 22 chr11 5566704 - 22 0 inf entropy=0.7372<0.7382,chim_overhang=21<25 linear intronic 33 22 11 0 1530 10 1 1 1 0 0 0.7372 0.7372 0.0000 0.0000 0.3818 18.0000 0.8367 0.0013 0.0833 2.0455 38.7727 0.9652 0.0000 0.1847 0.0000 0.3333 2.0000 21 49 21 57 chr2:17281790/17281791(+)->chr11:5566704/5566705(-):(spanning_paired_1:11,spanning_paired_2:11)
Loading