yhoogstrate · yhoogstrate · Jan 9, 2018 · Jan 3, 2018 · Jan 3, 2018 · Jan 3, 2018
diff --git a/Changelog b/Changelog
@@ -1,3 +1,8 @@
+2018-01-09	Youri Hoogstrate	v0.15.0
+	 * Bugfix resulting in higher number of detected frame shifts
+	 * `dr-disco integrate --fasta <fa file>` provides edit distance to
+	    canonical splice junction motif (quick impementation)
+
 2017-12-20	Youri Hoogstrate	v0.14.6
 	 * New improvement to entropy filter
 

diff --git a/bin/dr-disco b/bin/dr-disco
@@ -140,12 +140,14 @@ def CLI_classify(table_input_file, table_output_file, only_valid, blacklist_regi
 @click.argument('table_input_file', type=click.Path(exists=True))
 @click.argument('table_output_file')
 @click.option('--gtf', help="Use gene annotation (GTF file)")
-def CLI_integrate(table_input_file, table_output_file, gtf):
+@click.option('--fasta', help="Use FASTA sequence file to estimate edit distances to splice junction motifs")
+def CLI_integrate(table_input_file, table_output_file, gtf, fasta):
     cl = DetectOutput(table_input_file)
-    if gtf:
-        cl.integrate(table_output_file, str(gtf))
-    else:
-        cl.integrate(table_output_file, None)
+
+    gtf = str(gtf) if gtf else None
+    fasta = str(fasta) if fasta else None
+
+    cl.integrate(table_output_file, gtf, fasta)
 
 
 if __name__ == '__main__':

diff --git a/drdisco/DetectFrameShifts.py b/drdisco/DetectFrameShifts.py
@@ -189,7 +189,13 @@ def evaluate(self, _from, _to, offset):
         """
         Offset may be convenient because STAR sometimes has problems aligning/clipping the first 2 bases after an exon
         Values of 4 and larger do not make sense.
+
+        Args:
+            _from ([chr, pos, strand]): donor break position
+            _to ([chr, pos, strand]): acceptor position position
+
         """
+
         from_l_fgd = []
         to_l_fgd = []
 

diff --git a/drdisco/DetectOutput.py b/drdisco/DetectOutput.py
@@ -6,8 +6,10 @@
 
 from drdisco import log
 from drdisco.DetectFrameShifts import DetectFrameShifts
+from drdisco.utils import reverse_complement, is_gzip
 import gzip
 import HTSeq
+from pyfaidx import Fasta
 
 
 """[License: GNU General Public License v3 (GPLv3)]
@@ -37,18 +39,9 @@
 """
 
 
-def is_gzip(filename):
-    try:
-        f = gzip.GzipFile(filename, 'rb')
-        f.read()
-        return True
-    except Exception:
-        return False
-
-
 class DetectOutputEntry:
     def __init__(self, line_in_results_file):
-        self.line = line_in_results_file.strip().split("\t")
+        self.line = line_in_results_file.strip("\r\n").split("\t")
         self.parse()
 
     def parse(self):
@@ -130,13 +123,15 @@ def parse(self):
         self.break_A_max_AS = int(self.line[44])
         self.break_B_max_AS = int(self.line[45])
 
+        self.edit_dist_to_splice_motif = ""
+
         self.structure = self.line[46]
 
         inv = {'-': '+', '+': '-'}
         if self.acceptorA > self.donorA:
             self.RNAstrandA = self.strandA
             self.RNAstrandB = inv[self.strandB]
-        elif self.donorA < self.acceptorA:
+        elif self.acceptorA < self.donorA:
             self.RNAstrandA = inv[self.strandA]
             self.RNAstrandB = self.strandB
         else:
@@ -151,14 +146,15 @@ def parse(self):
     def get_donors_acceptors(self, gtf_file):
         idx = {}
         for a in self.structure.split('&'):
-            for b in a.split(':', 3)[3].strip('()').split(','):
-                c = b.split(':')
-                c[0] = c[0].replace('_1', '_[12]').replace('_2', '_[12]')
-                if c[0] != 'discordant_mates':
-                    if c[0] not in idx:
-                        idx[c[0]] = 0
+            if a != '':
+                for b in a.split(':', 3)[3].strip('()').split(','):
+                    c = b.split(':')
+                    c[0] = c[0].replace('_1', '_[12]').replace('_2', '_[12]')
+                    if c[0] != 'discordant_mates':
+                        if c[0] not in idx:
+                            idx[c[0]] = 0
 
-                    idx[c[0]] += int(c[1])
+                        idx[c[0]] += int(c[1])
 
         def pos_to_gene_str(pos_chr, pos_pos):
             if pos_chr[0:3] == 'chr':
@@ -188,6 +184,66 @@ def pos_to_gene_str(pos_chr, pos_pos):
         else:
             return genesB + '<->' + genesA
 
+    def is_on_splice_junction_motif(self, fasta_fh):
+        """
+
+motif:
+
+5' exon:
+
+[ ...{AC}{A}{G} ] {G}{T}{AG}{A}{G}{T} . . . {C}{A}{G} [ {G}... ]
+
+        """
+
+        pos5_in_exon_length = 3
+        pos5_post_exon_length = 6
+
+        pos3_pre_exon_length = 3
+        pos3_in_exon_length = 1
+
+        if self.donorA > self.donorB:
+            pos5p = [self.chrA, self.posA, self.strandA]
+            pos3p = [self.chrB, self.posB, self.strandB]
+        elif self.donorA < self.donorB:
+            pos5p = [self.chrB, self.posB, self.strandB]
+            pos3p = [self.chrA, self.posA, self.strandA]
+        else:
+            pos5p = None
+
+        if pos5p:
+            if pos5p[2] == '-':
+                seq_in_5p_exon = str(fasta_fh[pos5p[0]][pos5p[1] - pos5_in_exon_length:pos5p[1]]).upper()
+                seq_post_5p_exon = str(fasta_fh[pos5p[0]][pos5p[1]:pos5p[1] + pos5_post_exon_length]).upper()
+            else:
+                seq_in_5p_exon = reverse_complement(str(fasta_fh[pos5p[0]][pos5p[1]:pos5p[1] + pos5_in_exon_length]))
+                seq_post_5p_exon = reverse_complement(str(fasta_fh[pos5p[0]][pos5p[1] - pos5_post_exon_length:pos5p[1]]))
+
+            if pos3p[2] == '+':
+                seq_pre_3p_exon = str(fasta_fh[pos3p[0]][pos3p[1] - pos3_pre_exon_length:pos3p[1]]).upper()
+                seq_in_3p_exon = str(fasta_fh[pos3p[0]][pos3p[1]:pos3p[1] + pos3_in_exon_length]).upper()
+            else:
+                seq_in_3p_exon = reverse_complement(str(fasta_fh[pos3p[0]][pos3p[1] - pos3_in_exon_length:pos3p[1]]))
+                seq_pre_3p_exon = reverse_complement(str(fasta_fh[pos3p[0]][pos3p[1]:pos3p[1] + pos3_pre_exon_length]))
+
+            def calc_dist(pat, subseq):
+                d = 0
+
+                if len(pat) != len(subseq):
+                    raise Exception("invalid pattern size")
+                for i in range(len(pat)):
+                    if subseq[i] not in pat[i]:
+                        d += 1
+
+                return d
+
+            dist = calc_dist(["AC", "A", "G"], seq_in_5p_exon) + calc_dist(["G", "T", "AG", "A", "G", "T"], seq_post_5p_exon) + calc_dist(["C", "A", "G"], seq_pre_3p_exon) + calc_dist(["G"], seq_in_3p_exon)
+            # print "[ ... " + seq_in_5p_exon + " ] " + seq_post_5p_exon + " ... ... " + seq_pre_3p_exon + " [ " + seq_in_3p_exon + " ... ] ---> " + str(dist)
+            self.edit_dist_to_splice_motif = str(dist)
+
+            return dist
+        else:
+            return ""
+
     def __str__(self):
         line = self.line
         line[11] = self.status
@@ -382,17 +438,17 @@ def classify_intronic_exonic():
 
         log.info("Classified " + str(k) + "/" + str(n) + " as valid")
 
-    def integrate(self, output_table, gtf_file):
-        def insert_in_index(index, entries, score):
+    def integrate(self, output_table, gtf_file, fasta_file):
+        def insert_in_index(index, entries, score, i):
             if score not in index:
                 index[score] = {}
 
-            key = entries[0].chrA + ':' + str(entries[0].posA) + '(' + entries[0].strandA + ')-' + entries[0].chrB + ':' + str(entries[0].posB) + '(' + entries[0].strandB + ')'
+            key = entries[0].chrA + ':' + str(entries[0].posA) + '(' + entries[0].strandA + ')-' + entries[0].chrB + ':' + str(entries[0].posB) + '(' + entries[0].strandB + ')_' + str(i)
             index[score][key] = entries
 
         with open(output_table, 'w') as fh_out:
             header = self.header.split("\t")
-            header = "\t".join(header[:-5] + ['full-gene-dysregulation', 'frameshift=0', 'frameshift=+1', 'frameshift=+2'] + header[-5:])
+            header = "\t".join(header[:-5] + ['full-gene-dysregulation', 'frameshift=0', 'frameshift=+1', 'frameshift=+2', 'splice-motif-edit-distance'] + header[-5:])
 
             fh_out.write("shared-id\tfusion\t" + header)
 
@@ -403,6 +459,8 @@ def insert_in_index(index, entries, score):
             gene_annotation = GeneAnnotation(gtf_file)
             dfs = DetectFrameShifts(gtf_file) if gtf_file else None
 
+            ffs = Fasta(fasta_file) if fasta_file else None
+
             intronic_linear = []
             remainder = []
 
@@ -425,30 +483,34 @@ def insert_in_index(index, entries, score):
                     frameshifts_2 = [x[0][0] + '(+' + str(x[0][1]) + ')->' + x[1][0] + '(+' + str(x[1][1]) + ')' for x in frame_shifts[2]]
 
                     for additional_breaks in e.structure.split('&'):
-                        params = additional_breaks.split(':(')
-                        n_split_reads = sum([int(x.split(':')[1]) for x in params[1].rstrip(')').split(',') if x.split(':')[0] != 'discordant_mates'])
+                        if additional_breaks != '':
+                            params = additional_breaks.split(':(')
+                            n_split_reads = sum([int(x.split(':')[1]) for x in params[1].rstrip(')').split(',') if x.split(':')[0] != 'discordant_mates'])
 
-                        posAB = params[0].split(':')
-                        posA, posB = int(posAB[1].split('/')[0]), int(posAB[2].split('/')[0])
+                            posAB = params[0].split(':')
+                            posA, posB = int(posAB[1].split('/')[0]), int(posAB[2].split('/')[0])
 
-                        if params[0] not in done_breaks and n_split_reads > 0:
-                            if e.donorA > e.donorB:
-                                frame_shifts = dfs.evaluate([e.chrA, posA, e.RNAstrandA], [e.chrB, posB, e.RNAstrandB], 2)
-                            else:
-                                frame_shifts = dfs.evaluate([e.chrB, posB, e.RNAstrandB], [e.chrA, posA, e.RNAstrandA], 2)
+                            if params[0] not in done_breaks and n_split_reads > 0:
+                                if e.donorA > e.donorB:  # nice, use same thing to swap if necessary
+                                    frame_shifts = dfs.evaluate([e.chrA, posA, e.RNAstrandA], [e.chrB, posB, e.RNAstrandB], 2)
+                                else:
+                                    frame_shifts = dfs.evaluate([e.chrB, posB, e.RNAstrandB], [e.chrA, posA, e.RNAstrandA], 2)
 
-                            fgd += [x[0] + '->' + x[1] for x in frame_shifts['fgd']]
-                            frameshifts_0 += [x[0][0] + '->' + x[1][0] for x in frame_shifts[0]]
-                            frameshifts_1 += [x[0][0] + '(+' + str(x[0][1]) + ')->' + x[1][0] + '(+' + str(x[1][1]) + ')' for x in frame_shifts[1]]
-                            frameshifts_2 += [x[0][0] + '(+' + str(x[0][1]) + ')->' + x[1][0] + '(+' + str(x[1][1]) + ')' for x in frame_shifts[2]]
+                                fgd += [x[0] + '->' + x[1] for x in frame_shifts['fgd']]
+                                frameshifts_0 += [x[0][0] + '->' + x[1][0] for x in frame_shifts[0]]
+                                frameshifts_1 += [x[0][0] + '(+' + str(x[0][1]) + ')->' + x[1][0] + '(+' + str(x[1][1]) + ')' for x in frame_shifts[1]]
+                                frameshifts_2 += [x[0][0] + '(+' + str(x[0][1]) + ')->' + x[1][0] + '(+' + str(x[1][1]) + ')' for x in frame_shifts[2]]
 
-                        done_breaks.add(params[0])
+                            done_breaks.add(params[0])
 
                     e.fgd = ','.join(sorted(list(set(fgd))))
                     e.frameshift_0 = ','.join(sorted(list(set(frameshifts_0))))
                     e.frameshift_1 = ','.join(sorted(list(set(frameshifts_1))))
                     e.frameshift_2 = ','.join(sorted(list(set(frameshifts_2))))
 
+                if ffs:
+                    e.is_on_splice_junction_motif(ffs)
+
                 if e.x_onic == 'intronic' and e.circ_lin == 'linear':
                     intronic_linear.append(e)
                 else:
@@ -470,7 +532,7 @@ def insert(pos, e):
 
             # Reorder
             idx2 = {}
-
+            q = 0
             for e in intronic_linear:
                 results = {}
                 positions = [(e.chrA, e.posA, e.strandA), (e.chrB, e.posB, e.strandB)]
@@ -497,24 +559,28 @@ def insert(pos, e):
                                 results[e2] += 1
 
                 top_result = (None, 9999999999999)
-                for r in results:
+                for r in sorted(results.keys()):
                     if results[r] >= 2:
                         d1 = (r.posA - e.posA)
                         d2 = (r.posB - e.posB)
                         sq_d = math.sqrt(pow(d1, 2) + pow(d2, 2))
 
                         shared_score = math.sqrt((pow(e.score, 2) + pow(r.score, 2)) * 0.5)
                         penalty = 1.0 * sq_d / shared_score
+
                         if penalty < top_result[1]:
                             top_result = (r, penalty)
 
                 if top_result[0]:
-                    insert_in_index(idx2, [e, top_result[0]], e.score + top_result[0].score)
+                    insert_in_index(idx2, [e, top_result[0]], e.score + top_result[0].score, q)
                 else:
-                    insert_in_index(idx2, [e], e.score)
+                    insert_in_index(idx2, [e], e.score, q)
+
+                q += 1
 
             for e in remainder:
-                insert_in_index(idx2, [e], e.score)
+                insert_in_index(idx2, [e], e.score, q)
+                q += 1
 
             log.info("Determining fusion gene names and generate output")
             # Generate output
@@ -526,7 +592,7 @@ def insert(pos, e):
                     for entry in idx2[score][key]:
                         if entry not in exported:
                             acceptors_donors = entry.get_donors_acceptors(gene_annotation)
-                            line = entry.line[:-5] + [entry.fgd, entry.frameshift_0, entry.frameshift_1, entry.frameshift_2] + entry.line[-5:]
+                            line = entry.line[:-5] + [entry.fgd, entry.frameshift_0, entry.frameshift_1, entry.frameshift_2, entry.edit_dist_to_splice_motif] + entry.line[-5:]
 
                             fh_out.write(str(i) + "\t" + acceptors_donors + "\t" + "\t".join(line) + "\n")
                             exported.add(entry)

diff --git a/drdisco/__init__.py b/drdisco/__init__.py
@@ -31,7 +31,7 @@
 import logging
 import sys
 
-__version_info__ = ('0', '14', '6')
+__version_info__ = ('0', '15', '0')
 __version__ = '.'.join(__version_info__) if (len(__version_info__) == 3) else '.'.join(__version_info__[0:3]) + "-" + __version_info__[3]
 __author__ = 'Youri Hoogstrate'
 __homepage__ = 'https://github.com/yhoogstrate/dr-disco'

diff --git a/drdisco/utils.py b/drdisco/utils.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env python
+# *- coding: utf-8 -*-
+# vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4 textwidth=79:
+
+
+import gzip
+
+
+alt_map = {'ins': '0'}
+complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
+
+
+def reverse_complement(seq):
+    seq = seq.upper()
+    for k, v in alt_map.iteritems():
+        seq = seq.replace(k, v)
+    bases = list(seq)
+    bases = reversed([complement.get(base, base) for base in bases])
+    bases = ''.join(bases)
+    for k, v in alt_map.iteritems():
+        bases = bases.replace(v, k)
+    return bases
+
+
+def is_gzip(filename):
+    try:
+        f = gzip.GzipFile(filename, 'rb')
+        f.read()
+        return True
+    except Exception:
+        return False
diff --git a/requirements.txt b/requirements.txt
@@ -5,3 +5,4 @@ HTSeq==0.6.1
 numpy
 pysam==0.10.0
 scipy
+pyfaidx==0.5.1
diff --git a/share/blacklist-junctions.hg38.txt b/share/blacklist-junctions.hg38.txt
@@ -4050,3 +4050,9 @@ chr13	66903600	66903611	-	chr13	67298227	67298238	+	PCDH9 new exon
 chr13	111865377	111865388	-	chr13	112095469	112095481	+	SOX1 new exon(s)
 chr13	111865377	111865388	-	chr13	112098310	112098322	+	SOX1 new exon(s)
 chr13	45338700	45338750	-	chr13	45339500	45339560	-	TPT1 weird ribosomal/LINC-rna low entropy stuff again
+chr2	81548742	81548753	-	chr2	81797812	81797823	+	most likely new gene	brca
+chr2	81548742	81548753	-	chr2	81822264	81822275	+	most likely new gene	brca
+chr2	81548742	81548753	-	chr2	81848747	81848758	+	most likely new gene	brca
+chr2	81570408	81570419	-	chr2	81797812	81797823	+	most likely new gene	brca
+chr2	81570408	81570419	-	chr2	81848747	81848758	+	most likely new gene	brca
+chr2	173533339	173533340	-	chr2	173761578	173761579	+	most likely new gene	brca
diff --git a/tests/chim_overhang/test_01_integrate.out.txt b/tests/chim_overhang/test_01_integrate.out.txt
@@ -1,2 +1,2 @@
-shared-id	fusion	chr-A	pos-A	direction-A	pos-A-acceptor	pos-A-donor	chr-B	pos-B	direction-B	pos-B-acceptor	pos-B-donor	genomic-distance	filter-status	circRNA	intronic/exonic	score	soft+hardclips	n-split-reads	n-discordant-reads	alignment-score	mismatches	n-edges	n-nodes-A	n-nodes-B	n-splice-junc-A	n-splice-junc-B	entropy-bp-edge	entropy-all-edges	bp-pos-stddev	entropy-disco-bps	lr-A-slope	lr-A-intercept	lr-A-rvalue	lr-A-pvalue	lr-A-stderr	lr-B-slope	lr-B-intercept	lr-B-rvalue	lr-B-pvalue	lr-B-stderr	disco/split	clips/score	nodes/edge	full-gene-dysregulation	frameshift=0	frameshift=+1	frameshift=+2	median-AS-A	median-AS-B	max-AS-A	max-AS-B	data-structure
-1	chr2:17281790->chr11:5566704	chr2	17281790	+	0	22	chr11	5566704	-	22	0	inf	entropy=0.7372<0.7382,chim_overhang=21<25	linear	intronic	33	22	11	0	1530	10	1	1	1	0	0	0.7372	0.7372	0.0000	0.0000	0.3818	18.0000	0.8367	0.0013	0.0833	2.0455	38.7727	0.9652	0.0000	0.1847	0.0000	0.3333	2.0000					21	49	21	57	chr2:17281790/17281791(+)->chr11:5566704/5566705(-):(spanning_paired_1:11,spanning_paired_2:11)
+shared-id	fusion	chr-A	pos-A	direction-A	pos-A-acceptor	pos-A-donor	chr-B	pos-B	direction-B	pos-B-acceptor	pos-B-donor	genomic-distance	filter-status	circRNA	intronic/exonic	score	soft+hardclips	n-split-reads	n-discordant-reads	alignment-score	mismatches	n-edges	n-nodes-A	n-nodes-B	n-splice-junc-A	n-splice-junc-B	entropy-bp-edge	entropy-all-edges	bp-pos-stddev	entropy-disco-bps	lr-A-slope	lr-A-intercept	lr-A-rvalue	lr-A-pvalue	lr-A-stderr	lr-B-slope	lr-B-intercept	lr-B-rvalue	lr-B-pvalue	lr-B-stderr	disco/split	clips/score	nodes/edge	full-gene-dysregulation	frameshift=0	frameshift=+1	frameshift=+2	splice-motif-edit-distance	median-AS-A	median-AS-B	max-AS-A	max-AS-B	data-structure
+1	chr2:17281790->chr11:5566704	chr2	17281790	+	0	22	chr11	5566704	-	22	0	inf	entropy=0.7372<0.7382,chim_overhang=21<25	linear	intronic	33	22	11	0	1530	10	1	1	1	0	0	0.7372	0.7372	0.0000	0.0000	0.3818	18.0000	0.8367	0.0013	0.0833	2.0455	38.7727	0.9652	0.0000	0.1847	0.0000	0.3333	2.0000						21	49	21	57	chr2:17281790/17281791(+)->chr11:5566704/5566705(-):(spanning_paired_1:11,spanning_paired_2:11)
-Original file line number
+Diff line change
@@ Expand Up / @@ -5,3 +5,4 @@ HTSeq==0.6.1 @@
     numpy
     pysam==0.10.0
     scipy
+    pyfaidx==0.5.1