From d5ee065b7d5a0fdebb9747a0668b471a7fc67092 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 3 Jan 2018 09:21:20 +0100 Subject: [PATCH 01/25] small update --- share/blacklist-junctions.hg38.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/share/blacklist-junctions.hg38.txt b/share/blacklist-junctions.hg38.txt index 0597898b..0aeec0b9 100644 --- a/share/blacklist-junctions.hg38.txt +++ b/share/blacklist-junctions.hg38.txt @@ -3117,3 +3117,11 @@ chr9 105524058 105524069 - chr14 35664542 35664553 - recurrent in normals 363,30 chr15 20371590 20371591 - chr15 22282859 22282879 - chr15 20383456 20383457 - chr15 22282859 22282879 - chr15 20407454 20407617 - chr15 22282859 22282879 - +chr2 81548742 81548753 - chr2 81797812 81797823 + most likely new gene brca +chr2 81548742 81548753 - chr2 81822264 81822275 + most likely new gene brca +chr2 81548742 81548753 - chr2 81848747 81848758 + most likely new gene brca +chr2 81570408 81570419 - chr2 81797812 81797823 + most likely new gene brca +chr2 81570408 81570419 - chr2 81848747 81848758 + most likely new gene brca + + + From ba3d99205d434460c9bc2cde831ed8ecf92f090b Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 3 Jan 2018 14:34:02 +0100 Subject: [PATCH 02/25] sj test data --- .../test_splice_site_motif_01.in.fa | 71 +++++++++++++++++++ .../test_splice_site_motif_01.in.sam | 47 ++++++++++++ 2 files changed, 118 insertions(+) create mode 100644 tests/splite_site_motif/test_splice_site_motif_01.in.fa create mode 100644 tests/splite_site_motif/test_splice_site_motif_01.in.sam diff --git a/tests/splite_site_motif/test_splice_site_motif_01.in.fa b/tests/splite_site_motif/test_splice_site_motif_01.in.fa new file mode 100644 index 00000000..6836280a --- /dev/null +++ b/tests/splite_site_motif/test_splice_site_motif_01.in.fa @@ -0,0 +1,71 @@ +>chr2 +NNNNNNNNNTTTTTTTTTTT +TTTTTTTTTTTTTTTTTTTT +TTTTTTTTTTTTTTTTTTTT +TTTTTTTTTTTTTTTTTTTT +TTTTTTTTTTTTTTTTTTTT +TTTTTTTTTTTTTTTTTTTT +TTTTTTTTTTTTTTTTTTTT 
+TTTTTTTTTTTTTTTTTTTT +TTTTTTTTTTTTTTTTTTTG +TTCTTAAAGGTAAGTTTTTT +NNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNCAGG +GCCCCAGGATCCTCTATTAA +ATGTGTGGTCCATGAACCAG +CAGCTTCAGCATGACCTGAG +TTTTTTTTTTTTTTTTTTTT +TTTTTTTTTTTTTTTTTTTT +TTTTTTTTTTTTTTTTTTTT +TTTTTTTTTTTTTTTTTTTT +TTTTTTTTTTTTTTTTTTTT +TTTTTTTTTTTTTTTTTTTT +TTTTTTTTTTTTTTTTTTTT +TTTTTTTTTTTTTTTTTTTT +TTTTTTTTTTTTTTTTTTTT +TTTTTTTTTTTTTTTTTTTT +TTTTTTTTTTTTTTTTTTTT +TTTTTTTTTTTTTTTTTTTT +NNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNN diff --git a/tests/splite_site_motif/test_splice_site_motif_01.in.sam b/tests/splite_site_motif/test_splice_site_motif_01.in.sam new file mode 100644 index 00000000..6cd7ef84 --- /dev/null +++ b/tests/splite_site_motif/test_splice_site_motif_01.in.sam @@ -0,0 +1,47 @@ +@HD VN:1.4 SO:coordinate +@SQ SN:chr1 LN:248956422 +@SQ SN:chr2 LN:242193529 +@SQ SN:chr3 LN:198295559 +@SQ SN:chr4 LN:190214555 +@SQ SN:chr5 LN:181538259 +@SQ SN:chr6 LN:170805979 +@SQ SN:chr7 LN:159345973 +@SQ SN:chr8 LN:145138636 +@SQ SN:chr9 LN:138394717 +@SQ SN:chr10 LN:133797422 +@SQ SN:chr11 LN:135086622 +@SQ SN:chr12 LN:133275309 
+@SQ SN:chr13 LN:114364328 +@SQ SN:chr14 LN:107043718 +@SQ SN:chr15 LN:101991189 +@SQ SN:chr16 LN:90338345 +@SQ SN:chr17 LN:83257441 +@SQ SN:chr18 LN:80373285 +@SQ SN:chr19 LN:58617616 +@SQ SN:chr20 LN:64444167 +@SQ SN:chr21 LN:46709983 +@SQ SN:chr22 LN:50818468 +@SQ SN:chrX LN:156040895 +@SQ SN:chrY LN:57227415 +@SQ SN:chrM LN:16569 +@CO user command line: STAR --outSAMstrandField intronMotif --outFilterIntronMotifs RemoveNoncanonicalUnannotated --chimSegmentMin 12 --chimJunctionOverhangMin 12 --alignSJDBoverhangMin 10 --alignMatesGapMax 200000 --alignIntronMax 200000 --outSAMtype BAM Unsorted --outSAMunmapped Within --alignEndsType Local --chimOutType WithinBAM --twopassMode Basic --twopass1readsN -1 --quantMode GeneCounts --runThreadN 10 +read_001 163 chr2 10 3 75M = 176 180 GGTGCGCGCACCCACTGACCTGCGCCCACTGTCTGGCACTCCGTAGTGAGATGAACCCGGTACCTCAGATGGAAA ?E;DDEFBDEBCF:CEHGHFH??DEDDFHBFCDDFF?JFEDDDA8F9EDG>EFFGGEBDD7DG88-.G88@@HG? NH:i:2 HI:i:2 AS:i:87 nM:i:0 NM:i:0 +read_002 161 chr2 42 3 75M = 1100 0 CTGGCACTCCGTAGTGAGATGAACCCGGTACCTCAGATGGAAATGCAGAAATCACCCATCTTCTGCGTCACTCAA ?DADDEFBFECCGG?DGFDDGFFBGHGFHEGFDGGFHHFEGFHGGFGGFGGFCEGDFFIHGAHIFGAFHEHFHGE NH:i:2 HI:i:2 AS:i:73 nM:i:0 NM:i:0 +read_003 385 chr2 142 3 48M27S = 1099 0 TCCTATTCGGCCATCTTGGCTCCTCCGCTCTATTTGCAGTTCTTAAAGGGCCCCAGGATCCTCTATTAAATGTGT 9DADFEFFFEGFFFGHGGH9HF78GAGFFFHHHG:GHFHEGEFE.DGGHGCGIHFHGEGFGHFIHIGGHH@GDG? 
NH:i:2 HI:i:2 AS:i:47 nM:i:0 NM:i:0 +read_004 385 chr2 149 3 41M34S = 1026 0 CGGCCATCTTGGCTCCTCCGCTCTATTTGCAGTTCTTAAAGGGCCCCAGGATCCTCTATTAAATGTGTGGTCCAT ?CDDDEFFFEGAGFEFDF;FH?DGFHGFFEHHAGFGHHHGGGICGIGGEHGDIFFGEEIHGHGKFFGHHHGGGGE NH:i:2 HI:i:2 AS:i:40 nM:i:0 NM:i:0 +read_005 385 chr2 149 3 41M34S = 1026 0 CGGCCATCTTGGCTCCTCCGCTCTATTTGCAGTTCTTAAAGGGCCCCAGGATCCTCTATTAAATGTGTGGTCCAT ?CDDFAFGDEGFFFFHGFFFGCFGBGGGFEHCFGFGEJHEGH?EGFGGHHAEGFGHHHGHEHGGFBGFDHGGEFG NH:i:2 HI:i:2 AS:i:40 nM:i:0 NM:i:0 +read_006 385 chr2 149 3 41M34S = 1026 0 CGGCCATCTTGGCTCCTCCGCTCTATTTGCAGTTCTTAAAGGGCCCCAGGATCCTCTATTAAATGTGTGGTCCAT ?EEDFEFFFEGHFFEGGGHFHGDGEGHFFGHHFGFGGEHIGGIEGIGGDGGGIGFGHFIFGHHGJFFGEHGGHGH NH:i:2 HI:i:2 AS:i:40 nM:i:0 NM:i:0 +read_007 385 chr2 163 3 27M48S = 1210 0 CCTCCGCTCTATTTGCAGTTCTTAAAGGGCCCCAGGATCCTCTATTAAATGTGTGGTCCATGAACCAGCAGCTTC >ACDF?FACDF?F NH:i:2 HI:i:1 AS:i:107 nM:i:0 NM:i:0 +read_006 83 chr2 1026 3 75M = 1000 -101 TGGTCCATGAACCAGCAGCTTCAGCATGACCTGAGAGCTCATAACCTCGTCTCTACAAAAAATACAAAAAAAGTT FGHFGGEGHIHGGGGIHG?IEIFH7GGHFGGHGCFKHHGFGHGDIGGFHEIGHEGGHHHGHHEFGGGGFFFGDDE NH:i:2 HI:i:1 AS:i:107 nM:i:0 NM:i:0 +read_003 83 chr2 1099 3 75M = 1000 -174 TTAGNCAGGCATGGTGGTACGCGCCTGTGGTCTCAGCAACTTGGGAGGCTGAGGTGAGTGGATTGCTTGAACCTG FEHF!GF:FECFGGEGGFHGEEGGGGGDEGGGDHBGFFFFGG=GFGGDHFFGFFGHGHGDHGHGGFGGFGEDDBE NH:i:2 HI:i:1 AS:i:95 nM:i:2 NM:i:0 +read_002 81 chr2 1100 3 75M = 42 0 TAGCCAGGCATGGTGGTACGCGCCTGTGGTCTCAGCAACTTGGGAGGCTGAGGTGAGTGGATTGCTTGAACCTGG FGGHEGHGGGHDFF:GHGEGGIEGGGGHIFGDFGEAHHFGGHGEFGGFHGFGCGGGFHEGFHGEAGGGFFE>DDD NH:i:2 HI:i:1 AS:i:69 nM:i:2 NM:i:0 +read_007 83 chr2 1210 3 75M = 1000 -285 CTCCGGCCTGGGAGATGGAGCCGGACACTGTCTCAAAAGAAAAAAAAAAGAAATGCAGAACCTCAGGCTGTTCCC F9BCFDEHGFCDGFD:DFCGEFFGD90FEDGG@ACGEH Date: Wed, 3 Jan 2018 15:38:25 +0100 Subject: [PATCH 03/25] splice --- drdisco/DetectOutput.py | 55 ++++++++++++- share/blacklist-junctions.hg38.txt | 3 +- tests/integrate/splice_junction_motif.fa | 17 ++++ .../test_splice_site_motif_01.dbed | 2 + 
.../test_splice_site_motif_01.fixed.bam | Bin 0 -> 2631 bytes .../test_splice_site_motif_01.in.fa | 0 .../test_splice_site_motif_01.in.sam | 0 .../test_splice_site_motif_01.out.txt | 0 tests/test_splice_site_motif.py | 77 ++++++++++++++++++ 9 files changed, 150 insertions(+), 4 deletions(-) create mode 100644 tests/integrate/splice_junction_motif.fa create mode 100644 tests/splice_site_motif/test_splice_site_motif_01.dbed create mode 100644 tests/splice_site_motif/test_splice_site_motif_01.fixed.bam rename tests/{splite_site_motif => splice_site_motif}/test_splice_site_motif_01.in.fa (100%) rename tests/{splite_site_motif => splice_site_motif}/test_splice_site_motif_01.in.sam (100%) create mode 100644 tests/splice_site_motif/test_splice_site_motif_01.out.txt create mode 100755 tests/test_splice_site_motif.py diff --git a/drdisco/DetectOutput.py b/drdisco/DetectOutput.py index 582fd405..568fcbfa 100644 --- a/drdisco/DetectOutput.py +++ b/drdisco/DetectOutput.py @@ -126,7 +126,7 @@ def parse(self): if self.acceptorA > self.donorA: self.RNAstrandA = self.strandA self.RNAstrandB = inv[self.strandB] - elif self.donorA < self.acceptorA: + elif self.acceptorA < self.donorA: self.RNAstrandA = inv[self.strandA] self.RNAstrandB = self.strandB else: @@ -178,6 +178,52 @@ def pos_to_gene_str(pos_chr, pos_pos): else: return genesB + '<->' + genesA + def is_on_splice_junction_motif(self, fasta_fh): + """ + +motif: + +5' exon: + +[ ...{AC}{A}{G} ] {G}{T}{AG}{A}{G}{T} . . . {C}{A}{G} [ {G}... 
] + + """ + + pos5_in_exon_length = 3 + pos5_post_exon_length = 6 + + pos3_pre_exon_length = 3 + pos3_in_exon_length = 1 + + print + print + print + print "calculating change on intron/exon junction using fasta" + print "rna-strand: + - means breakA -> breakB, means A=5' B=3'" + + if self.donorA > self.donorB: + pos5p = [self.chrA, self.posA, self.strandA] + pos3p = [self.chrB, self.posB, self.strandB] + elif self.donorA < self.donorB: + pos5p = [self.chrB, self.posB, self.strandB] + pos3p = [self.chrA, self.posA, self.strandA] + else: + pos5p = None + + print self.RNAstrandA, self.RNAstrandB + + if pos5p: + print "lets proceed" + print "from" , pos5p + if pos5p[2] == '-': + # read sequence downstream + print " ... exon ]{} {} {}" + else: + print "{} {} {} [ exon ..." + # read sequence upstream + print + print + def __str__(self): line = self.line line[11] = self.status @@ -354,7 +400,7 @@ def classify_intronic_exonic(): log.info("Classified " + str(k) + "/" + str(n) + " as valid") - def integrate(self, output_table, gtf_file): + def integrate(self, output_table, gtf_file, fasta_file): def insert_in_index(index, entries, score): if score not in index: index[score] = {} @@ -374,6 +420,8 @@ def insert_in_index(index, entries, score): # index used to annotate gene names: TMPRSS2->ERG gene_annotation = GeneAnnotation(gtf_file) dfs = DetectFrameShifts(gtf_file) if gtf_file else None + + ffs = HTSeq.FastaReader(fasta_file) if fasta_file else None intronic_linear = [] remainder = [] @@ -421,6 +469,9 @@ def insert_in_index(index, entries, score): e.frameshift_1 = ','.join(sorted(list(set(frameshifts_1)))) e.frameshift_2 = ','.join(sorted(list(set(frameshifts_2)))) + if ffs: + e.is_on_splice_junction_motif(ffs) + if e.x_onic == 'intronic' and e.circ_lin == 'linear': intronic_linear.append(e) else: diff --git a/share/blacklist-junctions.hg38.txt b/share/blacklist-junctions.hg38.txt index 0aeec0b9..6c5a15f1 100644 --- a/share/blacklist-junctions.hg38.txt +++ 
b/share/blacklist-junctions.hg38.txt @@ -3122,6 +3122,5 @@ chr2 81548742 81548753 - chr2 81822264 81822275 + most likely new gene brca chr2 81548742 81548753 - chr2 81848747 81848758 + most likely new gene brca chr2 81570408 81570419 - chr2 81797812 81797823 + most likely new gene brca chr2 81570408 81570419 - chr2 81848747 81848758 + most likely new gene brca - - +chr2 173533339 173533340 - chr2 173761578 173761579 + most likely new gene brca diff --git a/tests/integrate/splice_junction_motif.fa b/tests/integrate/splice_junction_motif.fa new file mode 100644 index 00000000..6c7f04fd --- /dev/null +++ b/tests/integrate/splice_junction_motif.fa @@ -0,0 +1,17 @@ +>perfect_fwd +tttttttttttttttttttttttttttttttttttttttt +tttttttttttttttttttttttttttttttttttttttt +ttttttttttCAggTGAGTttttttttttttttttttttt +ttttttttttCAggtttttttttttttttttttttttttt +tttttttttttttttttttttttttttttttttttttttt +tttttttttttttttttttttttttttttttttttttttt +tttttttttttttttttttttttttttttttttttttttt +>imperfect_fwd +tttttttttttttttttttttttttttttttttttttttt +tttttttttttttttttttttttttttttttttttttttt +tttttttttttttttttttttttttttttttttttttttt +ttttttttttCAggTAAGAttttttttttttttttttttt +ttttttttttCAggtttttttttttttttttttttttttt +tttttttttttttttttttttttttttttttttttttttt +tttttttttttttttttttttttttttttttttttttttt +tttttttttttttttttttttttttttttttttttttttt diff --git a/tests/splice_site_motif/test_splice_site_motif_01.dbed b/tests/splice_site_motif/test_splice_site_motif_01.dbed new file mode 100644 index 00000000..83a72e86 --- /dev/null +++ b/tests/splice_site_motif/test_splice_site_motif_01.dbed @@ -0,0 +1,2 @@ +chr-A pos-A direction-A pos-A-acceptor pos-A-donor chr-B pos-B direction-B pos-B-acceptor pos-B-donor genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads alignment-score mismatches n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps lr-A-slope lr-A-intercept lr-A-rvalue 
lr-A-pvalue lr-A-stderr lr-B-slope lr-B-intercept lr-B-rvalue lr-B-pvalue lr-B-stderr disco/split clips/score nodes/edge median-AS-A median-AS-B max-AS-A max-AS-B data-structure +chr2 189 - 0 14 chr2 999 + 14 0 810 unclassified linear intronic 19 12 6 1 1019 4 1 1 1 0 0 0.7580 0.7580 0.0000 1.0000 8.0357 16.8929 0.9213 0.0032 1.5165 7.5714 22.0000 0.9311 0.0023 1.3263 0.1667 0.3158 2.0000 41 34 75 75 chr2:189/190(-)->chr2:999/1000(+):(discordant_mates:2,spanning_paired_1:5,spanning_paired_1_t:1,spanning_paired_2:5,spanning_paired_2_t:1) diff --git a/tests/splice_site_motif/test_splice_site_motif_01.fixed.bam b/tests/splice_site_motif/test_splice_site_motif_01.fixed.bam new file mode 100644 index 0000000000000000000000000000000000000000..0400ae10efb3ae522e21630133cab906a53d6a23 GIT binary patch literal 2631 zcmV-N3b^$jiwFb&00000{{{d;LjnN(1HD#FYa~S&E?qa-j;wmngBPib;2tJ)Rabw_ z#eBJOLwd4ivR{Y{z1@{e!F1JZSIuVkqBnnm=t1zJh!=l=u%IUq@e&0+cUQRa-Nimt2;-*JY6 zel{+VvpeWx62*Q%Ft!L>6Bxywk7JAQDgwn3j(y*^2~ra-Mnp3f;Ts}Cz8B%Sh`SD%Y81Ms5&g#oe$gN^=x=no3)!C)hw1fd(pkv(H<)a$9Bhy=EfyY*HAm$>@8 zSj0UO5Mmq>WZM%H5cq)>y}#|;*y`s@X8IJST#X8S5T&y<>>rGoge6ULI8G%z^?;QJ zoTjX7qa~ZrCbA0L*D(@`L#V0@>hYAqw4{&=WoJ1a>nQC)Sy^XfvdcG3IcNN6R28B5 z2+g;1tlLQ^?1o@-JH({#!^$HF{cXXGE1|5F9{hE!$cU{sga z&aNoWrJ{UFMRhI})tMFXId2VL_}1`+Zw+7E)};QKP$l?6kK6s3(hv62_hn^XXl0A# zUgWqfnEAhVeRJEnIq2u5dH=yf_NbSQSwTydnYV9kA8Grjv+F+y?kXayuCuw@Us>DS zb7qp3FcU>lt#QIQ?Ze^0`abOS#7qs>lX}6|8J*;iaa9VQ2*nO-{yLi|THZJ{OSn%9 z@sJJ#&r&Wp%hJg$uEh(bTh4V<)yj|_75d$tFb)xJY&$caRa*BR(sG>gqmDG1hdWyv z!e-o-NvfM=JDn!!BZ%GV(=tt4w!M~Q4(rBx0&j6C%DSzZneif>PPO3<)7K1dGrh%g zc~AxI>V-_2hWBR5*lO1*U9YE?Hch2WM6L^@l4YkhR4rK!U=L~AgPFdpjMp~hbW_Z@ zlGu+`8gQf3(eW3vuJpM?uzyr(Qojl(F0zQ3ky9GY@i*>xg&-aO^<0Z0@cQA89g4wA;{8hv!Y)|8^IL}y z_~XeB&$bAIFN)<$jfA`4+kbXjareL{5o?(Qe6t3?zq8$qm@f+e03VA81ONa400936 z0763o04xT@n9Xk!M;OM}feQsxY^~xAH1^K-1X4A2Y$p*t;LWymfFOYnNL-?bL~YbI zMS)9ic(n&#D)qpvr?&T&LofUnJ@gN#)Jmw=o>A%Z?mG55Yyw6=BxXNm9f{}nempbt 
z%yFCo#wi*n)=CBEXXn~r?pKW8F!r`Lw>B0QYR==6juXaGGR|z9#}c=W4VmQ~H+H>t ziHAKNxn5X2SCQw1kctWXR?To}v{9x~5mE?cnBusp%ACTh@XN}RLeEO2w3bYWX`##I zGiPzBR*kOQ=&Vk4)>fz5-PPN>>#NQUj9Z`Gxxe$(_U?m?uQwlTJKgr3yI<_p=W9#N z`C4PWRqM_#&VM++a5Sy{nsw^dtDXPxW8ouQ9FUsU^_$Kr1%#!>4o zHc}i5`#9>&A&zV*G$z5(d<`7RS6U?5kw8$IE6pGfIPe7NbI1-;p?rXLUk(VN3h*0- zlEGan{f=-nC&AI06o-`i#bW59sWg!K$tzS74x|L~!b-{xL3xta34PXclqpnV+Poti zts#z|a&SBu$Fbx*#_UP-kE1!nMeeeo@6Y;NsEWE^e3%87s;WHB1S?Ogvd5~7%hQUh zst`=5GjNKksrXf1Eek1iSxT=AfSi&@vTW}-KPis-Qn#@@j-$S?l8a->d9N_?$P`C+ z=o?R={QC3f&xa=SCno>EWM1ctI~-2J0@p?^i%YhBg}eaq)q+ztz&&9i2tK2jV*-ru zBour_`?ku^S5R2gQjKCJ>9ABV@Epln2V+38ra7?E0gX!sTy#Gl%WFQGAV)J6@)vX1;1ua%b?OCAntC)-(Ld4#s%AEU7l zjxS#!IWjnw(K^-_8{G+wZa35sp?J#4e5I!f|uxv!|RfpRGB6p;Pbway9jHz(0gktcDain2m+nC6XzfrlF=UtF*Ex%^*1Nc-3?y9eqePqwvu|9If_a3-yG_n8}Ekc@k}pZA&Wa%U>Bm(r6c9&$+YLYIf7QXECaVjJ0Abd%#K9zk@U`^rOK7u41` zokGn}f2X51;k~cmdAjQ3-CQ6&#*#XT^;Ct_X%B6|PY$lQWP-Ci4?H5&Is2&tW;kYi p{{zj4(LRb4001A02m}BC000301^_}s0stET0{{R300000001l^{`>#{ literal 0 HcmV?d00001 diff --git a/tests/splite_site_motif/test_splice_site_motif_01.in.fa b/tests/splice_site_motif/test_splice_site_motif_01.in.fa similarity index 100% rename from tests/splite_site_motif/test_splice_site_motif_01.in.fa rename to tests/splice_site_motif/test_splice_site_motif_01.in.fa diff --git a/tests/splite_site_motif/test_splice_site_motif_01.in.sam b/tests/splice_site_motif/test_splice_site_motif_01.in.sam similarity index 100% rename from tests/splite_site_motif/test_splice_site_motif_01.in.sam rename to tests/splice_site_motif/test_splice_site_motif_01.in.sam diff --git a/tests/splice_site_motif/test_splice_site_motif_01.out.txt b/tests/splice_site_motif/test_splice_site_motif_01.out.txt new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_splice_site_motif.py b/tests/test_splice_site_motif.py new file mode 100755 index 00000000..22fb722d --- /dev/null +++ b/tests/test_splice_site_motif.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python +# *- coding: utf-8 
-*- +# vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4 textwidth=79: + +""" +Dr. Disco - testing fix-chimeric + +[License: GNU General Public License v3 (GPLv3)] + + This file is part of Dr. Disco. + + FuMa is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Dr. Disco is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + You should have received a copy of the GNU General Public License + along with this program. If not, see . +""" + + +from drdisco.IntronDecomposition import IntronDecomposition + +import unittest +import filecmp +import os +import subprocess +from drdisco.DetectOutput import DetectOutput +from utils import main, sam_to_fixed_bam + + +TEST_DIR = "tests/splice_site_motif/" +T_TEST_DIR = "tmp/" + TEST_DIR + + +# Nosetests doesn't use main() +if not os.path.exists(T_TEST_DIR): + os.makedirs(T_TEST_DIR) + + +class TestIntronicBreakDetection(unittest.TestCase): + def test_sj_01(self): + test_id = 'splice_site_motif_01' + + input_sam = TEST_DIR + "test_" + test_id + ".in.sam" + input_bam = TEST_DIR + "test_" + test_id + ".fixed.bam" + input_file = TEST_DIR + "test_" + test_id + ".dbed" + + gtf_file = None + fasta_file = TEST_DIR + "test_" + test_id + ".in.fa" + output_file = T_TEST_DIR + "test_" + test_id + ".out.txt" + + test_file = TEST_DIR + "test_" + test_id + ".out.txt" + + # sam -> fixed bam + sam_to_fixed_bam(input_sam, input_bam, T_TEST_DIR) + + # fixed bam -> dr-disco detect + ic = IntronDecomposition(input_bam) + ic.decompose(0) + fh = open(input_file, "w") + ic.export(fh) + fh.close() + + # dr-disco-detect (skip classify) -> dr-disco integrate + cl = DetectOutput(input_file) + 
cl.integrate(output_file, gtf_file, fasta_file) + + self.assertTrue(filecmp.cmp(test_file, output_file), msg="diff '" + test_file + "' '" + output_file + "':\n" + subprocess.Popen(['diff', test_file, output_file], stdout=subprocess.PIPE).stdout.read()) + + +if __name__ == '__main__': + main() From 3df87f4fcdb6923df334cec31bad4f83733832e5 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 3 Jan 2018 16:25:32 +0100 Subject: [PATCH 04/25] forward seems te go okayist --- drdisco/DetectOutput.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/drdisco/DetectOutput.py b/drdisco/DetectOutput.py index 568fcbfa..2757e9f2 100644 --- a/drdisco/DetectOutput.py +++ b/drdisco/DetectOutput.py @@ -210,17 +210,32 @@ def is_on_splice_junction_motif(self, fasta_fh): else: pos5p = None - print self.RNAstrandA, self.RNAstrandB if pos5p: + sequences = dict( (s.name, s) for s in fasta_fh ) + print "lets proceed" print "from" , pos5p if pos5p[2] == '-': - # read sequence downstream - print " ... exon ]{} {} {}" + #print " ... exon ] {G} {T} {AG} {A} {G} {T}" + seq_in_5p_exon = str(sequences['chr2'][pos5p[1]-pos5_in_exon_length:pos5p[1]]) + seq_post_5p_exon = str(sequences['chr2'][pos5p[1]:pos5p[1]+pos5_post_exon_length]) + else: + #print "{T} {G} {A} {GA} {T} {G} [ exon ..." + seq_in_5p_exon = str(sequences['chr2'][pos5p[1]:pos5p[1]+pos5_in_exon_length]) + " " + seq_post_5p_exon = str(sequences['chr2'][pos5p[1]-pos5_post_exon_length:pos5p[1]]) + " " + + if pos3p[2] == '+': + #print "{C} {A} {G} [ exon ..." + seq_pre_3p_exon = str(sequences['chr2'][pos3p[1]-pos3_pre_exon_length:pos3p[1]]) + seq_in_3p_exon = str(sequences['chr2'][pos3p[1]:pos3p[1]+pos3_in_exon_length]) else: - print "{} {} {} [ exon ..." - # read sequence upstream + #print "... 
exon ] {G} {A} {C}" + seq_in_3p_exon = str(sequences['chr2'][pos3p[1]-pos3_in_exon_length:pos3p[1]]) + " " + seq_pre_3p_exon = str(sequences['chr2'][pos3p[1]:pos3p[1]+pos3_pre_exon_length]) + " " + + print "[ ... " + seq_in_5p_exon + " ] " + seq_post_5p_exon + " ... ... " + seq_pre_3p_exon + " [ " + seq_in_3p_exon + " ... ]" + print print From 2b210528d35ac143ebe03ee739689d74d0f613b4 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 3 Jan 2018 17:44:09 +0100 Subject: [PATCH 05/25] almost there with obbtaining data --- drdisco/DetectOutput.py | 16 ++--- .../test_splice_site_motif_01.dbed | 3 +- .../test_splice_site_motif_01.fixed.bam | Bin 2631 -> 0 bytes ...txt => test_splice_site_motif_01.out.dbed} | 0 .../test_splice_site_motif_02.in.fa | 38 ++++++++++++ .../test_splice_site_motif_02.in.sam | 57 ++++++++++++++++++ tests/test_splice_site_motif.py | 36 ++++++++++- 7 files changed, 137 insertions(+), 13 deletions(-) delete mode 100644 tests/splice_site_motif/test_splice_site_motif_01.fixed.bam rename tests/splice_site_motif/{test_splice_site_motif_01.out.txt => test_splice_site_motif_01.out.dbed} (100%) create mode 100644 tests/splice_site_motif/test_splice_site_motif_02.in.fa create mode 100644 tests/splice_site_motif/test_splice_site_motif_02.in.sam diff --git a/drdisco/DetectOutput.py b/drdisco/DetectOutput.py index 2757e9f2..b6ebef27 100644 --- a/drdisco/DetectOutput.py +++ b/drdisco/DetectOutput.py @@ -218,21 +218,21 @@ def is_on_splice_junction_motif(self, fasta_fh): print "from" , pos5p if pos5p[2] == '-': #print " ... exon ] {G} {T} {AG} {A} {G} {T}" - seq_in_5p_exon = str(sequences['chr2'][pos5p[1]-pos5_in_exon_length:pos5p[1]]) - seq_post_5p_exon = str(sequences['chr2'][pos5p[1]:pos5p[1]+pos5_post_exon_length]) + seq_in_5p_exon = str(sequences[pos5p[0]][pos5p[1]-pos5_in_exon_length:pos5p[1]]) + seq_post_5p_exon = str(sequences[pos5p[0]][pos5p[1]:pos5p[1]+pos5_post_exon_length]) else: #print "{T} {G} {A} {GA} {T} {G} [ exon ..." 
- seq_in_5p_exon = str(sequences['chr2'][pos5p[1]:pos5p[1]+pos5_in_exon_length]) + " " - seq_post_5p_exon = str(sequences['chr2'][pos5p[1]-pos5_post_exon_length:pos5p[1]]) + " " + seq_in_5p_exon = str(sequences[pos5p[0]][pos5p[1]:pos5p[1]+pos5_in_exon_length]) + "-" + seq_post_5p_exon = str(sequences[pos5p[0]][pos5p[1]-pos5_post_exon_length:pos5p[1]]) + "-" if pos3p[2] == '+': #print "{C} {A} {G} [ exon ..." - seq_pre_3p_exon = str(sequences['chr2'][pos3p[1]-pos3_pre_exon_length:pos3p[1]]) - seq_in_3p_exon = str(sequences['chr2'][pos3p[1]:pos3p[1]+pos3_in_exon_length]) + seq_pre_3p_exon = str(sequences[pos3p[0]][pos3p[1]-pos3_pre_exon_length:pos3p[1]]) + seq_in_3p_exon = str(sequences[pos3p[0]][pos3p[1]:pos3p[1]+pos3_in_exon_length]) else: #print "... exon ] {G} {A} {C}" - seq_in_3p_exon = str(sequences['chr2'][pos3p[1]-pos3_in_exon_length:pos3p[1]]) + " " - seq_pre_3p_exon = str(sequences['chr2'][pos3p[1]:pos3p[1]+pos3_pre_exon_length]) + " " + seq_in_3p_exon = str(sequences[pos3p[0]][pos3p[1]-pos3_in_exon_length:pos3p[1]]) + "-" + seq_pre_3p_exon = str(sequences[pos3p[0]][pos3p[1]:pos3p[1]+pos3_pre_exon_length]) + "-" print "[ ... " + seq_in_5p_exon + " ] " + seq_post_5p_exon + " ... ... " + seq_pre_3p_exon + " [ " + seq_in_3p_exon + " ... 
]" diff --git a/tests/splice_site_motif/test_splice_site_motif_01.dbed b/tests/splice_site_motif/test_splice_site_motif_01.dbed index 83a72e86..8b137891 100644 --- a/tests/splice_site_motif/test_splice_site_motif_01.dbed +++ b/tests/splice_site_motif/test_splice_site_motif_01.dbed @@ -1,2 +1 @@ -chr-A pos-A direction-A pos-A-acceptor pos-A-donor chr-B pos-B direction-B pos-B-acceptor pos-B-donor genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads alignment-score mismatches n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps lr-A-slope lr-A-intercept lr-A-rvalue lr-A-pvalue lr-A-stderr lr-B-slope lr-B-intercept lr-B-rvalue lr-B-pvalue lr-B-stderr disco/split clips/score nodes/edge median-AS-A median-AS-B max-AS-A max-AS-B data-structure -chr2 189 - 0 14 chr2 999 + 14 0 810 unclassified linear intronic 19 12 6 1 1019 4 1 1 1 0 0 0.7580 0.7580 0.0000 1.0000 8.0357 16.8929 0.9213 0.0032 1.5165 7.5714 22.0000 0.9311 0.0023 1.3263 0.1667 0.3158 2.0000 41 34 75 75 chr2:189/190(-)->chr2:999/1000(+):(discordant_mates:2,spanning_paired_1:5,spanning_paired_1_t:1,spanning_paired_2:5,spanning_paired_2_t:1) + diff --git a/tests/splice_site_motif/test_splice_site_motif_01.fixed.bam b/tests/splice_site_motif/test_splice_site_motif_01.fixed.bam deleted file mode 100644 index 0400ae10efb3ae522e21630133cab906a53d6a23..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2631 zcmV-N3b^$jiwFb&00000{{{d;LjnN(1HD#FYa~S&E?qa-j;wmngBPib;2tJ)Rabw_ z#eBJOLwd4ivR{Y{z1@{e!F1JZSIuVkqBnnm=t1zJh!=l=u%IUq@e&0+cUQRa-Nimt2;-*JY6 zel{+VvpeWx62*Q%Ft!L>6Bxywk7JAQDgwn3j(y*^2~ra-Mnp3f;Ts}Cz8B%Sh`SD%Y81Ms5&g#oe$gN^=x=no3)!C)hw1fd(pkv(H<)a$9Bhy=EfyY*HAm$>@8 zSj0UO5Mmq>WZM%H5cq)>y}#|;*y`s@X8IJST#X8S5T&y<>>rGoge6ULI8G%z^?;QJ zoTjX7qa~ZrCbA0L*D(@`L#V0@>hYAqw4{&=WoJ1a>nQC)Sy^XfvdcG3IcNN6R28B5 z2+g;1tlLQ^?1o@-JH({#!^$HF{cXXGE1|5F9{hE!$cU{sga 
z&aNoWrJ{UFMRhI})tMFXId2VL_}1`+Zw+7E)};QKP$l?6kK6s3(hv62_hn^XXl0A# zUgWqfnEAhVeRJEnIq2u5dH=yf_NbSQSwTydnYV9kA8Grjv+F+y?kXayuCuw@Us>DS zb7qp3FcU>lt#QIQ?Ze^0`abOS#7qs>lX}6|8J*;iaa9VQ2*nO-{yLi|THZJ{OSn%9 z@sJJ#&r&Wp%hJg$uEh(bTh4V<)yj|_75d$tFb)xJY&$caRa*BR(sG>gqmDG1hdWyv z!e-o-NvfM=JDn!!BZ%GV(=tt4w!M~Q4(rBx0&j6C%DSzZneif>PPO3<)7K1dGrh%g zc~AxI>V-_2hWBR5*lO1*U9YE?Hch2WM6L^@l4YkhR4rK!U=L~AgPFdpjMp~hbW_Z@ zlGu+`8gQf3(eW3vuJpM?uzyr(Qojl(F0zQ3ky9GY@i*>xg&-aO^<0Z0@cQA89g4wA;{8hv!Y)|8^IL}y z_~XeB&$bAIFN)<$jfA`4+kbXjareL{5o?(Qe6t3?zq8$qm@f+e03VA81ONa400936 z0763o04xT@n9Xk!M;OM}feQsxY^~xAH1^K-1X4A2Y$p*t;LWymfFOYnNL-?bL~YbI zMS)9ic(n&#D)qpvr?&T&LofUnJ@gN#)Jmw=o>A%Z?mG55Yyw6=BxXNm9f{}nempbt z%yFCo#wi*n)=CBEXXn~r?pKW8F!r`Lw>B0QYR==6juXaGGR|z9#}c=W4VmQ~H+H>t ziHAKNxn5X2SCQw1kctWXR?To}v{9x~5mE?cnBusp%ACTh@XN}RLeEO2w3bYWX`##I zGiPzBR*kOQ=&Vk4)>fz5-PPN>>#NQUj9Z`Gxxe$(_U?m?uQwlTJKgr3yI<_p=W9#N z`C4PWRqM_#&VM++a5Sy{nsw^dtDXPxW8ouQ9FUsU^_$Kr1%#!>4o zHc}i5`#9>&A&zV*G$z5(d<`7RS6U?5kw8$IE6pGfIPe7NbI1-;p?rXLUk(VN3h*0- zlEGan{f=-nC&AI06o-`i#bW59sWg!K$tzS74x|L~!b-{xL3xta34PXclqpnV+Poti zts#z|a&SBu$Fbx*#_UP-kE1!nMeeeo@6Y;NsEWE^e3%87s;WHB1S?Ogvd5~7%hQUh zst`=5GjNKksrXf1Eek1iSxT=AfSi&@vTW}-KPis-Qn#@@j-$S?l8a->d9N_?$P`C+ z=o?R={QC3f&xa=SCno>EWM1ctI~-2J0@p?^i%YhBg}eaq)q+ztz&&9i2tK2jV*-ru zBour_`?ku^S5R2gQjKCJ>9ABV@Epln2V+38ra7?E0gX!sTy#Gl%WFQGAV)J6@)vX1;1ua%b?OCAntC)-(Ld4#s%AEU7l zjxS#!IWjnw(K^-_8{G+wZa35sp?J#4e5I!f|uxv!|RfpRGB6p;Pbway9jHz(0gktcDain2m+nC6XzfrlF=UtF*Ex%^*1Nc-3?y9eqePqwvu|9If_a3-yG_n8}Ekc@k}pZA&Wa%U>Bm(r6c9&$+YLYIf7QXECaVjJ0Abd%#K9zk@U`^rOK7u41` zokGn}f2X51;k~cmdAjQ3-CQ6&#*#XT^;Ct_X%B6|PY$lQWP-Ci4?H5&Is2&tW;kYi p{{zj4(LRb4001A02m}BC000301^_}s0stET0{{R300000001l^{`>#{ diff --git a/tests/splice_site_motif/test_splice_site_motif_01.out.txt b/tests/splice_site_motif/test_splice_site_motif_01.out.dbed similarity index 100% rename from tests/splice_site_motif/test_splice_site_motif_01.out.txt rename to tests/splice_site_motif/test_splice_site_motif_01.out.dbed diff --git 
a/tests/splice_site_motif/test_splice_site_motif_02.in.fa b/tests/splice_site_motif/test_splice_site_motif_02.in.fa new file mode 100644 index 00000000..1765dee6 --- /dev/null +++ b/tests/splice_site_motif/test_splice_site_motif_02.in.fa @@ -0,0 +1,38 @@ +>chr21 +NNNNNNNNNNTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT +TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT +TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT +TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT +TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT +TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT +TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT +TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT +TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT +TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT +TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT +TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTCCTGNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN +NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNACTTACC +TGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA 
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA +AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA +AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA +AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA diff --git a/tests/splice_site_motif/test_splice_site_motif_02.in.sam b/tests/splice_site_motif/test_splice_site_motif_02.in.sam new file mode 100644 index 00000000..fb7531fb --- /dev/null +++ b/tests/splice_site_motif/test_splice_site_motif_02.in.sam @@ -0,0 +1,57 @@ +@HD VN:1.4 SO:coordinate +@SQ SN:chrM LN:16571 +@SQ SN:chr1 LN:249250621 +@SQ SN:chr2 LN:243199373 +@SQ SN:chr3 LN:198022430 +@SQ SN:chr4 LN:191154276 +@SQ SN:chr5 LN:180915260 +@SQ SN:chr6 LN:171115067 +@SQ SN:chr7 LN:159138663 +@SQ SN:chr8 LN:146364022 +@SQ SN:chr9 LN:141213431 +@SQ SN:chr10 LN:135534747 +@SQ SN:chr11 LN:135006516 +@SQ SN:chr12 LN:133851895 +@SQ SN:chr13 LN:115169878 +@SQ SN:chr14 LN:107349540 +@SQ SN:chr15 LN:102531392 +@SQ SN:chr16 LN:90354753 +@SQ SN:chr17 LN:81195210 +@SQ SN:chr18 LN:78077248 +@SQ SN:chr19 LN:59128983 +@SQ SN:chr20 LN:63025520 +@SQ SN:chr21 LN:48129895 +@SQ SN:chr22 LN:51304566 +@SQ SN:chrX LN:155270560 +@SQ SN:chrY LN:59373566 +@PG ID:STAR PN:STAR VN:STAR_2.4.2a_modified CL:STAR --runThreadN 9 --genomeDir STAR_index_hg19/ --readFilesIn R1_paired.fastq.gz R2_paired.fastq.gz --readFilesCommand zcat --outFileNamePrefix paired/ --outSAMtype BAM SortedByCoordinate --outSAMstrandField intronMotif --outFilterIntronMotifs None --alignIntronMax 200000 --alignMatesGapMax 200000 --alignSJDBoverhangMin 10 --alignEndsType Local --chimSegmentMin 12 --chimJunctionOverhangMin 12 --sjdbGTFfile gencode.v19.annotation.gtf --sjdbOverhang 100 --quantMode GeneCounts --twopass1readsN 18446744073709551615 --twopassMode Basic +@CO user command line: STAR --genomeDir STAR_index_hg19/ --sjdbOverhang 100 --sjdbGTFfile gencode.v19.annotation.gtf --quantMode GeneCounts --outSAMstrandField intronMotif --outFilterIntronMotifs None --chimSegmentMin 12 --chimJunctionOverhangMin 12 
--alignSJDBoverhangMin 10 --alignMatesGapMax 200000 --alignIntronMax 200000 --outSAMtype BAM SortedByCoordinate --alignEndsType Local --readFilesCommand zcat --twopassMode Basic --twopass1readsN -1 --outFileNamePrefix paired/ --runThreadN 9 --readFilesIn L004_R1_paired.fastq.gz L004_R2_paired.fastq.gz +D00xxx:000:x00x0xxxx:3:1106:8700:84313 99 chr21 11 3 109M = 529 579 CATGTTTGGGGGTGGCATGTGCTTCTCCTCCATGTAGCTGCCGTAGTTCATCCCAACGGTGTCTGGGCTGCCCACCATCTTCCCGCCTTTGGCCACACTGCATTCATCA ?>BGGGGFGGG@CBDGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGB/FEFGCGGGGGF@FGF>F0EFEGDGGFGGGGGGEEGGG.CAGGGG0BCGGG;GGGGGGGGGGG NH:i:2 HI:i:1 AS:i:166 nM:i:0 +D00xxx:000:x00x0xxxx:6:1110:4542:86939 99 chr21 14 3 115M243N11M = 490 576 GTTTGGGGGTGGCATGTGCTTCTCCTCCATGTAGCTGCCGTAGTTCATCCCAACGGTGTCTGGGCTGCCCACCATCTTCCCGCCTTTGGCCACACTGCATTCATCAGGAGAGTTCCTTGAGCCATT <3?@BGGGGGDAGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGFGGGGGGGGGGGGBG@FGGGGGGGGGGGGGG@CGG=GBGD@GBGGEE/EGGGGE NH:i:2 HI:i:1 AS:i:224 nM:i:0 XS:A:- +D00xxx:000:x00x0xxxx:5:1209:11373:21218 99 chr21 58 3 71M243N20M = 495 532 TCATCCCAACGGTGTCTGGGCTGCCCACCATCTTCCCGCCTTTGGCCACACTGCATTCATCAGGAGAGTTCCTTGAGCCATTCACCTGGCT >BBCFFGGGGGGGGGGGGGEGGBGGG@EBCGGGEGGGGGBFGGF1FCGGGGGGGGGGGEGFGGGGGFDFGEGGGCGGGGGGGGFGGGEGGG NH:i:2 HI:i:1 AS:i:184 nM:i:0 XS:A:- +D00xxx:000:x00x0xxxx:4:2205:7737:30184 97 chr21 348 3 125M = 1607 0 AGAAAGGGGCGGAAGTCTCCTTACCTTGAGCCATTCACCTGGCTAGGGTTACATTCCATTTTGATGGTGACCCTGGCTGGGGGTTGAGACAGCCAATCCTGCTGAGGGACGCGTGGGCTCATCTT >ABACGGGGGGGGDGGGGGGGGGGGGGEGGGGGGGGGGGGGDGGGBGGGGGGGGGGGDGGGGGBG>0EFGGGGGGGGEGGBCGADGEGGGEGEGGGGGBBEGBDEGEGGGGAA>>CGGGGGBEGG NH:i:2 HI:i:1 AS:i:123 nM:i:0 +D00xxx:000:x00x0xxxx:1:1314:13313:61767 97 chr21 395 3 126M = 1633 0 GTTACATTCCATTTTGATGGTGACCCTGGCTGGGGGTTGAGACAGCCAATCCTGCTGAGGGACGCGTGGGCTCATCTTGGAAGTCTGTCCATAGTCGCTGGAGGAGGACGCGGTCATCTCTGTCTT A:@BBGGGGGGGGGGGGGGGGGGGGGGGGGGFGDGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGCGGFEEGGGGGGGGGGGGGGGGGBGGGGGGGDBGBBGGGGGD0FGGGGGGD NH:i:2 HI:i:1 AS:i:78 nM:i:0 
+D00xxx:000:x00x0xxxx:6:1110:4542:86939 147 chr21 490 3 100M25S = 14 -576 CGCTGGAGGAGGACGCGGTCATCTCTGTCTTAGCCAGGTGTGGCGTTCCGTAGGCACACTCAAACAACGACTGGTCCTCACTCACAACTGATAAGGCTTCCTGCCGCGCTCCAGGCGGCGCTCCC GGGGGGGGGGGDDGGGGGGGGEGED=GGEEGEGEGGGGEGGGGDGGGDGF>GGGEF>GGGEGGGGGGGGGGGGGGGGCGGGEGFGEEGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG@ABBB NH:i:2 HI:i:1 AS:i:224 nM:i:0 XS:A:- +D00xxx:000:x00x0xxxx:2:2102:3982:10406 147 chr21 490 3 100M26S = 465 -125 CGCTGGAGGAGGACGCGGTCATCTCTGTCTTAGCCAGGTGTGGCGTTCCGTAGGCACACTCAAACAACGACTGGTCCTCACTCACAACTGATAAGGCTTCCTGCCGCGCTCCAGGCGGCGCTCCCC GGGGGGGGGGGGGGGGBGGGGGGGGGGGGGBGGGEGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGBCBBB NH:i:2 HI:i:1 AS:i:221 nM:i:0 +D00xxx:000:x00x0xxxx:5:1209:11373:21218 147 chr21 495 3 95M31S = 58 -532 GAGGAGGACGCGGTCATCTCTGTCTTAGCCAGGTGTGGCGTTCCGTAGGCACACTCAAACAACGACTGGTCCTCACTCACAACTGATAAGGCTTCCTGCCGCGCTCCAGGCGGCGCTCCCCGCCCC GGGGGA:GGGGGGGGGGGDEBGG@EBGEEGGGGGGGGGGGDBGGGGGGEGGGGE=GGGGGGGGGGGGGE=EGGGGGGD@D0=GGED;GEEGGGGGGFGGGGGGGEGEGGGEBB@GGGG@BBBB NH:i:2 HI:i:1 AS:i:184 nM:i:0 XS:A:- +D00xxx:000:x00x0xxxx:2:1105:9810:99004 147 chr21 495 3 95M31S = 424 -166 GAGGAGGACGCGGTCATCTCTGTCTTAGCCAGGTGTGGCGTTCCGTAGGCACACTCAAACAACGACTGGTCCTCACTCACAACTGATAAGGCTTCCTGCCGCGCTCCAGGCGGCGCTCCCCGCCCC GGGGGGGGGGGGGGEGEGGGGGEGGGEGGGGGGGGGGGGGGGGGGEGGGGGGGGGGGGEGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGCCCCC NH:i:2 HI:i:1 AS:i:216 nM:i:0 +D00xxx:000:x00x0xxxx:3:1106:8700:84313 147 chr21 529 3 61M65S = 11 -579 GTGGCGTTCCGTAGGCACACTCAAACAACGACTGGTCCTCACTCACAACTGATAAGGCTTCCTGCCGCGCTCCAGGCGGCGCTCCCCGCCCCTCGCCCTCCGCCTCCGCCTCCGCCTCCTGCTTAG GGGG>.B>GGGGEGGGGGGGGGGGGGGGGEGGEGGGEEDC=G@G@D@@GEGGE=EGGD@CDGDGAGGGGGBGGGGGGGGGGGGGGGDGGGGGGGGGBGGGGGGG@FGFCD@GGGGGGFGGGA0BBA NH:i:2 HI:i:1 AS:i:166 nM:i:0 +D00xxx:000:x00x0xxxx:7:2203:2111:6137 256 chr21 557 3 33M50S * 0 0 CGACTGGTCCTCACTCACAACTGATAAGGCTTCCTGCCGCGCCGCGCTCCTCACACCCGCTTTCAACTCCGGGCGGGGCAGGG 
BBBBGGGGGGGGGGGGBGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGBDGGEGGGGGGGGGGGGGGGGG NH:i:2 HI:i:1 AS:i:32 nM:i:0 +D00xxx:000:x00x0xxxx:1:2103:12116:101378 256 chr21 557 3 33M50S * 0 0 CGACTGGTCCTCACTCACAACTGATAAGGCTTCCTGCCGCGCCGCGCTCCTCACACCCGCTTTCAACTCCGGGCGGGGCAGGG ?@BBGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGCGGGGGGGGGBFBFGCGGGGGGGGGGGGGGGGG NH:i:2 HI:i:1 AS:i:32 nM:i:0 +D00xxx:000:x00x0xxxx:7:2203:2111:6137 0 chr21 1600 3 33S50M * 0 0 CGACTGGTCCTCACTCACAACTGATAAGGCTTCCTGCCGCGCCGCGCTCCTCACACCCGCTTTCAACTCCGGGCGGGGCAGGG BBBBGGGGGGGGGGGGBGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGBDGGEGGGGGGGGGGGGGGGGG NH:i:2 HI:i:2 AS:i:47 nM:i:1 +D00xxx:000:x00x0xxxx:1:2103:12116:101378 0 chr21 1600 3 33S50M * 0 0 CGACTGGTCCTCACTCACAACTGATAAGGCTTCCTGCCGCGCCGCGCTCCTCACACCCGCTTTCAACTCCGGGCGGGGCAGGG ?@BBGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGCGGGGGGGGGBFBFGCGGGGGGGGGGGGGGGGG NH:i:2 HI:i:2 AS:i:47 nM:i:1 +D00xxx:000:x00x0xxxx:3:2102:7384:25660 145 chr21 1600 3 95M = 483 0 CTGCCGCGCCGCGCTCCTCACACCCGCTTTCAACTCCGGGCGGGGCAGGGGGCATCGGCGGGTCCCAGGCGCCCAGGTTCCCCTCCCCAGCCCGG C6GGGGAD:ADDGCC.C=GC>C?ADDGGGFE<.C8GGG@GCBFGGGDGGFGGBB/FGGGGGGBEGGGGGGGGGGGBBCCB NH:i:2 HI:i:2 AS:i:91 nM:i:1 +D00xxx:000:x00x0xxxx:4:2205:7737:30184 145 chr21 1607 3 93M = 348 0 GCCGCGCTCCTCACACCCGCTTTCAACTCCGGGCGGGGCAGGGGGCATCGGCGGGTCCCAGGCGCCCAGGTTCCCCTCCCCAGCCCGGACCCC AAGGGEGGGBGGAG?>A;GGGGEEBDC>GGGAGGGD6:GGGGGC>:DGDGGGGBE@DA><;GGGDDADDGGCDA@GGGGDGGGGGBGGGGGGDGGGGGGDGGGEEGFGGGGGGGGEGFGGGGGGGGGCBBCC NH:i:2 HI:i:2 AS:i:118 nM:i:0 +D00xxx:000:x00x0xxxx:3:1106:8700:84313 401 chr21 1731 3 61S65M = 11 0 GTGGCGTTCCGTAGGCACACTCAAACAACGACTGGTCCTCACTCACAACTGATAAGGCTTCCTGCCGCGCTCCAGGCGGCGCTCCCCGCCCCTCGCCCTCCGCCTCCGCCTCCGCCTCCTGCTTAG GGGG>.B>GGGGEGGGGGGGGGGGGGGGGEGGEGGGEEDC=G@G@D@@GEGGE=EGGD@CDGDGAGGGGGBGGGGGGGGGGGGGGGDGGGGGGGGGBGGGGGGG@FGFCD@GGGGGGFGGGA0BBA NH:i:2 HI:i:2 AS:i:63 nM:i:0 +D00xxx:000:x00x0xxxx:6:1110:4542:86939 401 chr21 1731 3 100S25M = 14 0 
CGCTGGAGGAGGACGCGGTCATCTCTGTCTTAGCCAGGTGTGGCGTTCCGTAGGCACACTCAAACAACGACTGGTCCTCACTCACAACTGATAAGGCTTCCTGCCGCGCTCCAGGCGGCGCTCCC GGGGGGGGGGGDDGGGGGGGGEGED=GGEEGEGEGGGGEGGGGDGGGDGF>GGGEF>GGGEGGGGGGGGGGGGGGGGCGGGEGFGEEGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG@ABBB NH:i:2 HI:i:2 AS:i:24 nM:i:0 +D00xxx:000:x00x0xxxx:5:1209:11373:21218 401 chr21 1731 3 95S31M = 58 0 GAGGAGGACGCGGTCATCTCTGTCTTAGCCAGGTGTGGCGTTCCGTAGGCACACTCAAACAACGACTGGTCCTCACTCACAACTGATAAGGCTTCCTGCCGCGCTCCAGGCGGCGCTCCCCGCCCC GGGGGA:GGGGGGGGGGGDEBGG@EBGEEGGGGGGGGGGGDBGGGGGGEGGGGE=GGGGGGGGGGGGGE=EGGGGGGD@D0=GGED;GEEGGGGGGFGGGGGGGEGEGGGEBB@GGGG@BBBB NH:i:2 HI:i:2 AS:i:30 nM:i:0 +D00xxx:000:x00x0xxxx:2:2102:3982:10406 401 chr21 1731 3 100S26M = 465 0 CGCTGGAGGAGGACGCGGTCATCTCTGTCTTAGCCAGGTGTGGCGTTCCGTAGGCACACTCAAACAACGACTGGTCCTCACTCACAACTGATAAGGCTTCCTGCCGCGCTCCAGGCGGCGCTCCCC GGGGGGGGGGGGGGGGBGGGGGGGGGGGGGBGGGEGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGBCBBB NH:i:2 HI:i:2 AS:i:25 nM:i:0 +D00xxx:000:x00x0xxxx:2:1105:9810:99004 401 chr21 1731 3 95S31M = 424 0 GAGGAGGACGCGGTCATCTCTGTCTTAGCCAGGTGTGGCGTTCCGTAGGCACACTCAAACAACGACTGGTCCTCACTCACAACTGATAAGGCTTCCTGCCGCGCTCCAGGCGGCGCTCCCCGCCCC GGGGGGGGGGGGGGEGEGGGGGEGGGEGGGGGGGGGGGGGGGGGGEGGGGGGGGGGGGEGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGGGGGGCCCCC NH:i:2 HI:i:2 AS:i:30 nM:i:0 diff --git a/tests/test_splice_site_motif.py b/tests/test_splice_site_motif.py index 22fb722d..fdd197ec 100755 --- a/tests/test_splice_site_motif.py +++ b/tests/test_splice_site_motif.py @@ -47,14 +47,44 @@ def test_sj_01(self): test_id = 'splice_site_motif_01' input_sam = TEST_DIR + "test_" + test_id + ".in.sam" - input_bam = TEST_DIR + "test_" + test_id + ".fixed.bam" - input_file = TEST_DIR + "test_" + test_id + ".dbed" + input_bam = T_TEST_DIR + "test_" + test_id + ".fixed.bam" + input_file = T_TEST_DIR + "test_" + test_id + ".dbed" gtf_file = None fasta_file = TEST_DIR + "test_" + test_id + ".in.fa" output_file = T_TEST_DIR + "test_" + 
test_id + ".out.txt" - test_file = TEST_DIR + "test_" + test_id + ".out.txt" + test_file = TEST_DIR + "test_" + test_id + ".out.dbed" + + # sam -> fixed bam + sam_to_fixed_bam(input_sam, input_bam, T_TEST_DIR) + + # fixed bam -> dr-disco detect + ic = IntronDecomposition(input_bam) + ic.decompose(0) + fh = open(input_file, "w") + ic.export(fh) + fh.close() + + # dr-disco-detect (skip classify) -> dr-disco integrate + cl = DetectOutput(input_file) + cl.integrate(output_file, gtf_file, fasta_file) + + self.assertTrue(filecmp.cmp(test_file, output_file), msg="diff '" + test_file + "' '" + output_file + "':\n" + subprocess.Popen(['diff', test_file, output_file], stdout=subprocess.PIPE).stdout.read()) + + + def test_sj_02(self): + test_id = 'splice_site_motif_02' + + input_sam = TEST_DIR + "test_" + test_id + ".in.sam" + input_bam = T_TEST_DIR + "test_" + test_id + ".fixed.bam" + input_file = T_TEST_DIR + "test_" + test_id + ".dbed" + + gtf_file = None + fasta_file = TEST_DIR + "test_" + test_id + ".in.fa" + output_file = T_TEST_DIR + "test_" + test_id + ".out.txt" + + test_file = TEST_DIR + "test_" + test_id + ".out.dbed" # sam -> fixed bam sam_to_fixed_bam(input_sam, input_bam, T_TEST_DIR) From c978590973b0b340dbe6c582b26f86f78edde057 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Fri, 5 Jan 2018 10:27:33 +0100 Subject: [PATCH 06/25] tempsav --- drdisco/DetectOutput.py | 2 +- tests/splice_site_motif/test_splice_site_motif_02.in.fa | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drdisco/DetectOutput.py b/drdisco/DetectOutput.py index 9851f727..45c44ff9 100644 --- a/drdisco/DetectOutput.py +++ b/drdisco/DetectOutput.py @@ -225,7 +225,7 @@ def is_on_splice_junction_motif(self, fasta_fh): sequences = dict( (s.name, s) for s in fasta_fh ) print "lets proceed" - print "from" , pos5p + print "from" , pos5p, " to" , pos3p if pos5p[2] == '-': #print " ... 
exon ] {G} {T} {AG} {A} {G} {T}" seq_in_5p_exon = str(sequences[pos5p[0]][pos5p[1]-pos5_in_exon_length:pos5p[1]]) diff --git a/tests/splice_site_motif/test_splice_site_motif_02.in.fa b/tests/splice_site_motif/test_splice_site_motif_02.in.fa index 1765dee6..10fce745 100644 --- a/tests/splice_site_motif/test_splice_site_motif_02.in.fa +++ b/tests/splice_site_motif/test_splice_site_motif_02.in.fa @@ -33,6 +33,6 @@ NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNACTTACC TGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA -AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA +AAAAAAAAAAAAAAAAAAAAAAAAACTTACCTGAAAAAAAAAAAAAAAAA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA From 2089066efe0613517006dab1c6286a2f9f65d21c Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Mon, 8 Jan 2018 14:50:05 +0100 Subject: [PATCH 07/25] new function seems to work, time for testing and deploying --- drdisco/DetectOutput.py | 38 ++++++++++++------- drdisco/__init__.py | 2 +- drdisco/utils.py | 17 +++++++++ .../test_splice_site_motif_01.out.dbed | 2 + .../test_splice_site_motif_02.out.dbed | 2 + tests/test_splice_site_motif.py | 4 +- 6 files changed, 49 insertions(+), 16 deletions(-) create mode 100644 drdisco/utils.py create mode 100644 tests/splice_site_motif/test_splice_site_motif_02.out.dbed diff --git a/drdisco/DetectOutput.py b/drdisco/DetectOutput.py index 45c44ff9..842eaf82 100644 --- a/drdisco/DetectOutput.py +++ b/drdisco/DetectOutput.py @@ -6,6 +6,7 @@ from drdisco import log from drdisco.DetectFrameShifts import DetectFrameShifts +from drdisco.utils import reverse_complement import gzip import HTSeq @@ -130,6 +131,8 @@ def parse(self): self.break_A_max_AS = int(self.line[44]) self.break_B_max_AS = int(self.line[45]) + self.edit_dist_to_splice_motif = "" + self.structure = self.line[46] inv = {'-': '+', '+': '-'} @@ -228,26 +231,35 @@ 
def is_on_splice_junction_motif(self, fasta_fh): print "from" , pos5p, " to" , pos3p if pos5p[2] == '-': #print " ... exon ] {G} {T} {AG} {A} {G} {T}" - seq_in_5p_exon = str(sequences[pos5p[0]][pos5p[1]-pos5_in_exon_length:pos5p[1]]) - seq_post_5p_exon = str(sequences[pos5p[0]][pos5p[1]:pos5p[1]+pos5_post_exon_length]) + seq_in_5p_exon = str(sequences[pos5p[0]][pos5p[1]-pos5_in_exon_length:pos5p[1]]).upper() + seq_post_5p_exon = str(sequences[pos5p[0]][pos5p[1]:pos5p[1]+pos5_post_exon_length]).upper() else: #print "{T} {G} {A} {GA} {T} {G} [ exon ..." - seq_in_5p_exon = str(sequences[pos5p[0]][pos5p[1]:pos5p[1]+pos5_in_exon_length]) + "-" - seq_post_5p_exon = str(sequences[pos5p[0]][pos5p[1]-pos5_post_exon_length:pos5p[1]]) + "-" + seq_in_5p_exon = reverse_complement(str(sequences[pos5p[0]][pos5p[1]:pos5p[1]+pos5_in_exon_length])) # + "-" + seq_post_5p_exon = reverse_complement(str(sequences[pos5p[0]][pos5p[1]-pos5_post_exon_length:pos5p[1]]))# + "-" if pos3p[2] == '+': #print "{C} {A} {G} [ exon ..." - seq_pre_3p_exon = str(sequences[pos3p[0]][pos3p[1]-pos3_pre_exon_length:pos3p[1]]) - seq_in_3p_exon = str(sequences[pos3p[0]][pos3p[1]:pos3p[1]+pos3_in_exon_length]) + seq_pre_3p_exon = str(sequences[pos3p[0]][pos3p[1]-pos3_pre_exon_length:pos3p[1]]).upper() + seq_in_3p_exon = str(sequences[pos3p[0]][pos3p[1]:pos3p[1]+pos3_in_exon_length]).upper() else: #print "... exon ] {G} {A} {C}" - seq_in_3p_exon = str(sequences[pos3p[0]][pos3p[1]-pos3_in_exon_length:pos3p[1]]) + "-" - seq_pre_3p_exon = str(sequences[pos3p[0]][pos3p[1]:pos3p[1]+pos3_pre_exon_length]) + "-" + seq_in_3p_exon = reverse_complement(str(sequences[pos3p[0]][pos3p[1]-pos3_in_exon_length:pos3p[1]])) # + "-" + seq_pre_3p_exon = reverse_complement(str(sequences[pos3p[0]][pos3p[1]:pos3p[1]+pos3_pre_exon_length])) # + "-" - print "[ ... " + seq_in_5p_exon + " ] " + seq_post_5p_exon + " ... ... " + seq_pre_3p_exon + " [ " + seq_in_3p_exon + " ... 
]" + def calc_dist(pat, subseq): + d = 0 + + if len(pat) != len(subseq): + raise Exception("invalid pattern size") + for i in range(len(pat)): + if subseq[i] not in pat[i]: + d += 1 - print - print + return d + + self.edit_dist_to_splice_motif = str(calc_dist(["AC","A","G"], seq_in_5p_exon) + calc_dist(["G","T","AG","A","G","T" ], seq_post_5p_exon) + calc_dist(["C","A","G"], seq_pre_3p_exon) + calc_dist(["G"], seq_in_3p_exon)) + return self.edit_dist_to_splice_motif def __str__(self): line = self.line @@ -453,7 +465,7 @@ def insert_in_index(index, entries, score): with open(output_table, 'w') as fh_out: header = self.header.split("\t") - header = "\t".join(header[:-5] + ['full-gene-dysregulation', 'frameshift=0', 'frameshift=+1', 'frameshift=+2'] + header[-5:]) + header = "\t".join(header[:-5] + ['full-gene-dysregulation', 'frameshift=0', 'frameshift=+1', 'frameshift=+2', 'splice-motif-edit-distance'] + header[-5:]) fh_out.write("shared-id\tfusion\t" + header) @@ -592,7 +604,7 @@ def insert(pos, e): for entry in idx2[score][key]: if entry not in exported: acceptors_donors = entry.get_donors_acceptors(gene_annotation) - line = entry.line[:-5] + [entry.fgd, entry.frameshift_0, entry.frameshift_1, entry.frameshift_2] + entry.line[-5:] + line = entry.line[:-5] + [entry.fgd, entry.frameshift_0, entry.frameshift_1, entry.frameshift_2, entry.edit_dist_to_splice_motif] + entry.line[-5:] fh_out.write(str(i) + "\t" + acceptors_donors + "\t" + "\t".join(line) + "\n") exported.add(entry) diff --git a/drdisco/__init__.py b/drdisco/__init__.py index d6410ca3..b0225b00 100644 --- a/drdisco/__init__.py +++ b/drdisco/__init__.py @@ -31,7 +31,7 @@ import logging import sys -__version_info__ = ('0', '14', '6') +__version_info__ = ('0', '15', '0') __version__ = '.'.join(__version_info__) if (len(__version_info__) == 3) else '.'.join(__version_info__[0:3]) + "-" + __version_info__[3] __author__ = 'Youri Hoogstrate' __homepage__ = 'https://github.com/yhoogstrate/dr-disco' diff --git 
a/drdisco/utils.py b/drdisco/utils.py new file mode 100644 index 00000000..c9c24b23 --- /dev/null +++ b/drdisco/utils.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python +# *- coding: utf-8 -*- +# vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4 textwidth=79: + +alt_map = {'ins':'0'} +complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'} + +def reverse_complement(seq): + seq = seq.upper() + for k,v in alt_map.iteritems(): + seq = seq.replace(k,v) + bases = list(seq) + bases = reversed([complement.get(base,base) for base in bases]) + bases = ''.join(bases) + for k,v in alt_map.iteritems(): + bases = bases.replace(v,k) + return bases diff --git a/tests/splice_site_motif/test_splice_site_motif_01.out.dbed b/tests/splice_site_motif/test_splice_site_motif_01.out.dbed index e69de29b..7d10b731 100644 --- a/tests/splice_site_motif/test_splice_site_motif_01.out.dbed +++ b/tests/splice_site_motif/test_splice_site_motif_01.out.dbed @@ -0,0 +1,2 @@ +shared-id fusion chr-A pos-A direction-A pos-A-acceptor pos-A-donor chr-B pos-B direction-B pos-B-acceptor pos-B-donor genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads alignment-score mismatches n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps lr-A-slope lr-A-intercept lr-A-rvalue lr-A-pvalue lr-A-stderr lr-B-slope lr-B-intercept lr-B-rvalue lr-B-pvalue lr-B-stderr disco/split clips/score nodes/edge full-gene-dysregulation frameshift=0 frameshift=+1 frameshift=+2 splice-motif-edit-distance median-AS-A median-AS-B max-AS-A max-AS-B data-structure +1 chr2:189->chr2:999 chr2 189 - 0 14 chr2 999 + 14 0 810 unclassified linear intronic 19 12 6 1 1019 4 1 1 1 0 0 0.7580 0.7580 0.0000 1.0000 8.0357 16.8929 0.9213 0.0032 1.5165 7.5714 22.0000 0.9311 0.0023 1.3263 0.1667 0.3158 2.0000 0 41 34 75 75 
chr2:189/190(-)->chr2:999/1000(+):(discordant_mates:2,spanning_paired_1:5,spanning_paired_1_t:1,spanning_paired_2:5,spanning_paired_2_t:1) diff --git a/tests/splice_site_motif/test_splice_site_motif_02.out.dbed b/tests/splice_site_motif/test_splice_site_motif_02.out.dbed new file mode 100644 index 00000000..aabf22ef --- /dev/null +++ b/tests/splice_site_motif/test_splice_site_motif_02.out.dbed @@ -0,0 +1,2 @@ +shared-id fusion chr-A pos-A direction-A pos-A-acceptor pos-A-donor chr-B pos-B direction-B pos-B-acceptor pos-B-donor genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads alignment-score mismatches n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps lr-A-slope lr-A-intercept lr-A-rvalue lr-A-pvalue lr-A-stderr lr-B-slope lr-B-intercept lr-B-rvalue lr-B-pvalue lr-B-stderr disco/split clips/score nodes/edge full-gene-dysregulation frameshift=0 frameshift=+1 frameshift=+2 splice-motif-edit-distance median-AS-A median-AS-B max-AS-A max-AS-B data-structure +1 chr21:1730->chr21:589 chr21 589 - 10 0 chr21 1730 + 0 10 1141 unclassified linear intronic 24 14 7 5 2387 4 2 1 2 0 0 0.8277 0.9070 0.0000 0.9398 8.3000 73.6000 0.7947 0.1082 3.6601 8.5000 18.6000 0.8063 0.0993 3.5995 0.7143 0.2917 1.5000 0 95 50 126 120 chr21:589/590(-)->chr21:1730/1731(+):(spanning_paired_1:5,spanning_paired_2:5)&chr21:589/590(-)->chr21:1599/1600(+):(discordant_mates:10,spanning_singleton_1_r:2,spanning_singleton_2_r:2) diff --git a/tests/test_splice_site_motif.py b/tests/test_splice_site_motif.py index fdd197ec..1e8ae8f5 100755 --- a/tests/test_splice_site_motif.py +++ b/tests/test_splice_site_motif.py @@ -52,8 +52,8 @@ def test_sj_01(self): gtf_file = None fasta_file = TEST_DIR + "test_" + test_id + ".in.fa" - output_file = T_TEST_DIR + "test_" + test_id + ".out.txt" + output_file = T_TEST_DIR + "test_" + test_id + ".out.dbed" test_file = TEST_DIR + "test_" + 
test_id + ".out.dbed" # sam -> fixed bam @@ -82,8 +82,8 @@ def test_sj_02(self): gtf_file = None fasta_file = TEST_DIR + "test_" + test_id + ".in.fa" - output_file = T_TEST_DIR + "test_" + test_id + ".out.txt" + output_file = T_TEST_DIR + "test_" + test_id + ".out.dbed" test_file = TEST_DIR + "test_" + test_id + ".out.dbed" # sam -> fixed bam From 20b9110c9ed2703d54d215ce241d15c47d4dd924 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Mon, 8 Jan 2018 14:52:53 +0100 Subject: [PATCH 08/25] without print statements --- drdisco/DetectOutput.py | 28 +++++++++------------------- 1 file changed, 9 insertions(+), 19 deletions(-) diff --git a/drdisco/DetectOutput.py b/drdisco/DetectOutput.py index 842eaf82..9780dcc2 100644 --- a/drdisco/DetectOutput.py +++ b/drdisco/DetectOutput.py @@ -208,12 +208,6 @@ def is_on_splice_junction_motif(self, fasta_fh): pos3_pre_exon_length = 3 pos3_in_exon_length = 1 - print - print - print - print "calculating change on intron/exon junction using fasta" - print "rna-strand: + - means breakA -> breakB, means A=5' B=3'" - if self.donorA > self.donorB: pos5p = [self.chrA, self.posA, self.strandA] pos3p = [self.chrB, self.posB, self.strandB] @@ -223,29 +217,23 @@ def is_on_splice_junction_motif(self, fasta_fh): else: pos5p = None - + if pos5p: sequences = dict( (s.name, s) for s in fasta_fh ) - print "lets proceed" - print "from" , pos5p, " to" , pos3p if pos5p[2] == '-': - #print " ... exon ] {G} {T} {AG} {A} {G} {T}" seq_in_5p_exon = str(sequences[pos5p[0]][pos5p[1]-pos5_in_exon_length:pos5p[1]]).upper() seq_post_5p_exon = str(sequences[pos5p[0]][pos5p[1]:pos5p[1]+pos5_post_exon_length]).upper() else: - #print "{T} {G} {A} {GA} {T} {G} [ exon ..." 
- seq_in_5p_exon = reverse_complement(str(sequences[pos5p[0]][pos5p[1]:pos5p[1]+pos5_in_exon_length])) # + "-" - seq_post_5p_exon = reverse_complement(str(sequences[pos5p[0]][pos5p[1]-pos5_post_exon_length:pos5p[1]]))# + "-" + seq_in_5p_exon = reverse_complement(str(sequences[pos5p[0]][pos5p[1]:pos5p[1]+pos5_in_exon_length])) + seq_post_5p_exon = reverse_complement(str(sequences[pos5p[0]][pos5p[1]-pos5_post_exon_length:pos5p[1]])) if pos3p[2] == '+': - #print "{C} {A} {G} [ exon ..." seq_pre_3p_exon = str(sequences[pos3p[0]][pos3p[1]-pos3_pre_exon_length:pos3p[1]]).upper() seq_in_3p_exon = str(sequences[pos3p[0]][pos3p[1]:pos3p[1]+pos3_in_exon_length]).upper() else: - #print "... exon ] {G} {A} {C}" - seq_in_3p_exon = reverse_complement(str(sequences[pos3p[0]][pos3p[1]-pos3_in_exon_length:pos3p[1]])) # + "-" - seq_pre_3p_exon = reverse_complement(str(sequences[pos3p[0]][pos3p[1]:pos3p[1]+pos3_pre_exon_length])) # + "-" + seq_in_3p_exon = reverse_complement(str(sequences[pos3p[0]][pos3p[1]-pos3_in_exon_length:pos3p[1]])) + seq_pre_3p_exon = reverse_complement(str(sequences[pos3p[0]][pos3p[1]:pos3p[1]+pos3_pre_exon_length])) def calc_dist(pat, subseq): d = 0 @@ -258,8 +246,10 @@ def calc_dist(pat, subseq): return d - self.edit_dist_to_splice_motif = str(calc_dist(["AC","A","G"], seq_in_5p_exon) + calc_dist(["G","T","AG","A","G","T" ], seq_post_5p_exon) + calc_dist(["C","A","G"], seq_pre_3p_exon) + calc_dist(["G"], seq_in_3p_exon)) - return self.edit_dist_to_splice_motif + # print "[ ... " + seq_in_5p_exon + " ] " + seq_post_5p_exon + " ... ... " + seq_pre_3p_exon + " [ " + seq_in_3p_exon + " ... 
]" , + dist = calc_dist(["AC","A","G"], seq_in_5p_exon) + calc_dist(["G","T","AG","A","G","T" ], seq_post_5p_exon) + calc_dist(["C","A","G"], seq_pre_3p_exon) + calc_dist(["G"], seq_in_3p_exon) + self.edit_dist_to_splice_motif = str(dist) + return dist def __str__(self): line = self.line From 8c1b359a85e8895aa0c9b4ae803ab9d5f3fa4c8e Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Mon, 8 Jan 2018 14:56:01 +0100 Subject: [PATCH 09/25] patch test --- tests/chim_overhang/test_01_integrate.out.txt | 4 ++-- tests/chim_overhang/test_02_integrate.out.txt | 4 ++-- tests/chim_overhang/test_03_integrate.out.txt | 4 ++-- tests/test_chim_overhang.py | 6 +++--- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/chim_overhang/test_01_integrate.out.txt b/tests/chim_overhang/test_01_integrate.out.txt index 5eb9a6d8..7f99ed6b 100644 --- a/tests/chim_overhang/test_01_integrate.out.txt +++ b/tests/chim_overhang/test_01_integrate.out.txt @@ -1,2 +1,2 @@ -shared-id fusion chr-A pos-A direction-A pos-A-acceptor pos-A-donor chr-B pos-B direction-B pos-B-acceptor pos-B-donor genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads alignment-score mismatches n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps lr-A-slope lr-A-intercept lr-A-rvalue lr-A-pvalue lr-A-stderr lr-B-slope lr-B-intercept lr-B-rvalue lr-B-pvalue lr-B-stderr disco/split clips/score nodes/edge full-gene-dysregulation frameshift=0 frameshift=+1 frameshift=+2 median-AS-A median-AS-B max-AS-A max-AS-B data-structure -1 chr2:17281790->chr11:5566704 chr2 17281790 + 0 22 chr11 5566704 - 22 0 inf entropy=0.7372<0.7382,chim_overhang=21<25 linear intronic 33 22 11 0 1530 10 1 1 1 0 0 0.7372 0.7372 0.0000 0.0000 0.3818 18.0000 0.8367 0.0013 0.0833 2.0455 38.7727 0.9652 0.0000 0.1847 0.0000 0.3333 2.0000 21 49 21 57 
chr2:17281790/17281791(+)->chr11:5566704/5566705(-):(spanning_paired_1:11,spanning_paired_2:11) +shared-id fusion chr-A pos-A direction-A pos-A-acceptor pos-A-donor chr-B pos-B direction-B pos-B-acceptor pos-B-donor genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads alignment-score mismatches n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps lr-A-slope lr-A-intercept lr-A-rvalue lr-A-pvalue lr-A-stderr lr-B-slope lr-B-intercept lr-B-rvalue lr-B-pvalue lr-B-stderr disco/split clips/score nodes/edge full-gene-dysregulation frameshift=0 frameshift=+1 frameshift=+2 splice-motif-edit-distance median-AS-A median-AS-B max-AS-A max-AS-B data-structure +1 chr2:17281790->chr11:5566704 chr2 17281790 + 0 22 chr11 5566704 - 22 0 inf entropy=0.7372<0.7382,chim_overhang=21<25 linear intronic 33 22 11 0 1530 10 1 1 1 0 0 0.7372 0.7372 0.0000 0.0000 0.3818 18.0000 0.8367 0.0013 0.0833 2.0455 38.7727 0.9652 0.0000 0.1847 0.0000 0.3333 2.0000 21 49 21 57 chr2:17281790/17281791(+)->chr11:5566704/5566705(-):(spanning_paired_1:11,spanning_paired_2:11) diff --git a/tests/chim_overhang/test_02_integrate.out.txt b/tests/chim_overhang/test_02_integrate.out.txt index 482a9db1..328caaad 100644 --- a/tests/chim_overhang/test_02_integrate.out.txt +++ b/tests/chim_overhang/test_02_integrate.out.txt @@ -1,2 +1,2 @@ -shared-id fusion chr-A pos-A direction-A pos-A-acceptor pos-A-donor chr-B pos-B direction-B pos-B-acceptor pos-B-donor genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads alignment-score mismatches n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps lr-A-slope lr-A-intercept lr-A-rvalue lr-A-pvalue lr-A-stderr lr-B-slope lr-B-intercept lr-B-rvalue lr-B-pvalue lr-B-stderr disco/split clips/score nodes/edge 
full-gene-dysregulation frameshift=0 frameshift=+1 frameshift=+2 median-AS-A median-AS-B max-AS-A max-AS-B data-structure -1 chr5:105166412->chr7:12213276 chr5 105166412 + 0 22 chr7 12213276 + 22 0 inf entropy=0.7372<0.7382,chim_overhang=16<25 linear intronic 33 22 11 0 1528 14 1 1 1 0 0 0.7372 0.7372 0.0000 0.0000 0.1273 15.0000 0.8367 0.0013 0.0278 0.6091 53.9545 0.9419 0.0000 0.0724 0.0000 0.3333 2.0000 16 57 16 59 chr5:105166412/105166413(+)->chr7:12213276/12213277(+):(spanning_paired_1:11,spanning_paired_2:11) +shared-id fusion chr-A pos-A direction-A pos-A-acceptor pos-A-donor chr-B pos-B direction-B pos-B-acceptor pos-B-donor genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads alignment-score mismatches n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps lr-A-slope lr-A-intercept lr-A-rvalue lr-A-pvalue lr-A-stderr lr-B-slope lr-B-intercept lr-B-rvalue lr-B-pvalue lr-B-stderr disco/split clips/score nodes/edge full-gene-dysregulation frameshift=0 frameshift=+1 frameshift=+2 splice-motif-edit-distance median-AS-A median-AS-B max-AS-A max-AS-B data-structure +1 chr5:105166412->chr7:12213276 chr5 105166412 + 0 22 chr7 12213276 + 22 0 inf entropy=0.7372<0.7382,chim_overhang=16<25 linear intronic 33 22 11 0 1528 14 1 1 1 0 0 0.7372 0.7372 0.0000 0.0000 0.1273 15.0000 0.8367 0.0013 0.0278 0.6091 53.9545 0.9419 0.0000 0.0724 0.0000 0.3333 2.0000 16 57 16 59 chr5:105166412/105166413(+)->chr7:12213276/12213277(+):(spanning_paired_1:11,spanning_paired_2:11) diff --git a/tests/chim_overhang/test_03_integrate.out.txt b/tests/chim_overhang/test_03_integrate.out.txt index 656f3689..70bb5bf5 100644 --- a/tests/chim_overhang/test_03_integrate.out.txt +++ b/tests/chim_overhang/test_03_integrate.out.txt @@ -1,2 +1,2 @@ -shared-id fusion chr-A pos-A direction-A pos-A-acceptor pos-A-donor chr-B pos-B direction-B pos-B-acceptor pos-B-donor 
genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads alignment-score mismatches n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps lr-A-slope lr-A-intercept lr-A-rvalue lr-A-pvalue lr-A-stderr lr-B-slope lr-B-intercept lr-B-rvalue lr-B-pvalue lr-B-stderr disco/split clips/score nodes/edge full-gene-dysregulation frameshift=0 frameshift=+1 frameshift=+2 median-AS-A median-AS-B max-AS-A max-AS-B data-structure -1 chr8:86472668->chr17:40685761 chr8 86472668 - 6 8 chr17 40685761 + 8 6 inf chim_overhang=18<25 linear intronic 21 8 7 0 998 6 1 1 1 0 0 0.8277 0.8277 0.6547 0.0000 0.3214 53.7500 0.9186 0.0035 0.0619 0.2143 16.9286 0.8660 0.0117 0.0553 0.0000 0.1905 2.0000 55 18 56 18 chr8:86472668/86472669(-)->chr17:40685761/40685762(+):(spanning_paired_1:3,spanning_paired_1_t:4,spanning_paired_2:3,spanning_paired_2_t:4) +shared-id fusion chr-A pos-A direction-A pos-A-acceptor pos-A-donor chr-B pos-B direction-B pos-B-acceptor pos-B-donor genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads alignment-score mismatches n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps lr-A-slope lr-A-intercept lr-A-rvalue lr-A-pvalue lr-A-stderr lr-B-slope lr-B-intercept lr-B-rvalue lr-B-pvalue lr-B-stderr disco/split clips/score nodes/edge full-gene-dysregulation frameshift=0 frameshift=+1 frameshift=+2 splice-motif-edit-distance median-AS-A median-AS-B max-AS-A max-AS-B data-structure +1 chr8:86472668->chr17:40685761 chr8 86472668 - 6 8 chr17 40685761 + 8 6 inf chim_overhang=18<25 linear intronic 21 8 7 0 998 6 1 1 1 0 0 0.8277 0.8277 0.6547 0.0000 0.3214 53.7500 0.9186 0.0035 0.0619 0.2143 16.9286 0.8660 0.0117 0.0553 0.0000 0.1905 2.0000 55 18 56 18 
chr8:86472668/86472669(-)->chr17:40685761/40685762(+):(spanning_paired_1:3,spanning_paired_1_t:4,spanning_paired_2:3,spanning_paired_2_t:4) diff --git a/tests/test_chim_overhang.py b/tests/test_chim_overhang.py index d9609133..dbf1a9f6 100755 --- a/tests/test_chim_overhang.py +++ b/tests/test_chim_overhang.py @@ -94,7 +94,7 @@ def test_01(self): # Step 04: dr-disco integrate cl = DetectOutput(drdisco_classify) - cl.integrate(drdisco_integrate, None) + cl.integrate(drdisco_integrate, None, None) self.assertTrue(filecmp.cmp(drdisco_integrate_test, drdisco_integrate), msg="diff '" + drdisco_integrate_test + "' '" + drdisco_integrate + "':\n" + subprocess.Popen(['diff', drdisco_integrate_test, drdisco_integrate], stdout=subprocess.PIPE).stdout.read()) @@ -133,7 +133,7 @@ def test_02(self): # Step 04: dr-disco integrate cl = DetectOutput(drdisco_classify) - cl.integrate(drdisco_integrate, None) + cl.integrate(drdisco_integrate, None, None) self.assertTrue(filecmp.cmp(drdisco_integrate_test, drdisco_integrate), msg="diff '" + drdisco_integrate_test + "' '" + drdisco_integrate + "':\n" + subprocess.Popen(['diff', drdisco_integrate_test, drdisco_integrate], stdout=subprocess.PIPE).stdout.read()) @@ -172,7 +172,7 @@ def test_03(self): # Step 04: dr-disco integrate cl = DetectOutput(drdisco_classify) - cl.integrate(drdisco_integrate, None) + cl.integrate(drdisco_integrate, None, None) self.assertTrue(filecmp.cmp(drdisco_integrate_test, drdisco_integrate), msg="diff '" + drdisco_integrate_test + "' '" + drdisco_integrate + "':\n" + subprocess.Popen(['diff', drdisco_integrate_test, drdisco_integrate], stdout=subprocess.PIPE).stdout.read()) From b47d580ae92435c694cca4733ae89e1398bc1a2e Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Mon, 8 Jan 2018 14:58:55 +0100 Subject: [PATCH 10/25] regressing integrate test --- .../test_in_frame_non_hybrid_protein.out.txt | 4 ++-- tests/integrate/test_terg_s041.out.txt | 12 ++++++------ tests/integrate/test_terg_s041_b.out.txt | 12 
++++++------ tests/test_drdisco_integrate.py | 8 ++++---- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/tests/integrate/test_in_frame_non_hybrid_protein.out.txt b/tests/integrate/test_in_frame_non_hybrid_protein.out.txt index b3519f3a..eb0b0d71 100644 --- a/tests/integrate/test_in_frame_non_hybrid_protein.out.txt +++ b/tests/integrate/test_in_frame_non_hybrid_protein.out.txt @@ -1,2 +1,2 @@ -shared-id fusion chr-A pos-A direction-A pos-A-acceptor pos-A-donor chr-B pos-B direction-B pos-A-acceptor pos-A-donor genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads alignment-score mismatches n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps lr-A-slope lr-A-intercept lr-A-rvalue lr-A-pvalue lr-A-stderr lr-B-slope lr-B-intercept lr-B-rvalue lr-B-pvalue lr-B-stderr disco/split clips/score nodes/edge full-gene-dysregulation frameshift=0 frameshift=+1 frameshift=+2 median-AS-A median-AS-B max-AS-A max-AS-B data-structure -1 chr21:41508080->chr21:38445621 chr21 38445621 - 238 0 chr21 41508080 + 0 238 3062459 valid linear exonic 494 318 159 17 10000 0 12 5 8 2 0 0.6676 0.7788 0.2243 0.8525 0.2600 81.0521 0.8788 0.0000 0.0135 0.2575 16.2608 0.8745 0.0000 0.0137 0.1069 0.3219 1.0833 TMPRSS2(ENST00000424093.5)-havana->ERG(ENST00000429727.6)-ensembl 50 50 50 50 chr21:38445621/38445622(-)->chr21:41508080/41508081(+):(spanning_paired_1:111,spanning_paired_2:111) +shared-id fusion chr-A pos-A direction-A pos-A-acceptor pos-A-donor chr-B pos-B direction-B pos-A-acceptor pos-A-donor genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads alignment-score mismatches n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps lr-A-slope lr-A-intercept lr-A-rvalue lr-A-pvalue lr-A-stderr lr-B-slope lr-B-intercept lr-B-rvalue 
lr-B-pvalue lr-B-stderr disco/split clips/score nodes/edge full-gene-dysregulation frameshift=0 frameshift=+1 frameshift=+2 splice-motif-edit-distance median-AS-A median-AS-B max-AS-A max-AS-B data-structure +1 chr21:41508080->chr21:38445621 chr21 38445621 - 238 0 chr21 41508080 + 0 238 3062459 valid linear exonic 494 318 159 17 10000 0 12 5 8 2 0 0.6676 0.7788 0.2243 0.8525 0.2600 81.0521 0.8788 0.0000 0.0135 0.2575 16.2608 0.8745 0.0000 0.0137 0.1069 0.3219 1.0833 TMPRSS2(ENST00000424093.5)-havana->ERG(ENST00000429727.6)-ensembl 50 50 50 50 chr21:38445621/38445622(-)->chr21:41508080/41508081(+):(spanning_paired_1:111,spanning_paired_2:111) diff --git a/tests/integrate/test_terg_s041.out.txt b/tests/integrate/test_terg_s041.out.txt index 60317f55..5d84a9bb 100644 --- a/tests/integrate/test_terg_s041.out.txt +++ b/tests/integrate/test_terg_s041.out.txt @@ -1,6 +1,6 @@ -shared-id fusion chr-A pos-A direction-A pos-A-acceptor pos-A-donor chr-B pos-B direction-B pos-A-acceptor pos-A-donor genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads alignment-score mismatches n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps lr-A-slope lr-A-intercept lr-A-rvalue lr-A-pvalue lr-A-stderr lr-B-slope lr-B-intercept lr-B-rvalue lr-B-pvalue lr-B-stderr disco/split clips/score nodes/edge full-gene-dysregulation frameshift=0 frameshift=+1 frameshift=+2 median-AS-A median-AS-B max-AS-A max-AS-B data-structure -1 TMPRSS2->ERG chr21 38487350 - 200 0 chr21 41479719 + 0 200 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 chr21:38487350/38487351(-)->chr21:41479719/41479720(+):(discordant_mates:14,spanning_paired_1:41,spanning_paired_1_t:7,spanning_paired_2:41,spanning_paired_2_t:7) -1 TMPRSS2->ERG chr21 38445621 - 200 0 chr21 41498118 + 0 
200 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 TMPRSS2(ENST00000332149.9)-ensembl_havana->ERG(ENST00000429727.6)-ensembl,TMPRSS2(ENST00000332149.9)-ensembl_havana->ERG(ENST00000468474.5)-havana,TMPRSS2(ENST00000332149.9)-ensembl_havana->ERG(ENST00000473107.1)-havana,TMPRSS2(ENST00000332149.9)-ensembl_havana->ERG(ENST00000481609.5)-havana,TMPRSS2(ENST00000332149.9)-ensembl_havana->ERG(ENST00000485493.1)-havana,TMPRSS2(ENST00000332149.9)-ensembl_havana->ERG(ENST00000492833.5)-ensembl_havana,TMPRSS2(ENST00000424093.5)-havana->ERG(ENST00000429727.6)-ensembl,TMPRSS2(ENST00000424093.5)-havana->ERG(ENST00000468474.5)-havana,TMPRSS2(ENST00000424093.5)-havana->ERG(ENST00000473107.1)-havana,TMPRSS2(ENST00000424093.5)-havana->ERG(ENST00000481609.5)-havana,TMPRSS2(ENST00000424093.5)-havana->ERG(ENST00000485493.1)-havana,TMPRSS2(ENST00000424093.5)-havana->ERG(ENST00000492833.5)-ensembl_havana,TMPRSS2(ENST00000455813.1)-havana->ERG(ENST00000429727.6)-ensembl,TMPRSS2(ENST00000455813.1)-havana->ERG(ENST00000468474.5)-havana,TMPRSS2(ENST00000455813.1)-havana->ERG(ENST00000473107.1)-havana,TMPRSS2(ENST00000455813.1)-havana->ERG(ENST00000481609.5)-havana,TMPRSS2(ENST00000455813.1)-havana->ERG(ENST00000492833.5)-ensembl_havana,TMPRSS2(ENST00000458356.5)-havana->ERG(ENST00000429727.6)-ensembl,TMPRSS2(ENST00000458356.5)-havana->ERG(ENST00000468474.5)-havana,TMPRSS2(ENST00000458356.5)-havana->ERG(ENST00000473107.1)-havana,TMPRSS2(ENST00000458356.5)-havana->ERG(ENST00000481609.5)-havana,TMPRSS2(ENST00000458356.5)-havana->ERG(ENST00000492833.5)-ensembl_havana,TMPRSS2(ENST00000463138.1)-havana->ERG(ENST00000398897.5)-ensembl_havana,TMPRSS2(ENST00000463138.1)-havana->ERG(ENST00000429727.6)-ensembl,TMPRSS2(ENST00000463138.1)-havana->ERG(ENST00000453032.6)-ensembl_havana,TMPRSS2(ENST00000463138.1)-havana->ERG(ENST00000468474.5)-havana,TMPRSS2(ENST00000463138.1)-havana->ERG
(ENST00000473107.1)-havana,TMPRSS2(ENST00000463138.1)-havana->ERG(ENST00000481609.5)-havana,TMPRSS2(ENST00000463138.1)-havana->ERG(ENST00000485493.1)-havana,TMPRSS2(ENST00000463138.1)-havana->ERG(ENST00000492833.5)-ensembl_havana,TMPRSS2(ENST00000497881.5)-havana->ERG(ENST00000398897.5)-ensembl_havana,TMPRSS2(ENST00000497881.5)-havana->ERG(ENST00000429727.6)-ensembl,TMPRSS2(ENST00000497881.5)-havana->ERG(ENST00000453032.6)-ensembl_havana,TMPRSS2(ENST00000497881.5)-havana->ERG(ENST00000468474.5)-havana,TMPRSS2(ENST00000497881.5)-havana->ERG(ENST00000473107.1)-havana,TMPRSS2(ENST00000497881.5)-havana->ERG(ENST00000481609.5)-havana,TMPRSS2(ENST00000497881.5)-havana->ERG(ENST00000485493.1)-havana,TMPRSS2(ENST00000497881.5)-havana->ERG(ENST00000492833.5)-ensembl_havana TMPRSS2(ENST00000332149.9)-ensembl_havana->ERG(ENST00000288319.11)-ensembl_havana,TMPRSS2(ENST00000332149.9)-ensembl_havana->ERG(ENST00000398905.5)-havana,TMPRSS2(ENST00000332149.9)-ensembl_havana->ERG(ENST00000398907.5)-havana,TMPRSS2(ENST00000332149.9)-ensembl_havana->ERG(ENST00000398910.5)-havana,TMPRSS2(ENST00000332149.9)-ensembl_havana->ERG(ENST00000398911.5)-havana,TMPRSS2(ENST00000332149.9)-ensembl_havana->ERG(ENST00000398919.6)-ensembl_havana,TMPRSS2(ENST00000332149.9)-ensembl_havana->ERG(ENST00000417133.6)-ensembl_havana,TMPRSS2(ENST00000332149.9)-ensembl_havana->ERG(ENST00000442448.5)-ensembl,TMPRSS2(ENST00000398585.7)-ensembl->ERG(ENST00000288319.11)-ensembl_havana,TMPRSS2(ENST00000398585.7)-ensembl->ERG(ENST00000398905.5)-havana,TMPRSS2(ENST00000398585.7)-ensembl->ERG(ENST00000398907.5)-havana,TMPRSS2(ENST00000398585.7)-ensembl->ERG(ENST00000398910.5)-havana,TMPRSS2(ENST00000398585.7)-ensembl->ERG(ENST00000398911.5)-havana,TMPRSS2(ENST00000398585.7)-ensembl->ERG(ENST00000398919.6)-ensembl_havana,TMPRSS2(ENST00000398585.7)-ensembl->ERG(ENST00000417133.6)-ensembl_havana,TMPRSS2(ENST00000398585.7)-ensembl->ERG(ENST00000442448.5)-ensembl,TMPRSS2(ENST00000424093.5)-havana->ERG(ENST00000288319.11)-en
sembl_havana,TMPRSS2(ENST00000424093.5)-havana->ERG(ENST00000398905.5)-havana,TMPRSS2(ENST00000424093.5)-havana->ERG(ENST00000398907.5)-havana,TMPRSS2(ENST00000424093.5)-havana->ERG(ENST00000398910.5)-havana,TMPRSS2(ENST00000424093.5)-havana->ERG(ENST00000398911.5)-havana,TMPRSS2(ENST00000424093.5)-havana->ERG(ENST00000398919.6)-ensembl_havana,TMPRSS2(ENST00000424093.5)-havana->ERG(ENST00000417133.6)-ensembl_havana,TMPRSS2(ENST00000424093.5)-havana->ERG(ENST00000442448.5)-ensembl,TMPRSS2(ENST00000454499.5)-havana->ERG(ENST00000288319.11)-ensembl_havana,TMPRSS2(ENST00000454499.5)-havana->ERG(ENST00000398905.5)-havana,TMPRSS2(ENST00000454499.5)-havana->ERG(ENST00000398907.5)-havana,TMPRSS2(ENST00000454499.5)-havana->ERG(ENST00000398910.5)-havana,TMPRSS2(ENST00000454499.5)-havana->ERG(ENST00000398911.5)-havana,TMPRSS2(ENST00000454499.5)-havana->ERG(ENST00000398919.6)-ensembl_havana,TMPRSS2(ENST00000454499.5)-havana->ERG(ENST00000417133.6)-ensembl_havana,TMPRSS2(ENST00000454499.5)-havana->ERG(ENST00000442448.5)-ensembl,TMPRSS2(ENST00000455813.1)-havana->ERG(ENST00000288319.11)-ensembl_havana,TMPRSS2(ENST00000455813.1)-havana->ERG(ENST00000398905.5)-havana,TMPRSS2(ENST00000455813.1)-havana->ERG(ENST00000398907.5)-havana,TMPRSS2(ENST00000455813.1)-havana->ERG(ENST00000398910.5)-havana,TMPRSS2(ENST00000455813.1)-havana->ERG(ENST00000398911.5)-havana,TMPRSS2(ENST00000455813.1)-havana->ERG(ENST00000398919.6)-ensembl_havana,TMPRSS2(ENST00000455813.1)-havana->ERG(ENST00000417133.6)-ensembl_havana,TMPRSS2(ENST00000455813.1)-havana->ERG(ENST00000442448.5)-ensembl,TMPRSS2(ENST00000458356.5)-havana->ERG(ENST00000288319.11)-ensembl_havana,TMPRSS2(ENST00000458356.5)-havana->ERG(ENST00000398905.5)-havana,TMPRSS2(ENST00000458356.5)-havana->ERG(ENST00000398907.5)-havana,TMPRSS2(ENST00000458356.5)-havana->ERG(ENST00000398910.5)-havana,TMPRSS2(ENST00000458356.5)-havana->ERG(ENST00000398911.5)-havana,TMPRSS2(ENST00000458356.5)-havana->ERG(ENST00000398919.6)-ensembl_havana,TMPRSS2(ENST0000
0458356.5)-havana->ERG(ENST00000417133.6)-ensembl_havana,TMPRSS2(ENST00000458356.5)-havana->ERG(ENST00000442448.5)-ensembl TMPRSS2(ENST00000332149.9)-ensembl_havana(+0)->ERG(ENST00000288319.11)-ensembl_havana(+1),TMPRSS2(ENST00000332149.9)-ensembl_havana(+0)->ERG(ENST00000398905.5)-havana(+1),TMPRSS2(ENST00000332149.9)-ensembl_havana(+0)->ERG(ENST00000398907.5)-havana(+1),TMPRSS2(ENST00000332149.9)-ensembl_havana(+0)->ERG(ENST00000398910.5)-havana(+1),TMPRSS2(ENST00000332149.9)-ensembl_havana(+0)->ERG(ENST00000398911.5)-havana(+1),TMPRSS2(ENST00000332149.9)-ensembl_havana(+0)->ERG(ENST00000398919.6)-ensembl_havana(+1),TMPRSS2(ENST00000332149.9)-ensembl_havana(+0)->ERG(ENST00000417133.6)-ensembl_havana(+1),TMPRSS2(ENST00000332149.9)-ensembl_havana(+0)->ERG(ENST00000429727.6)-ensembl(+1),TMPRSS2(ENST00000332149.9)-ensembl_havana(+0)->ERG(ENST00000442448.5)-ensembl(+1),TMPRSS2(ENST00000398585.7)-ensembl(+0)->ERG(ENST00000288319.11)-ensembl_havana(+1),TMPRSS2(ENST00000398585.7)-ensembl(+0)->ERG(ENST00000398905.5)-havana(+1),TMPRSS2(ENST00000398585.7)-ensembl(+0)->ERG(ENST00000398907.5)-havana(+1),TMPRSS2(ENST00000398585.7)-ensembl(+0)->ERG(ENST00000398910.5)-havana(+1),TMPRSS2(ENST00000398585.7)-ensembl(+0)->ERG(ENST00000398911.5)-havana(+1),TMPRSS2(ENST00000398585.7)-ensembl(+0)->ERG(ENST00000398919.6)-ensembl_havana(+1),TMPRSS2(ENST00000398585.7)-ensembl(+0)->ERG(ENST00000417133.6)-ensembl_havana(+1),TMPRSS2(ENST00000398585.7)-ensembl(+0)->ERG(ENST00000429727.6)-ensembl(+1),TMPRSS2(ENST00000398585.7)-ensembl(+0)->ERG(ENST00000442448.5)-ensembl(+1),TMPRSS2(ENST00000398585.7)-ensembl(+1)->ERG(ENST00000288319.11)-ensembl_havana(+0),TMPRSS2(ENST00000398585.7)-ensembl(+1)->ERG(ENST00000398905.5)-havana(+0),TMPRSS2(ENST00000398585.7)-ensembl(+1)->ERG(ENST00000398907.5)-havana(+0),TMPRSS2(ENST00000398585.7)-ensembl(+1)->ERG(ENST00000398910.5)-havana(+0),TMPRSS2(ENST00000398585.7)-ensembl(+1)->ERG(ENST00000398911.5)-havana(+0),TMPRSS2(ENST00000398585.7)-ensembl(+1)->ERG(ENST0
0000398919.6)-ensembl_havana(+0),TMPRSS2(ENST00000398585.7)-ensembl(+1)->ERG(ENST00000417133.6)-ensembl_havana(+0),TMPRSS2(ENST00000398585.7)-ensembl(+1)->ERG(ENST00000442448.5)-ensembl(+0),TMPRSS2(ENST00000424093.5)-havana(+0)->ERG(ENST00000288319.11)-ensembl_havana(+1),TMPRSS2(ENST00000424093.5)-havana(+0)->ERG(ENST00000398905.5)-havana(+1),TMPRSS2(ENST00000424093.5)-havana(+0)->ERG(ENST00000398907.5)-havana(+1),TMPRSS2(ENST00000424093.5)-havana(+0)->ERG(ENST00000398910.5)-havana(+1),TMPRSS2(ENST00000424093.5)-havana(+0)->ERG(ENST00000398911.5)-havana(+1),TMPRSS2(ENST00000424093.5)-havana(+0)->ERG(ENST00000398919.6)-ensembl_havana(+1),TMPRSS2(ENST00000424093.5)-havana(+0)->ERG(ENST00000417133.6)-ensembl_havana(+1),TMPRSS2(ENST00000424093.5)-havana(+0)->ERG(ENST00000429727.6)-ensembl(+1),TMPRSS2(ENST00000424093.5)-havana(+0)->ERG(ENST00000442448.5)-ensembl(+1),TMPRSS2(ENST00000454499.5)-havana(+0)->ERG(ENST00000288319.11)-ensembl_havana(+1),TMPRSS2(ENST00000454499.5)-havana(+0)->ERG(ENST00000398905.5)-havana(+1),TMPRSS2(ENST00000454499.5)-havana(+0)->ERG(ENST00000398907.5)-havana(+1),TMPRSS2(ENST00000454499.5)-havana(+0)->ERG(ENST00000398910.5)-havana(+1),TMPRSS2(ENST00000454499.5)-havana(+0)->ERG(ENST00000398911.5)-havana(+1),TMPRSS2(ENST00000454499.5)-havana(+0)->ERG(ENST00000398919.6)-ensembl_havana(+1),TMPRSS2(ENST00000454499.5)-havana(+0)->ERG(ENST00000417133.6)-ensembl_havana(+1),TMPRSS2(ENST00000454499.5)-havana(+0)->ERG(ENST00000429727.6)-ensembl(+1),TMPRSS2(ENST00000454499.5)-havana(+0)->ERG(ENST00000442448.5)-ensembl(+1),TMPRSS2(ENST00000455813.1)-havana(+0)->ERG(ENST00000288319.11)-ensembl_havana(+1),TMPRSS2(ENST00000455813.1)-havana(+0)->ERG(ENST00000398905.5)-havana(+1),TMPRSS2(ENST00000455813.1)-havana(+0)->ERG(ENST00000398907.5)-havana(+1),TMPRSS2(ENST00000455813.1)-havana(+0)->ERG(ENST00000398910.5)-havana(+1),TMPRSS2(ENST00000455813.1)-havana(+0)->ERG(ENST00000398911.5)-havana(+1),TMPRSS2(ENST00000455813.1)-havana(+0)->ERG(ENST00000398919.6)-ensemb
l_havana(+1),TMPRSS2(ENST00000455813.1)-havana(+0)->ERG(ENST00000417133.6)-ensembl_havana(+1),TMPRSS2(ENST00000455813.1)-havana(+0)->ERG(ENST00000429727.6)-ensembl(+1),TMPRSS2(ENST00000455813.1)-havana(+0)->ERG(ENST00000442448.5)-ensembl(+1),TMPRSS2(ENST00000458356.5)-havana(+0)->ERG(ENST00000288319.11)-ensembl_havana(+1),TMPRSS2(ENST00000458356.5)-havana(+0)->ERG(ENST00000398905.5)-havana(+1),TMPRSS2(ENST00000458356.5)-havana(+0)->ERG(ENST00000398907.5)-havana(+1),TMPRSS2(ENST00000458356.5)-havana(+0)->ERG(ENST00000398910.5)-havana(+1),TMPRSS2(ENST00000458356.5)-havana(+0)->ERG(ENST00000398911.5)-havana(+1),TMPRSS2(ENST00000458356.5)-havana(+0)->ERG(ENST00000398919.6)-ensembl_havana(+1),TMPRSS2(ENST00000458356.5)-havana(+0)->ERG(ENST00000417133.6)-ensembl_havana(+1),TMPRSS2(ENST00000458356.5)-havana(+0)->ERG(ENST00000429727.6)-ensembl(+1),TMPRSS2(ENST00000458356.5)-havana(+0)->ERG(ENST00000442448.5)-ensembl(+1) TMPRSS2(ENST00000332149.9)-ensembl_havana(+2)->ERG(ENST00000288319.11)-ensembl_havana(+0),TMPRSS2(ENST00000332149.9)-ensembl_havana(+2)->ERG(ENST00000398905.5)-havana(+0),TMPRSS2(ENST00000332149.9)-ensembl_havana(+2)->ERG(ENST00000398907.5)-havana(+0),TMPRSS2(ENST00000332149.9)-ensembl_havana(+2)->ERG(ENST00000398910.5)-havana(+0),TMPRSS2(ENST00000332149.9)-ensembl_havana(+2)->ERG(ENST00000398911.5)-havana(+0),TMPRSS2(ENST00000332149.9)-ensembl_havana(+2)->ERG(ENST00000398919.6)-ensembl_havana(+0),TMPRSS2(ENST00000332149.9)-ensembl_havana(+2)->ERG(ENST00000417133.6)-ensembl_havana(+0),TMPRSS2(ENST00000332149.9)-ensembl_havana(+2)->ERG(ENST00000442448.5)-ensembl(+0),TMPRSS2(ENST00000398585.7)-ensembl(+2)->ERG(ENST00000288319.11)-ensembl_havana(+0),TMPRSS2(ENST00000398585.7)-ensembl(+2)->ERG(ENST00000398905.5)-havana(+0),TMPRSS2(ENST00000398585.7)-ensembl(+2)->ERG(ENST00000398907.5)-havana(+0),TMPRSS2(ENST00000398585.7)-ensembl(+2)->ERG(ENST00000398910.5)-havana(+0),TMPRSS2(ENST00000398585.7)-ensembl(+2)->ERG(ENST00000398911.5)-havana(+0),TMPRSS2(ENST000003985
85.7)-ensembl(+2)->ERG(ENST00000398919.6)-ensembl_havana(+0),TMPRSS2(ENST00000398585.7)-ensembl(+2)->ERG(ENST00000417133.6)-ensembl_havana(+0),TMPRSS2(ENST00000398585.7)-ensembl(+2)->ERG(ENST00000442448.5)-ensembl(+0),TMPRSS2(ENST00000424093.5)-havana(+2)->ERG(ENST00000288319.11)-ensembl_havana(+0),TMPRSS2(ENST00000424093.5)-havana(+2)->ERG(ENST00000398905.5)-havana(+0),TMPRSS2(ENST00000424093.5)-havana(+2)->ERG(ENST00000398907.5)-havana(+0),TMPRSS2(ENST00000424093.5)-havana(+2)->ERG(ENST00000398910.5)-havana(+0),TMPRSS2(ENST00000424093.5)-havana(+2)->ERG(ENST00000398911.5)-havana(+0),TMPRSS2(ENST00000424093.5)-havana(+2)->ERG(ENST00000398919.6)-ensembl_havana(+0),TMPRSS2(ENST00000424093.5)-havana(+2)->ERG(ENST00000417133.6)-ensembl_havana(+0),TMPRSS2(ENST00000424093.5)-havana(+2)->ERG(ENST00000442448.5)-ensembl(+0),TMPRSS2(ENST00000454499.5)-havana(+2)->ERG(ENST00000288319.11)-ensembl_havana(+0),TMPRSS2(ENST00000454499.5)-havana(+2)->ERG(ENST00000398905.5)-havana(+0),TMPRSS2(ENST00000454499.5)-havana(+2)->ERG(ENST00000398907.5)-havana(+0),TMPRSS2(ENST00000454499.5)-havana(+2)->ERG(ENST00000398910.5)-havana(+0),TMPRSS2(ENST00000454499.5)-havana(+2)->ERG(ENST00000398911.5)-havana(+0),TMPRSS2(ENST00000454499.5)-havana(+2)->ERG(ENST00000398919.6)-ensembl_havana(+0),TMPRSS2(ENST00000454499.5)-havana(+2)->ERG(ENST00000417133.6)-ensembl_havana(+0),TMPRSS2(ENST00000454499.5)-havana(+2)->ERG(ENST00000442448.5)-ensembl(+0),TMPRSS2(ENST00000458356.5)-havana(+2)->ERG(ENST00000288319.11)-ensembl_havana(+0),TMPRSS2(ENST00000458356.5)-havana(+2)->ERG(ENST00000398905.5)-havana(+0),TMPRSS2(ENST00000458356.5)-havana(+2)->ERG(ENST00000398907.5)-havana(+0),TMPRSS2(ENST00000458356.5)-havana(+2)->ERG(ENST00000398910.5)-havana(+0),TMPRSS2(ENST00000458356.5)-havana(+2)->ERG(ENST00000398911.5)-havana(+0),TMPRSS2(ENST00000458356.5)-havana(+2)->ERG(ENST00000398919.6)-ensembl_havana(+0),TMPRSS2(ENST00000458356.5)-havana(+2)->ERG(ENST00000417133.6)-ensembl_havana(+0),TMPRSS2(ENST00000458356.5)
-havana(+2)->ERG(ENST00000442448.5)-ensembl(+0) 50 50 50 50 chr21:38445621/38445622(-)->chr21:41498118/41498119(+):(discordant_mates:14,spanning_paired_1:114,spanning_paired_1_t:4,spanning_paired_2:114,spanning_paired_2_t:4)&chr21:38445621/38445622(-)->chr21:41508080/41508081(+):(spanning_paired_1:33,spanning_paired_2:33)&chr21:38445621/38445622(-)->chr21:41507949/41507950(+):(discordant_mates:8,spanning_paired_1:15,spanning_paired_2:15)&chr21:38474121/38474122(-)->chr21:41498118/41498119(+):(spanning_paired_1:14,spanning_paired_2:14)&chr21:38445621/38445622(-)->chr21:41480475/41480476(+):(discordant_mates:6,spanning_paired_1:2,spanning_paired_2:2)&chr21:38423561/38423562(-)->chr21:41498118/41498119(+):(spanning_paired_1:2,spanning_paired_2:2)&chr21:38445621/38445622(-)->chr21:41506444/41506445(+):(discordant_mates:6,spanning_paired_1_t:1,spanning_paired_2_t:1)&chr21:38445621/38445622(-)->chr21:41504366/41504367(+):(spanning_paired_1:1,spanning_paired_2:1)&chr21:38446208/38446209(-)->chr21:41498118/41498119(+):(spanning_paired_1:1,spanning_paired_2:1)&chr21:38474121/38474122(-)->chr21:41480475/41480476(+):(spanning_paired_1:1,spanning_paired_2:1)&chr21:38474121/38474122(-)->chr21:41508080/41508081(+):(spanning_paired_1:1,spanning_paired_2:1)&chr21:38423567/38423568(-)->chr21:41485824/41485825(+):(discordant_mates:4)&chr21:38445482/38445483(-)->chr21:41500346/41500347(+):(discordant_mates:2)&chr21:38474093/38474094(-)->chr21:41507966/41507967(+):(discordant_mates:2) -2 chr21:36338771->chr21:36344707 chr21 36338771 + 0 10 chr21 36344707 - 10 0 5936 valid circular exonic 117 58 38 3 10000 0 2 2 2 0 0 0.8536 0.8591 2.436 0.871 2.1714 19.1829 0.9605 0 0.1021 2.0941 32.6902 0.9725 0 0.0814 0.0789 0.2479 2 50 50 50 50 
chr21:36338771/36338772(+)->chr21:36344707/36344708(-):(discordant_mates:6,spanning_paired_1:14,spanning_paired_1_t:23,spanning_paired_2:14,spanning_paired_2_t:23)&chr21:36338841/36338842(+)->chr21:36341534/36341535(-):(spanning_paired_1:1,spanning_paired_2:1) -3 chr21:29329693->chr21:29321220 chr21 29321220 + 10 0 chr21 29329693 - 0 10 8473 valid circular exonic 49 35 15 4 10000 0 4 4 3 0 0 0.8605 0.8018 0 0.6667 8.3531 34.3077 0.9863 0 0.4412 9.4685 14.5897 0.9701 0 0.749 0.2667 0.3571 1.75 50 50 50 50 chr21:29321220/29321221(+)->chr21:29329693/29329694(-):(discordant_mates:4,spanning_paired_1:9,spanning_paired_1_t:1,spanning_paired_2:9,spanning_paired_2_t:1)&chr21:29326058/29326059(+)->chr21:29329693/29329694(-):(spanning_paired_1:4,spanning_paired_2:4)&chr21:29327324/29327325(+)->chr21:29329704/29329705(-):(spanning_paired_1_t:1,spanning_paired_2_t:1)&chr21:29321216/29321217(+)->chr21:29329647/29329648(-):(discordant_mates:4) -4 chr21:33432871<->chr21:33414887 chr21 33414887 + 5 5 chr21 33432871 - 5 5 17984 valid circular exonic 20 12 6 2 10000 0 3 3 2 1 0 0.8277 0.9167 0 0.75 12.6 50.8 0.7995 0.1045 5.4663 12.8 24.2 0.8072 0.0987 5.4049 0.3333 0.3 1.6667 50 50 50 50 chr21:33414887/33414888(+)->chr21:33432871/33432872(-):(spanning_paired_1:2,spanning_paired_1_t:3,spanning_paired_2:2,spanning_paired_2_t:3)&chr21:33426883/33426884(+)->chr21:33432871/33432872(-):(spanning_paired_1:1,spanning_paired_2:1)&chr21:33414915/33414916(+)->chr21:33421509/33421510(-):(discordant_mates:4) +shared-id fusion chr-A pos-A direction-A pos-A-acceptor pos-A-donor chr-B pos-B direction-B pos-A-acceptor pos-A-donor genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads alignment-score mismatches n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps lr-A-slope lr-A-intercept lr-A-rvalue lr-A-pvalue lr-A-stderr lr-B-slope lr-B-intercept lr-B-rvalue lr-B-pvalue 
lr-B-stderr disco/split clips/score nodes/edge full-gene-dysregulation frameshift=0 frameshift=+1 frameshift=+2 splice-motif-edit-distance median-AS-A median-AS-B max-AS-A max-AS-B data-structure +1 TMPRSS2->ERG chr21 38487350 - 200 0 chr21 41479719 + 0 200 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 chr21:38487350/38487351(-)->chr21:41479719/41479720(+):(discordant_mates:14,spanning_paired_1:41,spanning_paired_1_t:7,spanning_paired_2:41,spanning_paired_2_t:7) +1 TMPRSS2->ERG chr21 38445621 - 200 0 chr21 41498118 + 0 200 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 TMPRSS2(ENST00000332149.9)-ensembl_havana->ERG(ENST00000429727.6)-ensembl,TMPRSS2(ENST00000332149.9)-ensembl_havana->ERG(ENST00000468474.5)-havana,TMPRSS2(ENST00000332149.9)-ensembl_havana->ERG(ENST00000473107.1)-havana,TMPRSS2(ENST00000332149.9)-ensembl_havana->ERG(ENST00000481609.5)-havana,TMPRSS2(ENST00000332149.9)-ensembl_havana->ERG(ENST00000485493.1)-havana,TMPRSS2(ENST00000332149.9)-ensembl_havana->ERG(ENST00000492833.5)-ensembl_havana,TMPRSS2(ENST00000424093.5)-havana->ERG(ENST00000429727.6)-ensembl,TMPRSS2(ENST00000424093.5)-havana->ERG(ENST00000468474.5)-havana,TMPRSS2(ENST00000424093.5)-havana->ERG(ENST00000473107.1)-havana,TMPRSS2(ENST00000424093.5)-havana->ERG(ENST00000481609.5)-havana,TMPRSS2(ENST00000424093.5)-havana->ERG(ENST00000485493.1)-havana,TMPRSS2(ENST00000424093.5)-havana->ERG(ENST00000492833.5)-ensembl_havana,TMPRSS2(ENST00000455813.1)-havana->ERG(ENST00000429727.6)-ensembl,TMPRSS2(ENST00000455813.1)-havana->ERG(ENST00000468474.5)-havana,TMPRSS2(ENST00000455813.1)-havana->ERG(ENST00000473107.1)-havana,TMPRSS2(ENST00000455813.1)-havana->ERG(ENST00000481609.5)-havana,TMPRSS2(ENST00000455813.1)-havana->ERG(ENST00000492833.5)-ensembl_hava
na,TMPRSS2(ENST00000458356.5)-havana->ERG(ENST00000429727.6)-ensembl,TMPRSS2(ENST00000458356.5)-havana->ERG(ENST00000468474.5)-havana,TMPRSS2(ENST00000458356.5)-havana->ERG(ENST00000473107.1)-havana,TMPRSS2(ENST00000458356.5)-havana->ERG(ENST00000481609.5)-havana,TMPRSS2(ENST00000458356.5)-havana->ERG(ENST00000492833.5)-ensembl_havana,TMPRSS2(ENST00000463138.1)-havana->ERG(ENST00000398897.5)-ensembl_havana,TMPRSS2(ENST00000463138.1)-havana->ERG(ENST00000429727.6)-ensembl,TMPRSS2(ENST00000463138.1)-havana->ERG(ENST00000453032.6)-ensembl_havana,TMPRSS2(ENST00000463138.1)-havana->ERG(ENST00000468474.5)-havana,TMPRSS2(ENST00000463138.1)-havana->ERG(ENST00000473107.1)-havana,TMPRSS2(ENST00000463138.1)-havana->ERG(ENST00000481609.5)-havana,TMPRSS2(ENST00000463138.1)-havana->ERG(ENST00000485493.1)-havana,TMPRSS2(ENST00000463138.1)-havana->ERG(ENST00000492833.5)-ensembl_havana,TMPRSS2(ENST00000497881.5)-havana->ERG(ENST00000398897.5)-ensembl_havana,TMPRSS2(ENST00000497881.5)-havana->ERG(ENST00000429727.6)-ensembl,TMPRSS2(ENST00000497881.5)-havana->ERG(ENST00000453032.6)-ensembl_havana,TMPRSS2(ENST00000497881.5)-havana->ERG(ENST00000468474.5)-havana,TMPRSS2(ENST00000497881.5)-havana->ERG(ENST00000473107.1)-havana,TMPRSS2(ENST00000497881.5)-havana->ERG(ENST00000481609.5)-havana,TMPRSS2(ENST00000497881.5)-havana->ERG(ENST00000485493.1)-havana,TMPRSS2(ENST00000497881.5)-havana->ERG(ENST00000492833.5)-ensembl_havana 
TMPRSS2(ENST00000332149.9)-ensembl_havana->ERG(ENST00000288319.11)-ensembl_havana,TMPRSS2(ENST00000332149.9)-ensembl_havana->ERG(ENST00000398905.5)-havana,TMPRSS2(ENST00000332149.9)-ensembl_havana->ERG(ENST00000398907.5)-havana,TMPRSS2(ENST00000332149.9)-ensembl_havana->ERG(ENST00000398910.5)-havana,TMPRSS2(ENST00000332149.9)-ensembl_havana->ERG(ENST00000398911.5)-havana,TMPRSS2(ENST00000332149.9)-ensembl_havana->ERG(ENST00000398919.6)-ensembl_havana,TMPRSS2(ENST00000332149.9)-ensembl_havana->ERG(ENST00000417133.6)-ensembl_havana,TMPRSS2(ENST00000332149.9)-ensembl_havana->ERG(ENST00000442448.5)-ensembl,TMPRSS2(ENST00000398585.7)-ensembl->ERG(ENST00000288319.11)-ensembl_havana,TMPRSS2(ENST00000398585.7)-ensembl->ERG(ENST00000398905.5)-havana,TMPRSS2(ENST00000398585.7)-ensembl->ERG(ENST00000398907.5)-havana,TMPRSS2(ENST00000398585.7)-ensembl->ERG(ENST00000398910.5)-havana,TMPRSS2(ENST00000398585.7)-ensembl->ERG(ENST00000398911.5)-havana,TMPRSS2(ENST00000398585.7)-ensembl->ERG(ENST00000398919.6)-ensembl_havana,TMPRSS2(ENST00000398585.7)-ensembl->ERG(ENST00000417133.6)-ensembl_havana,TMPRSS2(ENST00000398585.7)-ensembl->ERG(ENST00000442448.5)-ensembl,TMPRSS2(ENST00000424093.5)-havana->ERG(ENST00000288319.11)-ensembl_havana,TMPRSS2(ENST00000424093.5)-havana->ERG(ENST00000398905.5)-havana,TMPRSS2(ENST00000424093.5)-havana->ERG(ENST00000398907.5)-havana,TMPRSS2(ENST00000424093.5)-havana->ERG(ENST00000398910.5)-havana,TMPRSS2(ENST00000424093.5)-havana->ERG(ENST00000398911.5)-havana,TMPRSS2(ENST00000424093.5)-havana->ERG(ENST00000398919.6)-ensembl_havana,TMPRSS2(ENST00000424093.5)-havana->ERG(ENST00000417133.6)-ensembl_havana,TMPRSS2(ENST00000424093.5)-havana->ERG(ENST00000442448.5)-ensembl,TMPRSS2(ENST00000454499.5)-havana->ERG(ENST00000288319.11)-ensembl_havana,TMPRSS2(ENST00000454499.5)-havana->ERG(ENST00000398905.5)-havana,TMPRSS2(ENST00000454499.5)-havana->ERG(ENST00000398907.5)-havana,TMPRSS2(ENST00000454499.5)-havana->ERG(ENST00000398910.5)-havana,TMPRSS2(ENST000004544
99.5)-havana->ERG(ENST00000398911.5)-havana,TMPRSS2(ENST00000454499.5)-havana->ERG(ENST00000398919.6)-ensembl_havana,TMPRSS2(ENST00000454499.5)-havana->ERG(ENST00000417133.6)-ensembl_havana,TMPRSS2(ENST00000454499.5)-havana->ERG(ENST00000442448.5)-ensembl,TMPRSS2(ENST00000455813.1)-havana->ERG(ENST00000288319.11)-ensembl_havana,TMPRSS2(ENST00000455813.1)-havana->ERG(ENST00000398905.5)-havana,TMPRSS2(ENST00000455813.1)-havana->ERG(ENST00000398907.5)-havana,TMPRSS2(ENST00000455813.1)-havana->ERG(ENST00000398910.5)-havana,TMPRSS2(ENST00000455813.1)-havana->ERG(ENST00000398911.5)-havana,TMPRSS2(ENST00000455813.1)-havana->ERG(ENST00000398919.6)-ensembl_havana,TMPRSS2(ENST00000455813.1)-havana->ERG(ENST00000417133.6)-ensembl_havana,TMPRSS2(ENST00000455813.1)-havana->ERG(ENST00000442448.5)-ensembl,TMPRSS2(ENST00000458356.5)-havana->ERG(ENST00000288319.11)-ensembl_havana,TMPRSS2(ENST00000458356.5)-havana->ERG(ENST00000398905.5)-havana,TMPRSS2(ENST00000458356.5)-havana->ERG(ENST00000398907.5)-havana,TMPRSS2(ENST00000458356.5)-havana->ERG(ENST00000398910.5)-havana,TMPRSS2(ENST00000458356.5)-havana->ERG(ENST00000398911.5)-havana,TMPRSS2(ENST00000458356.5)-havana->ERG(ENST00000398919.6)-ensembl_havana,TMPRSS2(ENST00000458356.5)-havana->ERG(ENST00000417133.6)-ensembl_havana,TMPRSS2(ENST00000458356.5)-havana->ERG(ENST00000442448.5)-ensembl 
TMPRSS2(ENST00000332149.9)-ensembl_havana(+0)->ERG(ENST00000288319.11)-ensembl_havana(+1),TMPRSS2(ENST00000332149.9)-ensembl_havana(+0)->ERG(ENST00000398905.5)-havana(+1),TMPRSS2(ENST00000332149.9)-ensembl_havana(+0)->ERG(ENST00000398907.5)-havana(+1),TMPRSS2(ENST00000332149.9)-ensembl_havana(+0)->ERG(ENST00000398910.5)-havana(+1),TMPRSS2(ENST00000332149.9)-ensembl_havana(+0)->ERG(ENST00000398911.5)-havana(+1),TMPRSS2(ENST00000332149.9)-ensembl_havana(+0)->ERG(ENST00000398919.6)-ensembl_havana(+1),TMPRSS2(ENST00000332149.9)-ensembl_havana(+0)->ERG(ENST00000417133.6)-ensembl_havana(+1),TMPRSS2(ENST00000332149.9)-ensembl_havana(+0)->ERG(ENST00000429727.6)-ensembl(+1),TMPRSS2(ENST00000332149.9)-ensembl_havana(+0)->ERG(ENST00000442448.5)-ensembl(+1),TMPRSS2(ENST00000398585.7)-ensembl(+0)->ERG(ENST00000288319.11)-ensembl_havana(+1),TMPRSS2(ENST00000398585.7)-ensembl(+0)->ERG(ENST00000398905.5)-havana(+1),TMPRSS2(ENST00000398585.7)-ensembl(+0)->ERG(ENST00000398907.5)-havana(+1),TMPRSS2(ENST00000398585.7)-ensembl(+0)->ERG(ENST00000398910.5)-havana(+1),TMPRSS2(ENST00000398585.7)-ensembl(+0)->ERG(ENST00000398911.5)-havana(+1),TMPRSS2(ENST00000398585.7)-ensembl(+0)->ERG(ENST00000398919.6)-ensembl_havana(+1),TMPRSS2(ENST00000398585.7)-ensembl(+0)->ERG(ENST00000417133.6)-ensembl_havana(+1),TMPRSS2(ENST00000398585.7)-ensembl(+0)->ERG(ENST00000429727.6)-ensembl(+1),TMPRSS2(ENST00000398585.7)-ensembl(+0)->ERG(ENST00000442448.5)-ensembl(+1),TMPRSS2(ENST00000398585.7)-ensembl(+1)->ERG(ENST00000288319.11)-ensembl_havana(+0),TMPRSS2(ENST00000398585.7)-ensembl(+1)->ERG(ENST00000398905.5)-havana(+0),TMPRSS2(ENST00000398585.7)-ensembl(+1)->ERG(ENST00000398907.5)-havana(+0),TMPRSS2(ENST00000398585.7)-ensembl(+1)->ERG(ENST00000398910.5)-havana(+0),TMPRSS2(ENST00000398585.7)-ensembl(+1)->ERG(ENST00000398911.5)-havana(+0),TMPRSS2(ENST00000398585.7)-ensembl(+1)->ERG(ENST00000398919.6)-ensembl_havana(+0),TMPRSS2(ENST00000398585.7)-ensembl(+1)->ERG(ENST00000417133.6)-ensembl_havana(+0),TMPRSS2(
ENST00000398585.7)-ensembl(+1)->ERG(ENST00000442448.5)-ensembl(+0),TMPRSS2(ENST00000424093.5)-havana(+0)->ERG(ENST00000288319.11)-ensembl_havana(+1),TMPRSS2(ENST00000424093.5)-havana(+0)->ERG(ENST00000398905.5)-havana(+1),TMPRSS2(ENST00000424093.5)-havana(+0)->ERG(ENST00000398907.5)-havana(+1),TMPRSS2(ENST00000424093.5)-havana(+0)->ERG(ENST00000398910.5)-havana(+1),TMPRSS2(ENST00000424093.5)-havana(+0)->ERG(ENST00000398911.5)-havana(+1),TMPRSS2(ENST00000424093.5)-havana(+0)->ERG(ENST00000398919.6)-ensembl_havana(+1),TMPRSS2(ENST00000424093.5)-havana(+0)->ERG(ENST00000417133.6)-ensembl_havana(+1),TMPRSS2(ENST00000424093.5)-havana(+0)->ERG(ENST00000429727.6)-ensembl(+1),TMPRSS2(ENST00000424093.5)-havana(+0)->ERG(ENST00000442448.5)-ensembl(+1),TMPRSS2(ENST00000454499.5)-havana(+0)->ERG(ENST00000288319.11)-ensembl_havana(+1),TMPRSS2(ENST00000454499.5)-havana(+0)->ERG(ENST00000398905.5)-havana(+1),TMPRSS2(ENST00000454499.5)-havana(+0)->ERG(ENST00000398907.5)-havana(+1),TMPRSS2(ENST00000454499.5)-havana(+0)->ERG(ENST00000398910.5)-havana(+1),TMPRSS2(ENST00000454499.5)-havana(+0)->ERG(ENST00000398911.5)-havana(+1),TMPRSS2(ENST00000454499.5)-havana(+0)->ERG(ENST00000398919.6)-ensembl_havana(+1),TMPRSS2(ENST00000454499.5)-havana(+0)->ERG(ENST00000417133.6)-ensembl_havana(+1),TMPRSS2(ENST00000454499.5)-havana(+0)->ERG(ENST00000429727.6)-ensembl(+1),TMPRSS2(ENST00000454499.5)-havana(+0)->ERG(ENST00000442448.5)-ensembl(+1),TMPRSS2(ENST00000455813.1)-havana(+0)->ERG(ENST00000288319.11)-ensembl_havana(+1),TMPRSS2(ENST00000455813.1)-havana(+0)->ERG(ENST00000398905.5)-havana(+1),TMPRSS2(ENST00000455813.1)-havana(+0)->ERG(ENST00000398907.5)-havana(+1),TMPRSS2(ENST00000455813.1)-havana(+0)->ERG(ENST00000398910.5)-havana(+1),TMPRSS2(ENST00000455813.1)-havana(+0)->ERG(ENST00000398911.5)-havana(+1),TMPRSS2(ENST00000455813.1)-havana(+0)->ERG(ENST00000398919.6)-ensembl_havana(+1),TMPRSS2(ENST00000455813.1)-havana(+0)->ERG(ENST00000417133.6)-ensembl_havana(+1),TMPRSS2(ENST00000455813.1)-ha
vana(+0)->ERG(ENST00000429727.6)-ensembl(+1),TMPRSS2(ENST00000455813.1)-havana(+0)->ERG(ENST00000442448.5)-ensembl(+1),TMPRSS2(ENST00000458356.5)-havana(+0)->ERG(ENST00000288319.11)-ensembl_havana(+1),TMPRSS2(ENST00000458356.5)-havana(+0)->ERG(ENST00000398905.5)-havana(+1),TMPRSS2(ENST00000458356.5)-havana(+0)->ERG(ENST00000398907.5)-havana(+1),TMPRSS2(ENST00000458356.5)-havana(+0)->ERG(ENST00000398910.5)-havana(+1),TMPRSS2(ENST00000458356.5)-havana(+0)->ERG(ENST00000398911.5)-havana(+1),TMPRSS2(ENST00000458356.5)-havana(+0)->ERG(ENST00000398919.6)-ensembl_havana(+1),TMPRSS2(ENST00000458356.5)-havana(+0)->ERG(ENST00000417133.6)-ensembl_havana(+1),TMPRSS2(ENST00000458356.5)-havana(+0)->ERG(ENST00000429727.6)-ensembl(+1),TMPRSS2(ENST00000458356.5)-havana(+0)->ERG(ENST00000442448.5)-ensembl(+1) TMPRSS2(ENST00000332149.9)-ensembl_havana(+2)->ERG(ENST00000288319.11)-ensembl_havana(+0),TMPRSS2(ENST00000332149.9)-ensembl_havana(+2)->ERG(ENST00000398905.5)-havana(+0),TMPRSS2(ENST00000332149.9)-ensembl_havana(+2)->ERG(ENST00000398907.5)-havana(+0),TMPRSS2(ENST00000332149.9)-ensembl_havana(+2)->ERG(ENST00000398910.5)-havana(+0),TMPRSS2(ENST00000332149.9)-ensembl_havana(+2)->ERG(ENST00000398911.5)-havana(+0),TMPRSS2(ENST00000332149.9)-ensembl_havana(+2)->ERG(ENST00000398919.6)-ensembl_havana(+0),TMPRSS2(ENST00000332149.9)-ensembl_havana(+2)->ERG(ENST00000417133.6)-ensembl_havana(+0),TMPRSS2(ENST00000332149.9)-ensembl_havana(+2)->ERG(ENST00000442448.5)-ensembl(+0),TMPRSS2(ENST00000398585.7)-ensembl(+2)->ERG(ENST00000288319.11)-ensembl_havana(+0),TMPRSS2(ENST00000398585.7)-ensembl(+2)->ERG(ENST00000398905.5)-havana(+0),TMPRSS2(ENST00000398585.7)-ensembl(+2)->ERG(ENST00000398907.5)-havana(+0),TMPRSS2(ENST00000398585.7)-ensembl(+2)->ERG(ENST00000398910.5)-havana(+0),TMPRSS2(ENST00000398585.7)-ensembl(+2)->ERG(ENST00000398911.5)-havana(+0),TMPRSS2(ENST00000398585.7)-ensembl(+2)->ERG(ENST00000398919.6)-ensembl_havana(+0),TMPRSS2(ENST00000398585.7)-ensembl(+2)->ERG(ENST00000417133.6)
-ensembl_havana(+0),TMPRSS2(ENST00000398585.7)-ensembl(+2)->ERG(ENST00000442448.5)-ensembl(+0),TMPRSS2(ENST00000424093.5)-havana(+2)->ERG(ENST00000288319.11)-ensembl_havana(+0),TMPRSS2(ENST00000424093.5)-havana(+2)->ERG(ENST00000398905.5)-havana(+0),TMPRSS2(ENST00000424093.5)-havana(+2)->ERG(ENST00000398907.5)-havana(+0),TMPRSS2(ENST00000424093.5)-havana(+2)->ERG(ENST00000398910.5)-havana(+0),TMPRSS2(ENST00000424093.5)-havana(+2)->ERG(ENST00000398911.5)-havana(+0),TMPRSS2(ENST00000424093.5)-havana(+2)->ERG(ENST00000398919.6)-ensembl_havana(+0),TMPRSS2(ENST00000424093.5)-havana(+2)->ERG(ENST00000417133.6)-ensembl_havana(+0),TMPRSS2(ENST00000424093.5)-havana(+2)->ERG(ENST00000442448.5)-ensembl(+0),TMPRSS2(ENST00000454499.5)-havana(+2)->ERG(ENST00000288319.11)-ensembl_havana(+0),TMPRSS2(ENST00000454499.5)-havana(+2)->ERG(ENST00000398905.5)-havana(+0),TMPRSS2(ENST00000454499.5)-havana(+2)->ERG(ENST00000398907.5)-havana(+0),TMPRSS2(ENST00000454499.5)-havana(+2)->ERG(ENST00000398910.5)-havana(+0),TMPRSS2(ENST00000454499.5)-havana(+2)->ERG(ENST00000398911.5)-havana(+0),TMPRSS2(ENST00000454499.5)-havana(+2)->ERG(ENST00000398919.6)-ensembl_havana(+0),TMPRSS2(ENST00000454499.5)-havana(+2)->ERG(ENST00000417133.6)-ensembl_havana(+0),TMPRSS2(ENST00000454499.5)-havana(+2)->ERG(ENST00000442448.5)-ensembl(+0),TMPRSS2(ENST00000458356.5)-havana(+2)->ERG(ENST00000288319.11)-ensembl_havana(+0),TMPRSS2(ENST00000458356.5)-havana(+2)->ERG(ENST00000398905.5)-havana(+0),TMPRSS2(ENST00000458356.5)-havana(+2)->ERG(ENST00000398907.5)-havana(+0),TMPRSS2(ENST00000458356.5)-havana(+2)->ERG(ENST00000398910.5)-havana(+0),TMPRSS2(ENST00000458356.5)-havana(+2)->ERG(ENST00000398911.5)-havana(+0),TMPRSS2(ENST00000458356.5)-havana(+2)->ERG(ENST00000398919.6)-ensembl_havana(+0),TMPRSS2(ENST00000458356.5)-havana(+2)->ERG(ENST00000417133.6)-ensembl_havana(+0),TMPRSS2(ENST00000458356.5)-havana(+2)->ERG(ENST00000442448.5)-ensembl(+0) 50 50 50 50 
chr21:38445621/38445622(-)->chr21:41498118/41498119(+):(discordant_mates:14,spanning_paired_1:114,spanning_paired_1_t:4,spanning_paired_2:114,spanning_paired_2_t:4)&chr21:38445621/38445622(-)->chr21:41508080/41508081(+):(spanning_paired_1:33,spanning_paired_2:33)&chr21:38445621/38445622(-)->chr21:41507949/41507950(+):(discordant_mates:8,spanning_paired_1:15,spanning_paired_2:15)&chr21:38474121/38474122(-)->chr21:41498118/41498119(+):(spanning_paired_1:14,spanning_paired_2:14)&chr21:38445621/38445622(-)->chr21:41480475/41480476(+):(discordant_mates:6,spanning_paired_1:2,spanning_paired_2:2)&chr21:38423561/38423562(-)->chr21:41498118/41498119(+):(spanning_paired_1:2,spanning_paired_2:2)&chr21:38445621/38445622(-)->chr21:41506444/41506445(+):(discordant_mates:6,spanning_paired_1_t:1,spanning_paired_2_t:1)&chr21:38445621/38445622(-)->chr21:41504366/41504367(+):(spanning_paired_1:1,spanning_paired_2:1)&chr21:38446208/38446209(-)->chr21:41498118/41498119(+):(spanning_paired_1:1,spanning_paired_2:1)&chr21:38474121/38474122(-)->chr21:41480475/41480476(+):(spanning_paired_1:1,spanning_paired_2:1)&chr21:38474121/38474122(-)->chr21:41508080/41508081(+):(spanning_paired_1:1,spanning_paired_2:1)&chr21:38423567/38423568(-)->chr21:41485824/41485825(+):(discordant_mates:4)&chr21:38445482/38445483(-)->chr21:41500346/41500347(+):(discordant_mates:2)&chr21:38474093/38474094(-)->chr21:41507966/41507967(+):(discordant_mates:2) +2 chr21:36338771->chr21:36344707 chr21 36338771 + 0 10 chr21 36344707 - 10 0 5936 valid circular exonic 117 58 38 3 10000 0 2 2 2 0 0 0.8536 0.8591 2.436 0.871 2.1714 19.1829 0.9605 0 0.1021 2.0941 32.6902 0.9725 0 0.0814 0.0789 0.2479 2 50 50 50 50 chr21:36338771/36338772(+)->chr21:36344707/36344708(-):(discordant_mates:6,spanning_paired_1:14,spanning_paired_1_t:23,spanning_paired_2:14,spanning_paired_2_t:23)&chr21:36338841/36338842(+)->chr21:36341534/36341535(-):(spanning_paired_1:1,spanning_paired_2:1) +3 chr21:29329693->chr21:29321220 chr21 29321220 + 10 0 
chr21 29329693 - 0 10 8473 valid circular exonic 49 35 15 4 10000 0 4 4 3 0 0 0.8605 0.8018 0 0.6667 8.3531 34.3077 0.9863 0 0.4412 9.4685 14.5897 0.9701 0 0.749 0.2667 0.3571 1.75 50 50 50 50 chr21:29321220/29321221(+)->chr21:29329693/29329694(-):(discordant_mates:4,spanning_paired_1:9,spanning_paired_1_t:1,spanning_paired_2:9,spanning_paired_2_t:1)&chr21:29326058/29326059(+)->chr21:29329693/29329694(-):(spanning_paired_1:4,spanning_paired_2:4)&chr21:29327324/29327325(+)->chr21:29329704/29329705(-):(spanning_paired_1_t:1,spanning_paired_2_t:1)&chr21:29321216/29321217(+)->chr21:29329647/29329648(-):(discordant_mates:4) +4 chr21:33432871<->chr21:33414887 chr21 33414887 + 5 5 chr21 33432871 - 5 5 17984 valid circular exonic 20 12 6 2 10000 0 3 3 2 1 0 0.8277 0.9167 0 0.75 12.6 50.8 0.7995 0.1045 5.4663 12.8 24.2 0.8072 0.0987 5.4049 0.3333 0.3 1.6667 50 50 50 50 chr21:33414887/33414888(+)->chr21:33432871/33432872(-):(spanning_paired_1:2,spanning_paired_1_t:3,spanning_paired_2:2,spanning_paired_2_t:3)&chr21:33426883/33426884(+)->chr21:33432871/33432872(-):(spanning_paired_1:1,spanning_paired_2:1)&chr21:33414915/33414916(+)->chr21:33421509/33421510(-):(discordant_mates:4) diff --git a/tests/integrate/test_terg_s041_b.out.txt b/tests/integrate/test_terg_s041_b.out.txt index 5adfb7cb..697bede1 100644 --- a/tests/integrate/test_terg_s041_b.out.txt +++ b/tests/integrate/test_terg_s041_b.out.txt @@ -1,6 +1,6 @@ -shared-id fusion chr-A pos-A direction-A pos-A-acceptor pos-A-donor chr-B pos-B direction-B pos-A-acceptor pos-A-donor genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads alignment-score mismatches n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps lr-A-slope lr-A-intercept lr-A-rvalue lr-A-pvalue lr-A-stderr lr-B-slope lr-B-intercept lr-B-rvalue lr-B-pvalue lr-B-stderr disco/split clips/score nodes/edge full-gene-dysregulation 
frameshift=0 frameshift=+1 frameshift=+2 median-AS-A median-AS-B max-AS-A max-AS-B data-structure -1 chr21:41479719->chr21:38487350 chr21 38487350 - 200 0 chr21 41479719 + 0 200 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 chr21:38487350/38487351(-)->chr21:41479719/41479720(+):(discordant_mates:14,spanning_paired_1:41,spanning_paired_1_t:7,spanning_paired_2:41,spanning_paired_2_t:7) -1 chr21:41498118->chr21:38445621 chr21 38445621 - 200 0 chr21 41498118 + 0 200 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 chr21:38445621/38445622(-)->chr21:41498118/41498119(+):(discordant_mates:14,spanning_paired_1:114,spanning_paired_1_t:4,spanning_paired_2:114,spanning_paired_2_t:4)&chr21:38445621/38445622(-)->chr21:41508080/41508081(+):(spanning_paired_1:33,spanning_paired_2:33)&chr21:38445621/38445622(-)->chr21:41507949/41507950(+):(discordant_mates:8,spanning_paired_1:15,spanning_paired_2:15)&chr21:38474121/38474122(-)->chr21:41498118/41498119(+):(spanning_paired_1:14,spanning_paired_2:14)&chr21:38445621/38445622(-)->chr21:41480475/41480476(+):(discordant_mates:6,spanning_paired_1:2,spanning_paired_2:2)&chr21:38423561/38423562(-)->chr21:41498118/41498119(+):(spanning_paired_1:2,spanning_paired_2:2)&chr21:38445621/38445622(-)->chr21:41506444/41506445(+):(discordant_mates:6,spanning_paired_1_t:1,spanning_paired_2_t:1)&chr21:38445621/38445622(-)->chr21:41504366/41504367(+):(spanning_paired_1:1,spanning_paired_2:1)&chr21:38446208/38446209(-)->chr21:41498118/41498119(+):(spanning_paired_1:1,spanning_paired_2:1)&chr21:38474121/38474122(-)->chr21:41480475/41480476(+):(spanning_paired_1:1,spanning_paired_2:1)&chr21:38474121/38474122(-)->chr21:41508080/41508081(+):(spanning_paired_1:1,spanning_paired_2:1)&chr21:38423567/38423568(-
)->chr21:41485824/41485825(+):(discordant_mates:4)&chr21:38445482/38445483(-)->chr21:41500346/41500347(+):(discordant_mates:2)&chr21:38474093/38474094(-)->chr21:41507966/41507967(+):(discordant_mates:2) -2 chr21:36338771->chr21:36344707 chr21 36338771 + 0 10 chr21 36344707 - 10 0 5936 valid circular exonic 117 58 38 3 10000 0 2 2 2 0 0 0.8536 0.8591 2.436 0.871 2.1714 19.1829 0.9605 0 0.1021 2.0941 32.6902 0.9725 0 0.0814 0.0789 0.2479 2 50 50 50 50 chr21:36338771/36338772(+)->chr21:36344707/36344708(-):(discordant_mates:6,spanning_paired_1:14,spanning_paired_1_t:23,spanning_paired_2:14,spanning_paired_2_t:23)&chr21:36338841/36338842(+)->chr21:36341534/36341535(-):(spanning_paired_1:1,spanning_paired_2:1) -3 chr21:29329693->chr21:29321220 chr21 29321220 + 10 0 chr21 29329693 - 0 10 8473 valid circular exonic 49 35 15 4 10000 0 4 4 3 0 0 0.8605 0.8018 0 0.6667 8.3531 34.3077 0.9863 0 0.4412 9.4685 14.5897 0.9701 0 0.749 0.2667 0.3571 1.75 50 50 50 50 chr21:29321220/29321221(+)->chr21:29329693/29329694(-):(discordant_mates:4,spanning_paired_1:9,spanning_paired_1_t:1,spanning_paired_2:9,spanning_paired_2_t:1)&chr21:29326058/29326059(+)->chr21:29329693/29329694(-):(spanning_paired_1:4,spanning_paired_2:4)&chr21:29327324/29327325(+)->chr21:29329704/29329705(-):(spanning_paired_1_t:1,spanning_paired_2_t:1)&chr21:29321216/29321217(+)->chr21:29329647/29329648(-):(discordant_mates:4) -4 chr21:33432871<->chr21:33414887 chr21 33414887 + 5 5 chr21 33432871 - 5 5 17984 valid circular exonic 20 12 6 2 10000 0 3 3 2 1 0 0.8277 0.9167 0 0.75 12.6 50.8 0.7995 0.1045 5.4663 12.8 24.2 0.8072 0.0987 5.4049 0.3333 0.3 1.6667 50 50 50 50 chr21:33414887/33414888(+)->chr21:33432871/33432872(-):(spanning_paired_1:2,spanning_paired_1_t:3,spanning_paired_2:2,spanning_paired_2_t:3)&chr21:33426883/33426884(+)->chr21:33432871/33432872(-):(spanning_paired_1:1,spanning_paired_2:1)&chr21:33414915/33414916(+)->chr21:33421509/33421510(-):(discordant_mates:4) +shared-id fusion chr-A pos-A direction-A 
pos-A-acceptor pos-A-donor chr-B pos-B direction-B pos-A-acceptor pos-A-donor genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads alignment-score mismatches n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps lr-A-slope lr-A-intercept lr-A-rvalue lr-A-pvalue lr-A-stderr lr-B-slope lr-B-intercept lr-B-rvalue lr-B-pvalue lr-B-stderr disco/split clips/score nodes/edge full-gene-dysregulation frameshift=0 frameshift=+1 frameshift=+2 splice-motif-edit-distance median-AS-A median-AS-B max-AS-A max-AS-B data-structure +1 chr21:41479719->chr21:38487350 chr21 38487350 - 200 0 chr21 41479719 + 0 200 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 chr21:38487350/38487351(-)->chr21:41479719/41479720(+):(discordant_mates:14,spanning_paired_1:41,spanning_paired_1_t:7,spanning_paired_2:41,spanning_paired_2_t:7) +1 chr21:41498118->chr21:38445621 chr21 38445621 - 200 0 chr21 41498118 + 0 200 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 
chr21:38445621/38445622(-)->chr21:41498118/41498119(+):(discordant_mates:14,spanning_paired_1:114,spanning_paired_1_t:4,spanning_paired_2:114,spanning_paired_2_t:4)&chr21:38445621/38445622(-)->chr21:41508080/41508081(+):(spanning_paired_1:33,spanning_paired_2:33)&chr21:38445621/38445622(-)->chr21:41507949/41507950(+):(discordant_mates:8,spanning_paired_1:15,spanning_paired_2:15)&chr21:38474121/38474122(-)->chr21:41498118/41498119(+):(spanning_paired_1:14,spanning_paired_2:14)&chr21:38445621/38445622(-)->chr21:41480475/41480476(+):(discordant_mates:6,spanning_paired_1:2,spanning_paired_2:2)&chr21:38423561/38423562(-)->chr21:41498118/41498119(+):(spanning_paired_1:2,spanning_paired_2:2)&chr21:38445621/38445622(-)->chr21:41506444/41506445(+):(discordant_mates:6,spanning_paired_1_t:1,spanning_paired_2_t:1)&chr21:38445621/38445622(-)->chr21:41504366/41504367(+):(spanning_paired_1:1,spanning_paired_2:1)&chr21:38446208/38446209(-)->chr21:41498118/41498119(+):(spanning_paired_1:1,spanning_paired_2:1)&chr21:38474121/38474122(-)->chr21:41480475/41480476(+):(spanning_paired_1:1,spanning_paired_2:1)&chr21:38474121/38474122(-)->chr21:41508080/41508081(+):(spanning_paired_1:1,spanning_paired_2:1)&chr21:38423567/38423568(-)->chr21:41485824/41485825(+):(discordant_mates:4)&chr21:38445482/38445483(-)->chr21:41500346/41500347(+):(discordant_mates:2)&chr21:38474093/38474094(-)->chr21:41507966/41507967(+):(discordant_mates:2) +2 chr21:36338771->chr21:36344707 chr21 36338771 + 0 10 chr21 36344707 - 10 0 5936 valid circular exonic 117 58 38 3 10000 0 2 2 2 0 0 0.8536 0.8591 2.436 0.871 2.1714 19.1829 0.9605 0 0.1021 2.0941 32.6902 0.9725 0 0.0814 0.0789 0.2479 2 50 50 50 50 chr21:36338771/36338772(+)->chr21:36344707/36344708(-):(discordant_mates:6,spanning_paired_1:14,spanning_paired_1_t:23,spanning_paired_2:14,spanning_paired_2_t:23)&chr21:36338841/36338842(+)->chr21:36341534/36341535(-):(spanning_paired_1:1,spanning_paired_2:1) +3 chr21:29329693->chr21:29321220 chr21 29321220 + 10 0 
chr21 29329693 - 0 10 8473 valid circular exonic 49 35 15 4 10000 0 4 4 3 0 0 0.8605 0.8018 0 0.6667 8.3531 34.3077 0.9863 0 0.4412 9.4685 14.5897 0.9701 0 0.749 0.2667 0.3571 1.75 50 50 50 50 chr21:29321220/29321221(+)->chr21:29329693/29329694(-):(discordant_mates:4,spanning_paired_1:9,spanning_paired_1_t:1,spanning_paired_2:9,spanning_paired_2_t:1)&chr21:29326058/29326059(+)->chr21:29329693/29329694(-):(spanning_paired_1:4,spanning_paired_2:4)&chr21:29327324/29327325(+)->chr21:29329704/29329705(-):(spanning_paired_1_t:1,spanning_paired_2_t:1)&chr21:29321216/29321217(+)->chr21:29329647/29329648(-):(discordant_mates:4) +4 chr21:33432871<->chr21:33414887 chr21 33414887 + 5 5 chr21 33432871 - 5 5 17984 valid circular exonic 20 12 6 2 10000 0 3 3 2 1 0 0.8277 0.9167 0 0.75 12.6 50.8 0.7995 0.1045 5.4663 12.8 24.2 0.8072 0.0987 5.4049 0.3333 0.3 1.6667 50 50 50 50 chr21:33414887/33414888(+)->chr21:33432871/33432872(-):(spanning_paired_1:2,spanning_paired_1_t:3,spanning_paired_2:2,spanning_paired_2_t:3)&chr21:33426883/33426884(+)->chr21:33432871/33432872(-):(spanning_paired_1:1,spanning_paired_2:1)&chr21:33414915/33414916(+)->chr21:33421509/33421510(-):(discordant_mates:4) diff --git a/tests/test_drdisco_integrate.py b/tests/test_drdisco_integrate.py index b3e82165..9db92021 100755 --- a/tests/test_drdisco_integrate.py +++ b/tests/test_drdisco_integrate.py @@ -51,7 +51,7 @@ def test_s041(self): for gtf_file in gtf_files: cl = DetectOutput(input_file) - cl.integrate(output_file, gtf_file) + cl.integrate(output_file, gtf_file, None) self.assertTrue(filecmp.cmp(test_file, output_file), msg="diff '" + test_file + "' '" + output_file + "':\n" + subprocess.Popen(['diff', test_file, output_file], stdout=subprocess.PIPE).stdout.read()) @@ -64,7 +64,7 @@ def test_s041_nocrash(self): output_file = T_TEST_DIR + "test_" + test_id + ".out.txt" cl = DetectOutput(input_file) - cl.integrate(output_file, gtf_file) + cl.integrate(output_file, gtf_file, None) 
self.assertTrue(filecmp.cmp(test_file, output_file), msg="diff '" + test_file + "' '" + output_file + "':\n" + subprocess.Popen(['diff', test_file, output_file], stdout=subprocess.PIPE).stdout.read()) @@ -77,7 +77,7 @@ def test_s041_no_gtf(self): output_file = T_TEST_DIR + "test_" + test_id + ".out.txt" cl = DetectOutput(input_file) - cl.integrate(output_file, gtf_file) + cl.integrate(output_file, gtf_file, None) self.assertTrue(filecmp.cmp(test_file, output_file), msg="diff '" + test_file + "' '" + output_file + "':\n" + subprocess.Popen(['diff', test_file, output_file], stdout=subprocess.PIPE).stdout.read()) @@ -94,7 +94,7 @@ def test_in_frame_non_hybrid_protein(self): for gtf_file in gtf_files: cl = DetectOutput(input_file) - cl.integrate(output_file, gtf_file) + cl.integrate(output_file, gtf_file, None) self.assertTrue(filecmp.cmp(test_file, output_file), msg="diff '" + test_file + "' '" + output_file + "':\n" + subprocess.Popen(['diff', test_file, output_file], stdout=subprocess.PIPE).stdout.read()) From 29256ea02efc1ae1a829af88d6e087ba15eba79d Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Mon, 8 Jan 2018 15:12:03 +0100 Subject: [PATCH 11/25] and functional test --- bin/dr-disco | 12 +++++---- tests/test_functional.py | 58 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 5 deletions(-) diff --git a/bin/dr-disco b/bin/dr-disco index 36ced9a2..c437df62 100755 --- a/bin/dr-disco +++ b/bin/dr-disco @@ -140,12 +140,14 @@ def CLI_classify(table_input_file, table_output_file, only_valid, blacklist_regi @click.argument('table_input_file', type=click.Path(exists=True)) @click.argument('table_output_file') @click.option('--gtf', help="Use gene annotation (GTF file)") -def CLI_integrate(table_input_file, table_output_file, gtf): +@click.option('--fasta', help="Use FASTA sequence file to estimate edit distances to splice junction motifs") +def CLI_integrate(table_input_file, table_output_file, gtf, fasta): cl = DetectOutput(table_input_file) 
- if gtf: - cl.integrate(table_output_file, str(gtf)) - else: - cl.integrate(table_output_file, None) + + gtf = str(gtf) if gtf else None + fasta = str(fasta) if fasta else None + + cl.integrate(table_output_file, gtf, fasta) if __name__ == '__main__': diff --git a/tests/test_functional.py b/tests/test_functional.py index 21967536..4a946dee 100755 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -254,5 +254,63 @@ def test_02_s041_no_gtf(self): self.assertTrue(filecmp.cmp(test_file, output_file), msg="diff '" + test_file + "' '" + output_file + "':\n" + subprocess.Popen(['diff', test_file, output_file], stdout=subprocess.PIPE).stdout.read()) + +class TestFunctional_integrate_splice_site_motif(unittest.TestCase): + def __get_temp_dirs(self): + TEST_DIR = "tests/splice_site_motif/" + T_TEST_DIR = "tmp/" + TEST_DIR + + if not os.path.exists(T_TEST_DIR): + os.makedirs(T_TEST_DIR) + + return TEST_DIR, T_TEST_DIR + + def test_sj_01(self): + TEST_DIR, T_TEST_DIR = self.__get_temp_dirs() + + test_id = 'splice_site_motif_01' + + + input_sam = TEST_DIR + "test_" + test_id + ".in.sam" + input_bam = T_TEST_DIR + "test_" + test_id + ".fixed.bam" + input_file = T_TEST_DIR + "test_" + test_id + ".dbed" + + gtf_file = None + fasta_file = TEST_DIR + "test_" + test_id + ".in.fa" + + output_file = T_TEST_DIR + "test_" + test_id + ".out.dbed" + test_file = TEST_DIR + "test_" + test_id + ".out.dbed" + + # sam -> fixed bam + command = ["bin/dr-disco", + "fix", + input_sam, + input_bam] + + self.assertEqual(subprocess.call(command), 0, msg=" ".join([str(x) for x in command])) + + # fixed bam -> dr-disco detect + command = ["bin/dr-disco", + "detect", + "-m", "0", + input_bam, + input_file] + + self.assertEqual(subprocess.call(command), 0, msg=" ".join([str(x) for x in command])) + + + # dr-disco-detect (skip classify) -> dr-disco integrate + command = ["bin/dr-disco", + "integrate", + "--fasta", fasta_file, + input_file, + output_file] + + 
self.assertEqual(subprocess.call(command), 0, msg=" ".join([str(x) for x in command])) + + self.assertTrue(filecmp.cmp(test_file, output_file), msg="diff '" + test_file + "' '" + output_file + "':\n" + subprocess.Popen(['diff', test_file, output_file], stdout=subprocess.PIPE).stdout.read()) + + + if __name__ == '__main__': main() From f0d93c08f9e2dc8c2e0fbcae19f1080e4c263983 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Mon, 8 Jan 2018 15:18:57 +0100 Subject: [PATCH 12/25] flake 8 --- bin/dr-disco | 4 ++-- drdisco/DetectOutput.py | 41 ++++++++++++++++----------------- drdisco/utils.py | 17 +++++++------- tests/test_functional.py | 14 ++++------- tests/test_splice_site_motif.py | 1 - 5 files changed, 36 insertions(+), 41 deletions(-) diff --git a/bin/dr-disco b/bin/dr-disco index c437df62..5a6a3b81 100755 --- a/bin/dr-disco +++ b/bin/dr-disco @@ -143,10 +143,10 @@ def CLI_classify(table_input_file, table_output_file, only_valid, blacklist_regi @click.option('--fasta', help="Use FASTA sequence file to estimate edit distances to splice junction motifs") def CLI_integrate(table_input_file, table_output_file, gtf, fasta): cl = DetectOutput(table_input_file) - + gtf = str(gtf) if gtf else None fasta = str(fasta) if fasta else None - + cl.integrate(table_output_file, gtf, fasta) diff --git a/drdisco/DetectOutput.py b/drdisco/DetectOutput.py index 9780dcc2..20b8c97b 100644 --- a/drdisco/DetectOutput.py +++ b/drdisco/DetectOutput.py @@ -196,18 +196,18 @@ def is_on_splice_junction_motif(self, fasta_fh): motif: -5' exon: +5' exon: [ ...{AC}{A}{G} ] {G}{T}{AG}{A}{G}{T} . . . {C}{A}{G} [ {G}... 
] """ - + pos5_in_exon_length = 3 pos5_post_exon_length = 6 - + pos3_pre_exon_length = 3 pos3_in_exon_length = 1 - + if self.donorA > self.donorB: pos5p = [self.chrA, self.posA, self.strandA] pos3p = [self.chrB, self.posB, self.strandB] @@ -216,28 +216,27 @@ def is_on_splice_junction_motif(self, fasta_fh): pos3p = [self.chrA, self.posA, self.strandA] else: pos5p = None - if pos5p: - sequences = dict( (s.name, s) for s in fasta_fh ) - + sequences = dict((s.name, s) for s in fasta_fh) + if pos5p[2] == '-': - seq_in_5p_exon = str(sequences[pos5p[0]][pos5p[1]-pos5_in_exon_length:pos5p[1]]).upper() - seq_post_5p_exon = str(sequences[pos5p[0]][pos5p[1]:pos5p[1]+pos5_post_exon_length]).upper() + seq_in_5p_exon = str(sequences[pos5p[0]][pos5p[1] - pos5_in_exon_length:pos5p[1]]).upper() + seq_post_5p_exon = str(sequences[pos5p[0]][pos5p[1]:pos5p[1] + pos5_post_exon_length]).upper() else: - seq_in_5p_exon = reverse_complement(str(sequences[pos5p[0]][pos5p[1]:pos5p[1]+pos5_in_exon_length])) - seq_post_5p_exon = reverse_complement(str(sequences[pos5p[0]][pos5p[1]-pos5_post_exon_length:pos5p[1]])) + seq_in_5p_exon = reverse_complement(str(sequences[pos5p[0]][pos5p[1]:pos5p[1] + pos5_in_exon_length])) + seq_post_5p_exon = reverse_complement(str(sequences[pos5p[0]][pos5p[1] - pos5_post_exon_length:pos5p[1]])) - if pos3p[2] == '+': - seq_pre_3p_exon = str(sequences[pos3p[0]][pos3p[1]-pos3_pre_exon_length:pos3p[1]]).upper() - seq_in_3p_exon = str(sequences[pos3p[0]][pos3p[1]:pos3p[1]+pos3_in_exon_length]).upper() + if pos3p[2] == ' + ': + seq_pre_3p_exon = str(sequences[pos3p[0]][pos3p[1] - pos3_pre_exon_length:pos3p[1]]).upper() + seq_in_3p_exon = str(sequences[pos3p[0]][pos3p[1]:pos3p[1] + pos3_in_exon_length]).upper() else: - seq_in_3p_exon = reverse_complement(str(sequences[pos3p[0]][pos3p[1]-pos3_in_exon_length:pos3p[1]])) - seq_pre_3p_exon = reverse_complement(str(sequences[pos3p[0]][pos3p[1]:pos3p[1]+pos3_pre_exon_length])) + seq_in_3p_exon = 
reverse_complement(str(sequences[pos3p[0]][pos3p[1] - pos3_in_exon_length:pos3p[1]])) + seq_pre_3p_exon = reverse_complement(str(sequences[pos3p[0]][pos3p[1]:pos3p[1] + pos3_pre_exon_length])) def calc_dist(pat, subseq): d = 0 - + if len(pat) != len(subseq): raise Exception("invalid pattern size") for i in range(len(pat)): @@ -245,9 +244,9 @@ def calc_dist(pat, subseq): d += 1 return d - - # print "[ ... " + seq_in_5p_exon + " ] " + seq_post_5p_exon + " ... ... " + seq_pre_3p_exon + " [ " + seq_in_3p_exon + " ... ]" , - dist = calc_dist(["AC","A","G"], seq_in_5p_exon) + calc_dist(["G","T","AG","A","G","T" ], seq_post_5p_exon) + calc_dist(["C","A","G"], seq_pre_3p_exon) + calc_dist(["G"], seq_in_3p_exon) + + # print "[ ... " + seq_in_5p_exon + " ] " + seq_post_5p_exon + " ... ... " + seq_pre_3p_exon + " [ " + seq_in_3p_exon + " ... ]" , + dist = calc_dist(["AC", "A", "G"], seq_in_5p_exon) + calc_dist(["G", "T", "AG", "A", "G", "T"], seq_post_5p_exon) + calc_dist(["C", "A", "G"], seq_pre_3p_exon) + calc_dist(["G"], seq_in_3p_exon) self.edit_dist_to_splice_motif = str(dist) return dist @@ -465,7 +464,7 @@ def insert_in_index(index, entries, score): # index used to annotate gene names: TMPRSS2->ERG gene_annotation = GeneAnnotation(gtf_file) dfs = DetectFrameShifts(gtf_file) if gtf_file else None - + ffs = HTSeq.FastaReader(fasta_file) if fasta_file else None intronic_linear = [] diff --git a/drdisco/utils.py b/drdisco/utils.py index c9c24b23..5c2a003f 100644 --- a/drdisco/utils.py +++ b/drdisco/utils.py @@ -2,16 +2,17 @@ # *- coding: utf-8 -*- # vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4 textwidth=79: -alt_map = {'ins':'0'} -complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'} +alt_map = {'ins': '0'} +complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'} + def reverse_complement(seq): seq = seq.upper() - for k,v in alt_map.iteritems(): - seq = seq.replace(k,v) - bases = list(seq) - bases = reversed([complement.get(base,base) for base in bases]) + for k, v 
in alt_map.iteritems(): + seq = seq.replace(k, v) + bases = list(seq) + bases = reversed([complement.get(base, base) for base in bases]) bases = ''.join(bases) - for k,v in alt_map.iteritems(): - bases = bases.replace(v,k) + for k, v in alt_map.iteritems(): + bases = bases.replace(v, k) return bases diff --git a/tests/test_functional.py b/tests/test_functional.py index 4a946dee..0b82ff0d 100755 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -254,7 +254,6 @@ def test_02_s041_no_gtf(self): self.assertTrue(filecmp.cmp(test_file, output_file), msg="diff '" + test_file + "' '" + output_file + "':\n" + subprocess.Popen(['diff', test_file, output_file], stdout=subprocess.PIPE).stdout.read()) - class TestFunctional_integrate_splice_site_motif(unittest.TestCase): def __get_temp_dirs(self): TEST_DIR = "tests/splice_site_motif/" @@ -270,12 +269,11 @@ def test_sj_01(self): test_id = 'splice_site_motif_01' - input_sam = TEST_DIR + "test_" + test_id + ".in.sam" input_bam = T_TEST_DIR + "test_" + test_id + ".fixed.bam" input_file = T_TEST_DIR + "test_" + test_id + ".dbed" - gtf_file = None + # gtf_file = None fasta_file = TEST_DIR + "test_" + test_id + ".in.fa" output_file = T_TEST_DIR + "test_" + test_id + ".out.dbed" @@ -298,19 +296,17 @@ def test_sj_01(self): self.assertEqual(subprocess.call(command), 0, msg=" ".join([str(x) for x in command])) - # dr-disco-detect (skip classify) -> dr-disco integrate command = ["bin/dr-disco", - "integrate", - "--fasta", fasta_file, - input_file, - output_file] + "integrate", + "--fasta", fasta_file, + input_file, + output_file] self.assertEqual(subprocess.call(command), 0, msg=" ".join([str(x) for x in command])) self.assertTrue(filecmp.cmp(test_file, output_file), msg="diff '" + test_file + "' '" + output_file + "':\n" + subprocess.Popen(['diff', test_file, output_file], stdout=subprocess.PIPE).stdout.read()) - if __name__ == '__main__': main() diff --git a/tests/test_splice_site_motif.py b/tests/test_splice_site_motif.py 
index 1e8ae8f5..42738f53 100755 --- a/tests/test_splice_site_motif.py +++ b/tests/test_splice_site_motif.py @@ -72,7 +72,6 @@ def test_sj_01(self): self.assertTrue(filecmp.cmp(test_file, output_file), msg="diff '" + test_file + "' '" + output_file + "':\n" + subprocess.Popen(['diff', test_file, output_file], stdout=subprocess.PIPE).stdout.read()) - def test_sj_02(self): test_id = 'splice_site_motif_02' From bc9061fc905d1473f9931b59347ed5b071078525 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Mon, 8 Jan 2018 16:24:39 +0100 Subject: [PATCH 13/25] use decent working pyfaidx over htseq for random access to fasta files --- drdisco/DetectOutput.py | 50 +++++++++++++++++++++-------------------- requirements.txt | 1 + 2 files changed, 27 insertions(+), 24 deletions(-) diff --git a/drdisco/DetectOutput.py b/drdisco/DetectOutput.py index 20b8c97b..85a8e7fb 100644 --- a/drdisco/DetectOutput.py +++ b/drdisco/DetectOutput.py @@ -9,6 +9,7 @@ from drdisco.utils import reverse_complement import gzip import HTSeq +from pyfaidx import Fasta """[License: GNU General Public License v3 (GPLv3)] @@ -218,37 +219,38 @@ def is_on_splice_junction_motif(self, fasta_fh): pos5p = None if pos5p: - sequences = dict((s.name, s) for s in fasta_fh) - if pos5p[2] == '-': - seq_in_5p_exon = str(sequences[pos5p[0]][pos5p[1] - pos5_in_exon_length:pos5p[1]]).upper() - seq_post_5p_exon = str(sequences[pos5p[0]][pos5p[1]:pos5p[1] + pos5_post_exon_length]).upper() + seq_in_5p_exon = str(fasta_fh[pos5p[0]][pos5p[1] - pos5_in_exon_length:pos5p[1]]).upper() + seq_post_5p_exon = str(fasta_fh[pos5p[0]][pos5p[1]:pos5p[1] + pos5_post_exon_length]).upper() else: - seq_in_5p_exon = reverse_complement(str(sequences[pos5p[0]][pos5p[1]:pos5p[1] + pos5_in_exon_length])) - seq_post_5p_exon = reverse_complement(str(sequences[pos5p[0]][pos5p[1] - pos5_post_exon_length:pos5p[1]])) + seq_in_5p_exon = reverse_complement(str(fasta_fh[pos5p[0]][pos5p[1]:pos5p[1] + pos5_in_exon_length])) + seq_post_5p_exon = 
reverse_complement(str(fasta_fh[pos5p[0]][pos5p[1] - pos5_post_exon_length:pos5p[1]])) - if pos3p[2] == ' + ': - seq_pre_3p_exon = str(sequences[pos3p[0]][pos3p[1] - pos3_pre_exon_length:pos3p[1]]).upper() - seq_in_3p_exon = str(sequences[pos3p[0]][pos3p[1]:pos3p[1] + pos3_in_exon_length]).upper() + if pos3p[2] == '+': + seq_pre_3p_exon = str(fasta_fh[pos3p[0]][pos3p[1] - pos3_pre_exon_length:pos3p[1]]).upper() + seq_in_3p_exon = str(fasta_fh[pos3p[0]][pos3p[1]:pos3p[1] + pos3_in_exon_length]).upper() else: - seq_in_3p_exon = reverse_complement(str(sequences[pos3p[0]][pos3p[1] - pos3_in_exon_length:pos3p[1]])) - seq_pre_3p_exon = reverse_complement(str(sequences[pos3p[0]][pos3p[1]:pos3p[1] + pos3_pre_exon_length])) + seq_in_3p_exon = reverse_complement(str(fasta_fh[pos3p[0]][pos3p[1] - pos3_in_exon_length:pos3p[1]])) + seq_pre_3p_exon = reverse_complement(str(fasta_fh[pos3p[0]][pos3p[1]:pos3p[1] + pos3_pre_exon_length])) + + def calc_dist(pat, subseq): + d = 0 - def calc_dist(pat, subseq): - d = 0 + if len(pat) != len(subseq): + raise Exception("invalid pattern size") + for i in range(len(pat)): + if subseq[i] not in pat[i]: + d += 1 - if len(pat) != len(subseq): - raise Exception("invalid pattern size") - for i in range(len(pat)): - if subseq[i] not in pat[i]: - d += 1 + return d - return d + dist = calc_dist(["AC", "A", "G"], seq_in_5p_exon) + calc_dist(["G", "T", "AG", "A", "G", "T"], seq_post_5p_exon) + calc_dist(["C", "A", "G"], seq_pre_3p_exon) + calc_dist(["G"], seq_in_3p_exon) + # print "[ ... " + seq_in_5p_exon + " ] " + seq_post_5p_exon + " ... ... " + seq_pre_3p_exon + " [ " + seq_in_3p_exon + " ... ] ---> " + str(dist) + self.edit_dist_to_splice_motif = str(dist) - # print "[ ... " + seq_in_5p_exon + " ] " + seq_post_5p_exon + " ... ... " + seq_pre_3p_exon + " [ " + seq_in_3p_exon + " ... 
]" , - dist = calc_dist(["AC", "A", "G"], seq_in_5p_exon) + calc_dist(["G", "T", "AG", "A", "G", "T"], seq_post_5p_exon) + calc_dist(["C", "A", "G"], seq_pre_3p_exon) + calc_dist(["G"], seq_in_3p_exon) - self.edit_dist_to_splice_motif = str(dist) - return dist + return dist + else: + return "" def __str__(self): line = self.line @@ -465,7 +467,7 @@ def insert_in_index(index, entries, score): gene_annotation = GeneAnnotation(gtf_file) dfs = DetectFrameShifts(gtf_file) if gtf_file else None - ffs = HTSeq.FastaReader(fasta_file) if fasta_file else None + ffs = Fasta(fasta_file) if fasta_file else None intronic_linear = [] remainder = [] diff --git a/requirements.txt b/requirements.txt index 6abd4a0f..f45ae29a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,4 @@ HTSeq==0.6.1 numpy pysam==0.10.0 scipy +pyfaidx==0.5.1 From f60ee6640d2dc000e1c437dd63bbc106e73cdeec Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 9 Jan 2018 11:33:48 +0100 Subject: [PATCH 14/25] utilitized --- drdisco/DetectOutput.py | 11 +---------- drdisco/utils.py | 13 +++++++++++++ 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/drdisco/DetectOutput.py b/drdisco/DetectOutput.py index 85a8e7fb..bdc8038d 100644 --- a/drdisco/DetectOutput.py +++ b/drdisco/DetectOutput.py @@ -6,7 +6,7 @@ from drdisco import log from drdisco.DetectFrameShifts import DetectFrameShifts -from drdisco.utils import reverse_complement +from drdisco.utils import reverse_complement, is_gzip import gzip import HTSeq from pyfaidx import Fasta @@ -39,15 +39,6 @@ """ -def is_gzip(filename): - try: - f = gzip.GzipFile(filename, 'rb') - f.read() - return True - except Exception: - return False - - class DetectOutputEntry: def __init__(self, line_in_results_file): self.line = line_in_results_file.strip().split("\t") diff --git a/drdisco/utils.py b/drdisco/utils.py index 5c2a003f..71359065 100644 --- a/drdisco/utils.py +++ b/drdisco/utils.py @@ -2,6 +2,10 @@ # *- coding: utf-8 -*- # vim: set expandtab 
tabstop=4 shiftwidth=4 softtabstop=4 textwidth=79: + +import gzip + + alt_map = {'ins': '0'} complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'} @@ -16,3 +20,12 @@ def reverse_complement(seq): for k, v in alt_map.iteritems(): bases = bases.replace(v, k) return bases + + +def is_gzip(filename): + try: + f = gzip.GzipFile(filename, 'rb') + f.read() + return True + except Exception: + return False From fa921871a553b2e81e676209b8908493fcfb5950 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 9 Jan 2018 12:51:24 +0100 Subject: [PATCH 15/25] more through testing of frame shifts --- drdisco/DetectFrameShifts.py | 7 ++ drdisco/DetectOutput.py | 46 ++++++----- .../test_frameshift-prediction_01.in.dbed | 3 + .../test_frameshift-prediction_01.out.txt | 0 tests/test_functional.py | 80 +++++++++++++++++++ 5 files changed, 114 insertions(+), 22 deletions(-) create mode 100644 tests/integrate/test_frameshift-prediction_01.in.dbed create mode 100644 tests/integrate/test_frameshift-prediction_01.out.txt diff --git a/drdisco/DetectFrameShifts.py b/drdisco/DetectFrameShifts.py index c52b2a02..bc5a4f45 100644 --- a/drdisco/DetectFrameShifts.py +++ b/drdisco/DetectFrameShifts.py @@ -189,7 +189,14 @@ def evaluate(self, _from, _to, offset): """ Offset may be convenient because STAR sometimes has problems aligning/clipping the first 2 bases after an exon Values of 4 and larger do not make sense. 
+ + Args: + _from ([chr, pos, strand]): donor break position + _to ([chr, pos, strand]): acceptor position position + """ + print _from , " --> " , _to + from_l_fgd = [] to_l_fgd = [] diff --git a/drdisco/DetectOutput.py b/drdisco/DetectOutput.py index bdc8038d..7b1f89d1 100644 --- a/drdisco/DetectOutput.py +++ b/drdisco/DetectOutput.py @@ -41,7 +41,7 @@ class DetectOutputEntry: def __init__(self, line_in_results_file): - self.line = line_in_results_file.strip().split("\t") + self.line = line_in_results_file.strip("\r\n").split("\t") self.parse() def parse(self): @@ -146,14 +146,15 @@ def parse(self): def get_donors_acceptors(self, gtf_file): idx = {} for a in self.structure.split('&'): - for b in a.split(':', 3)[3].strip('()').split(','): - c = b.split(':') - c[0] = c[0].replace('_1', '_[12]').replace('_2', '_[12]') - if c[0] != 'discordant_mates': - if c[0] not in idx: - idx[c[0]] = 0 + if a != '': + for b in a.split(':', 3)[3].strip('()').split(','): + c = b.split(':') + c[0] = c[0].replace('_1', '_[12]').replace('_2', '_[12]') + if c[0] != 'discordant_mates': + if c[0] not in idx: + idx[c[0]] = 0 - idx[c[0]] += int(c[1]) + idx[c[0]] += int(c[1]) def pos_to_gene_str(pos_chr, pos_pos): if pos_chr[0:3] == 'chr': @@ -482,24 +483,25 @@ def insert_in_index(index, entries, score): frameshifts_2 = [x[0][0] + '(+' + str(x[0][1]) + ')->' + x[1][0] + '(+' + str(x[1][1]) + ')' for x in frame_shifts[2]] for additional_breaks in e.structure.split('&'): - params = additional_breaks.split(':(') - n_split_reads = sum([int(x.split(':')[1]) for x in params[1].rstrip(')').split(',') if x.split(':')[0] != 'discordant_mates']) + if additional_breaks != '': + params = additional_breaks.split(':(') + n_split_reads = sum([int(x.split(':')[1]) for x in params[1].rstrip(')').split(',') if x.split(':')[0] != 'discordant_mates']) - posAB = params[0].split(':') - posA, posB = int(posAB[1].split('/')[0]), int(posAB[2].split('/')[0]) + posAB = params[0].split(':') + posA, posB = 
int(posAB[1].split('/')[0]), int(posAB[2].split('/')[0]) - if params[0] not in done_breaks and n_split_reads > 0: - if e.donorA > e.donorB: - frame_shifts = dfs.evaluate([e.chrA, posA, e.RNAstrandA], [e.chrB, posB, e.RNAstrandB], 2) - else: - frame_shifts = dfs.evaluate([e.chrB, posB, e.RNAstrandB], [e.chrA, posA, e.RNAstrandA], 2) + if params[0] not in done_breaks and n_split_reads > 0: + if e.donorA > e.donorB: + frame_shifts = dfs.evaluate([e.chrA, posA, e.RNAstrandA], [e.chrB, posB, e.RNAstrandB], 2) + else: + frame_shifts = dfs.evaluate([e.chrB, posB, e.RNAstrandB], [e.chrA, posA, e.RNAstrandA], 2) - fgd += [x[0] + '->' + x[1] for x in frame_shifts['fgd']] - frameshifts_0 += [x[0][0] + '->' + x[1][0] for x in frame_shifts[0]] - frameshifts_1 += [x[0][0] + '(+' + str(x[0][1]) + ')->' + x[1][0] + '(+' + str(x[1][1]) + ')' for x in frame_shifts[1]] - frameshifts_2 += [x[0][0] + '(+' + str(x[0][1]) + ')->' + x[1][0] + '(+' + str(x[1][1]) + ')' for x in frame_shifts[2]] + fgd += [x[0] + '->' + x[1] for x in frame_shifts['fgd']] + frameshifts_0 += [x[0][0] + '->' + x[1][0] for x in frame_shifts[0]] + frameshifts_1 += [x[0][0] + '(+' + str(x[0][1]) + ')->' + x[1][0] + '(+' + str(x[1][1]) + ')' for x in frame_shifts[1]] + frameshifts_2 += [x[0][0] + '(+' + str(x[0][1]) + ')->' + x[1][0] + '(+' + str(x[1][1]) + ')' for x in frame_shifts[2]] - done_breaks.add(params[0]) + done_breaks.add(params[0]) e.fgd = ','.join(sorted(list(set(fgd)))) e.frameshift_0 = ','.join(sorted(list(set(frameshifts_0)))) diff --git a/tests/integrate/test_frameshift-prediction_01.in.dbed b/tests/integrate/test_frameshift-prediction_01.in.dbed new file mode 100644 index 00000000..c21a1d98 --- /dev/null +++ b/tests/integrate/test_frameshift-prediction_01.in.dbed @@ -0,0 +1,3 @@ +chr-A pos-A direction-A pos-A-acceptor pos-A-donor chr-B pos-B direction-B pos-A-acceptor pos-A-donor genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads 
alignment-score mismatches n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps lr-A-slope lr-A-intercept lr-A-rvalue lr-A-pvalue lr-A-stderr lr-B-slope lr-B-intercept lr-B-rvalue lr-B-pvalue lr-B-stderr disco/split clips/score nodes/edge median-AS-A median-AS-B max-AS-A max-AS-B data-structure +chr1 1035203 - 0 200 chr1 999610 - 200 0 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 +1 1035203 - 0 200 1 999610 - 200 0 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 diff --git a/tests/integrate/test_frameshift-prediction_01.out.txt b/tests/integrate/test_frameshift-prediction_01.out.txt new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_functional.py b/tests/test_functional.py index 0b82ff0d..d0a85b0c 100755 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -254,6 +254,86 @@ def test_02_s041_no_gtf(self): self.assertTrue(filecmp.cmp(test_file, output_file), msg="diff '" + test_file + "' '" + output_file + "':\n" + subprocess.Popen(['diff', test_file, output_file], stdout=subprocess.PIPE).stdout.read()) +class TestFrameShiftPrediction(unittest.TestCase): + def __get_temp_dirs(self): + TEST_DIR = "tests/integrate/" + T_TEST_DIR = "tmp/" + TEST_DIR + + if not os.path.exists(T_TEST_DIR): + os.makedirs(T_TEST_DIR) + + return TEST_DIR, T_TEST_DIR + + def test_01(self): # example of in-frame fusion - strands are RNA strand + TEST_DIR, T_TEST_DIR = self.__get_temp_dirs() + + test_id = 'frameshift-prediction_01' + + + # both do have their DNA strand at minus!! 
: + # + # <=(-)=| acceptor in negative strand at RNA + # =====(+)=====>| donor in positive strand at RNA + # + # donor acceptor + # fusions = ['chr1', 1035203, '+'], ['chr1', 999610, '-']) + # , (['1', 1035203, '+'], ['1', 999610, '-'])] # strands are at RNA level, and gene order is DONOR, ACCEPTOR + # + # + input_file = TEST_DIR + "test_" + test_id + ".in.dbed" + test_file = TEST_DIR + "test_" + test_id + ".out.txt" + output_file = T_TEST_DIR + "test_" + test_id + ".out.txt" + + gtf_files = [TEST_DIR + 'frameshift_example.gtf', TEST_DIR + 'frameshift_example.no_chr_prefix.gtf'] + for gtf_file in gtf_files: + command = ["bin/dr-disco", + "integrate", + "--gtf", gtf_file, + input_file, + output_file] + + self.assertEqual(subprocess.call(command), 0, msg=" ".join([str(x) for x in command])) + + self.assertTrue(filecmp.cmp(test_file, output_file), msg="diff '" + test_file + "' '" + output_file + "':\n" + subprocess.Popen(['diff', test_file, output_file], stdout=subprocess.PIPE).stdout.read()) + + # dfs = DetectFrameShifts(gtf_file) + # frameshift_annotation = dfs.evaluate(fusion[0], fusion[1], 0) + # self.assertEqual(str(frameshift_annotation[0]), "[(('AGRN(ENST00000620552.4)-ensembl', 0), ('HES4(ENST00000304952.10)-ensembl_havana', 0))]") + # self.assertEqual(len(frameshift_annotation[1]), 0) + # self.assertEqual(len(frameshift_annotation[2]), 0) + + def test_02(self): # 0, +2 + TEST_DIR, T_TEST_DIR = self.__get_temp_dirs() + + test_id = 'frameshift-prediction_02' + + # fusions = [(['chr1', 1035203, '+'], ['chr1', 999020, '-']), (['1', 1035203, '+'], ['1', 999020, '-'])] # (from), (to) and strands are at RNA level! 
+ # for fusion in fusions: + + # gtf_files = [TEST_DIR + 'frameshift_example.gtf', TEST_DIR + 'frameshift_example.no_chr_prefix.gtf'] + # for gtf_file in gtf_files: + # dfs = DetectFrameShifts(gtf_file) + # frameshift_annotation = dfs.evaluate(fusion[0], fusion[1], 0) + # self.assertEqual(len(frameshift_annotation[0]), 0) + # self.assertEqual(len(frameshift_annotation[1]), 0) + # self.assertEqual(str(frameshift_annotation[2]), "[(('AGRN(ENST00000620552.4)-ensembl', 0), ('HES4(ENST00000304952.10)-ensembl_havana', 2))]") + + def test_03(self): # +1, +2 -> 0 + TEST_DIR, T_TEST_DIR = self.__get_temp_dirs() + + test_id = 'frameshift-prediction_02' + # fusions = [(['chr1', 1040604, '+'], ['chr1', 999020, '-']), (['1', 1040604, '+'], ['1', 999020, '-'])] + # gtf_files = [TEST_DIR + 'frameshift_example.gtf', TEST_DIR + 'frameshift_example.no_chr_prefix.gtf'] + + # for fusion in fusions: + # for gtf_file in gtf_files: + # dfs = DetectFrameShifts(gtf_file) + # frameshift_annotation = dfs.evaluate(fusion[0], fusion[1], 0) + # self.assertEqual(str(frameshift_annotation[0]), "[(('AGRN(ENST00000620552.4)-ensembl', 1), ('HES4(ENST00000304952.10)-ensembl_havana', 2))]") + # self.assertEqual(len(frameshift_annotation[1]), 0) + # self.assertEqual(len(frameshift_annotation[2]), 0) + + class TestFunctional_integrate_splice_site_motif(unittest.TestCase): def __get_temp_dirs(self): TEST_DIR = "tests/splice_site_motif/" From 2b3664217d565fb0fda88b057a27648278384efb Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 9 Jan 2018 15:29:20 +0100 Subject: [PATCH 16/25] seems to work quite ok --- .../test_frameshift-prediction_01.in.dbed | 2 + .../test_frameshift-prediction_01.out.txt | 3 + .../test_frameshift-prediction_02.in.dbed | 5 ++ .../test_frameshift-prediction_02.out.txt | 3 + .../test_frameshift-prediction_03.in.dbed | 5 ++ .../test_frameshift-prediction_03.out.txt | 0 tests/test_functional.py | 65 ++++++++++++------- 7 files changed, 61 insertions(+), 22 deletions(-) create mode 
100644 tests/integrate/test_frameshift-prediction_02.in.dbed create mode 100644 tests/integrate/test_frameshift-prediction_02.out.txt create mode 100644 tests/integrate/test_frameshift-prediction_03.in.dbed create mode 100644 tests/integrate/test_frameshift-prediction_03.out.txt diff --git a/tests/integrate/test_frameshift-prediction_01.in.dbed b/tests/integrate/test_frameshift-prediction_01.in.dbed index c21a1d98..a01729e3 100644 --- a/tests/integrate/test_frameshift-prediction_01.in.dbed +++ b/tests/integrate/test_frameshift-prediction_01.in.dbed @@ -1,3 +1,5 @@ chr-A pos-A direction-A pos-A-acceptor pos-A-donor chr-B pos-B direction-B pos-A-acceptor pos-A-donor genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads alignment-score mismatches n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps lr-A-slope lr-A-intercept lr-A-rvalue lr-A-pvalue lr-A-stderr lr-B-slope lr-B-intercept lr-B-rvalue lr-B-pvalue lr-B-stderr disco/split clips/score nodes/edge median-AS-A median-AS-B max-AS-A max-AS-B data-structure chr1 1035203 - 0 200 chr1 999610 - 200 0 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 +chr1 999610 - 200 0 chr1 1035203 - 0 200 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 1 1035203 - 0 200 1 999610 - 200 0 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 +1 999610 - 200 0 1 1035203 - 0 200 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 
50 50 50 diff --git a/tests/integrate/test_frameshift-prediction_01.out.txt b/tests/integrate/test_frameshift-prediction_01.out.txt index e69de29b..794427aa 100644 --- a/tests/integrate/test_frameshift-prediction_01.out.txt +++ b/tests/integrate/test_frameshift-prediction_01.out.txt @@ -0,0 +1,3 @@ +shared-id fusion chr-A pos-A direction-A pos-A-acceptor pos-A-donor chr-B pos-B direction-B pos-A-acceptor pos-A-donor genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads alignment-score mismatches n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps lr-A-slope lr-A-intercept lr-A-rvalue lr-A-pvalue lr-A-stderr lr-B-slope lr-B-intercept lr-B-rvalue lr-B-pvalue lr-B-stderr disco/split clips/score nodes/edge full-gene-dysregulation frameshift=0 frameshift=+1 frameshift=+2 splice-motif-edit-distance median-AS-A median-AS-B max-AS-A max-AS-B data-structure +1 chr1:1035203->chr1:999610 chr1 1035203 - 0 200 chr1 999610 - 200 0 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 AGRN(ENST00000620552.4)-ensembl->HES4(ENST00000304952.10)-ensembl_havana 50 50 50 50 +2 1:1035203->1:999610 1 1035203 - 0 200 1 999610 - 200 0 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 AGRN(ENST00000620552.4)-ensembl->HES4(ENST00000304952.10)-ensembl_havana 50 50 50 50 diff --git a/tests/integrate/test_frameshift-prediction_02.in.dbed b/tests/integrate/test_frameshift-prediction_02.in.dbed new file mode 100644 index 00000000..2ea6cec2 --- /dev/null +++ b/tests/integrate/test_frameshift-prediction_02.in.dbed @@ -0,0 +1,5 @@ +chr-A pos-A direction-A pos-A-acceptor pos-A-donor chr-B pos-B direction-B pos-A-acceptor pos-A-donor genomic-distance 
filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads alignment-score mismatches n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps lr-A-slope lr-A-intercept lr-A-rvalue lr-A-pvalue lr-A-stderr lr-B-slope lr-B-intercept lr-B-rvalue lr-B-pvalue lr-B-stderr disco/split clips/score nodes/edge median-AS-A median-AS-B max-AS-A max-AS-B data-structure +chr1 1035203 - 0 200 chr1 999020 - 200 0 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 +chr1 999020 - 200 0 chr1 1035203 - 0 200 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 +1 1035203 - 0 200 1 999020 - 200 0 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 +1 999020 - 200 0 1 1035203 - 0 200 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 diff --git a/tests/integrate/test_frameshift-prediction_02.out.txt b/tests/integrate/test_frameshift-prediction_02.out.txt new file mode 100644 index 00000000..7ba5df6c --- /dev/null +++ b/tests/integrate/test_frameshift-prediction_02.out.txt @@ -0,0 +1,3 @@ +shared-id fusion chr-A pos-A direction-A pos-A-acceptor pos-A-donor chr-B pos-B direction-B pos-A-acceptor pos-A-donor genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads alignment-score mismatches n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps lr-A-slope lr-A-intercept lr-A-rvalue 
lr-A-pvalue lr-A-stderr lr-B-slope lr-B-intercept lr-B-rvalue lr-B-pvalue lr-B-stderr disco/split clips/score nodes/edge full-gene-dysregulation frameshift=0 frameshift=+1 frameshift=+2 splice-motif-edit-distance median-AS-A median-AS-B max-AS-A max-AS-B data-structure +1 chr1:1035203->chr1:999020 chr1 1035203 - 0 200 chr1 999020 - 200 0 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 AGRN(ENST00000620552.4)-ensembl(+0)->HES4(ENST00000304952.10)-ensembl_havana(+2) 50 50 50 50 +2 1:1035203->1:999020 1 1035203 - 0 200 1 999020 - 200 0 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 AGRN(ENST00000620552.4)-ensembl(+0)->HES4(ENST00000304952.10)-ensembl_havana(+2) 50 50 50 50 diff --git a/tests/integrate/test_frameshift-prediction_03.in.dbed b/tests/integrate/test_frameshift-prediction_03.in.dbed new file mode 100644 index 00000000..4811ba69 --- /dev/null +++ b/tests/integrate/test_frameshift-prediction_03.in.dbed @@ -0,0 +1,5 @@ +chr-A pos-A direction-A pos-A-acceptor pos-A-donor chr-B pos-B direction-B pos-A-acceptor pos-A-donor genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads alignment-score mismatches n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps lr-A-slope lr-A-intercept lr-A-rvalue lr-A-pvalue lr-A-stderr lr-B-slope lr-B-intercept lr-B-rvalue lr-B-pvalue lr-B-stderr disco/split clips/score nodes/edge median-AS-A median-AS-B max-AS-A max-AS-B data-structure +chr1 1040604 - 0 200 chr1 999020 - 200 0 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 +chr1 999020 - 200 0 chr1 
1040604 - 0 200 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 +1 1040604 - 0 200 1 999020 - 200 0 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 +1 999020 - 200 0 1 1040604 - 0 200 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 diff --git a/tests/integrate/test_frameshift-prediction_03.out.txt b/tests/integrate/test_frameshift-prediction_03.out.txt new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_functional.py b/tests/test_functional.py index d0a85b0c..ffafa2cc 100755 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -269,7 +269,6 @@ def test_01(self): # example of in-frame fusion - strands are RNA strand test_id = 'frameshift-prediction_01' - # both do have their DNA strand at minus!! 
: # # <=(-)=| acceptor in negative strand at RNA @@ -278,8 +277,7 @@ def test_01(self): # example of in-frame fusion - strands are RNA strand # donor acceptor # fusions = ['chr1', 1035203, '+'], ['chr1', 999610, '-']) # , (['1', 1035203, '+'], ['1', 999610, '-'])] # strands are at RNA level, and gene order is DONOR, ACCEPTOR - # - # + input_file = TEST_DIR + "test_" + test_id + ".in.dbed" test_file = TEST_DIR + "test_" + test_id + ".out.txt" output_file = T_TEST_DIR + "test_" + test_id + ".out.txt" @@ -296,8 +294,7 @@ def test_01(self): # example of in-frame fusion - strands are RNA strand self.assertTrue(filecmp.cmp(test_file, output_file), msg="diff '" + test_file + "' '" + output_file + "':\n" + subprocess.Popen(['diff', test_file, output_file], stdout=subprocess.PIPE).stdout.read()) - # dfs = DetectFrameShifts(gtf_file) - # frameshift_annotation = dfs.evaluate(fusion[0], fusion[1], 0) + # must statisfy: # self.assertEqual(str(frameshift_annotation[0]), "[(('AGRN(ENST00000620552.4)-ensembl', 0), ('HES4(ENST00000304952.10)-ensembl_havana', 0))]") # self.assertEqual(len(frameshift_annotation[1]), 0) # self.assertEqual(len(frameshift_annotation[2]), 0) @@ -306,14 +303,25 @@ def test_02(self): # 0, +2 TEST_DIR, T_TEST_DIR = self.__get_temp_dirs() test_id = 'frameshift-prediction_02' - + # fusions = [(['chr1', 1035203, '+'], ['chr1', 999020, '-']), (['1', 1035203, '+'], ['1', 999020, '-'])] # (from), (to) and strands are at RNA level! 
- # for fusion in fusions: - - # gtf_files = [TEST_DIR + 'frameshift_example.gtf', TEST_DIR + 'frameshift_example.no_chr_prefix.gtf'] - # for gtf_file in gtf_files: - # dfs = DetectFrameShifts(gtf_file) - # frameshift_annotation = dfs.evaluate(fusion[0], fusion[1], 0) + input_file = TEST_DIR + "test_" + test_id + ".in.dbed" + test_file = TEST_DIR + "test_" + test_id + ".out.txt" + output_file = T_TEST_DIR + "test_" + test_id + ".out.txt" + + gtf_files = [TEST_DIR + 'frameshift_example.gtf', TEST_DIR + 'frameshift_example.no_chr_prefix.gtf'] + for gtf_file in gtf_files: + command = ["bin/dr-disco", + "integrate", + "--gtf", gtf_file, + input_file, + output_file] + + self.assertEqual(subprocess.call(command), 0, msg=" ".join([str(x) for x in command])) + + self.assertTrue(filecmp.cmp(test_file, output_file), msg="diff '" + test_file + "' '" + output_file + "':\n" + subprocess.Popen(['diff', test_file, output_file], stdout=subprocess.PIPE).stdout.read()) + + # must statisfy: # self.assertEqual(len(frameshift_annotation[0]), 0) # self.assertEqual(len(frameshift_annotation[1]), 0) # self.assertEqual(str(frameshift_annotation[2]), "[(('AGRN(ENST00000620552.4)-ensembl', 0), ('HES4(ENST00000304952.10)-ensembl_havana', 2))]") @@ -321,17 +329,30 @@ def test_02(self): # 0, +2 def test_03(self): # +1, +2 -> 0 TEST_DIR, T_TEST_DIR = self.__get_temp_dirs() - test_id = 'frameshift-prediction_02' + test_id = 'frameshift-prediction_03' + # fusions = [(['chr1', 1040604, '+'], ['chr1', 999020, '-']), (['1', 1040604, '+'], ['1', 999020, '-'])] - # gtf_files = [TEST_DIR + 'frameshift_example.gtf', TEST_DIR + 'frameshift_example.no_chr_prefix.gtf'] - - # for fusion in fusions: - # for gtf_file in gtf_files: - # dfs = DetectFrameShifts(gtf_file) - # frameshift_annotation = dfs.evaluate(fusion[0], fusion[1], 0) - # self.assertEqual(str(frameshift_annotation[0]), "[(('AGRN(ENST00000620552.4)-ensembl', 1), ('HES4(ENST00000304952.10)-ensembl_havana', 2))]") - # 
self.assertEqual(len(frameshift_annotation[1]), 0) - # self.assertEqual(len(frameshift_annotation[2]), 0) + + input_file = TEST_DIR + "test_" + test_id + ".in.dbed" + test_file = TEST_DIR + "test_" + test_id + ".out.txt" + output_file = T_TEST_DIR + "test_" + test_id + ".out.txt" + + gtf_files = [TEST_DIR + 'frameshift_example.gtf', TEST_DIR + 'frameshift_example.no_chr_prefix.gtf'] + for gtf_file in gtf_files: + command = ["bin/dr-disco", + "integrate", + "--gtf", gtf_file, + input_file, + output_file] + + self.assertEqual(subprocess.call(command), 0, msg=" ".join([str(x) for x in command])) + + self.assertTrue(filecmp.cmp(test_file, output_file), msg="diff '" + test_file + "' '" + output_file + "':\n" + subprocess.Popen(['diff', test_file, output_file], stdout=subprocess.PIPE).stdout.read()) + + # must statisfy: + # self.assertEqual(str(frameshift_annotation[0]), "[(('AGRN(ENST00000620552.4)-ensembl', 1), ('HES4(ENST00000304952.10)-ensembl_havana', 2))]") + # self.assertEqual(len(frameshift_annotation[1]), 0) + # self.assertEqual(len(frameshift_annotation[2]), 0) class TestFunctional_integrate_splice_site_motif(unittest.TestCase): From e41b7364895a91c17a7a2079af6bdd1640b8bff3 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 9 Jan 2018 15:44:09 +0100 Subject: [PATCH 17/25] tempsav --- tests/integrate/test_frameshift-prediction_01.out.txt | 4 +++- tests/integrate/test_frameshift-prediction_02.out.txt | 4 +++- tests/integrate/test_frameshift-prediction_03.out.txt | 5 +++++ 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/tests/integrate/test_frameshift-prediction_01.out.txt b/tests/integrate/test_frameshift-prediction_01.out.txt index 794427aa..57e271c9 100644 --- a/tests/integrate/test_frameshift-prediction_01.out.txt +++ b/tests/integrate/test_frameshift-prediction_01.out.txt @@ -1,3 +1,5 @@ shared-id fusion chr-A pos-A direction-A pos-A-acceptor pos-A-donor chr-B pos-B direction-B pos-A-acceptor pos-A-donor genomic-distance filter-status 
circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads alignment-score mismatches n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps lr-A-slope lr-A-intercept lr-A-rvalue lr-A-pvalue lr-A-stderr lr-B-slope lr-B-intercept lr-B-rvalue lr-B-pvalue lr-B-stderr disco/split clips/score nodes/edge full-gene-dysregulation frameshift=0 frameshift=+1 frameshift=+2 splice-motif-edit-distance median-AS-A median-AS-B max-AS-A max-AS-B data-structure 1 chr1:1035203->chr1:999610 chr1 1035203 - 0 200 chr1 999610 - 200 0 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 AGRN(ENST00000620552.4)-ensembl->HES4(ENST00000304952.10)-ensembl_havana 50 50 50 50 -2 1:1035203->1:999610 1 1035203 - 0 200 1 999610 - 200 0 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 AGRN(ENST00000620552.4)-ensembl->HES4(ENST00000304952.10)-ensembl_havana 50 50 50 50 +2 chr1:1035203->chr1:999610 chr1 999610 - 200 0 chr1 1035203 - 0 200 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 AGRN(ENST00000620552.4)-ensembl->HES4(ENST00000304952.10)-ensembl_havana 50 50 50 50 +3 1:1035203->1:999610 1 1035203 - 0 200 1 999610 - 200 0 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 AGRN(ENST00000620552.4)-ensembl->HES4(ENST00000304952.10)-ensembl_havana 50 50 50 50 +4 1:1035203->1:999610 1 999610 - 200 0 1 1035203 - 0 200 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 
0.3179 2 AGRN(ENST00000620552.4)-ensembl->HES4(ENST00000304952.10)-ensembl_havana 50 50 50 50 diff --git a/tests/integrate/test_frameshift-prediction_02.out.txt b/tests/integrate/test_frameshift-prediction_02.out.txt index 7ba5df6c..80bc007d 100644 --- a/tests/integrate/test_frameshift-prediction_02.out.txt +++ b/tests/integrate/test_frameshift-prediction_02.out.txt @@ -1,3 +1,5 @@ shared-id fusion chr-A pos-A direction-A pos-A-acceptor pos-A-donor chr-B pos-B direction-B pos-A-acceptor pos-A-donor genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads alignment-score mismatches n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps lr-A-slope lr-A-intercept lr-A-rvalue lr-A-pvalue lr-A-stderr lr-B-slope lr-B-intercept lr-B-rvalue lr-B-pvalue lr-B-stderr disco/split clips/score nodes/edge full-gene-dysregulation frameshift=0 frameshift=+1 frameshift=+2 splice-motif-edit-distance median-AS-A median-AS-B max-AS-A max-AS-B data-structure 1 chr1:1035203->chr1:999020 chr1 1035203 - 0 200 chr1 999020 - 200 0 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 AGRN(ENST00000620552.4)-ensembl(+0)->HES4(ENST00000304952.10)-ensembl_havana(+2) 50 50 50 50 -2 1:1035203->1:999020 1 1035203 - 0 200 1 999020 - 200 0 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 AGRN(ENST00000620552.4)-ensembl(+0)->HES4(ENST00000304952.10)-ensembl_havana(+2) 50 50 50 50 +2 chr1:1035203->chr1:999020 chr1 999020 - 200 0 chr1 1035203 - 0 200 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 
AGRN(ENST00000620552.4)-ensembl(+0)->HES4(ENST00000304952.10)-ensembl_havana(+2) 50 50 50 50 +3 1:1035203->1:999020 1 1035203 - 0 200 1 999020 - 200 0 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 AGRN(ENST00000620552.4)-ensembl(+0)->HES4(ENST00000304952.10)-ensembl_havana(+2) 50 50 50 50 +4 1:1035203->1:999020 1 999020 - 200 0 1 1035203 - 0 200 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 AGRN(ENST00000620552.4)-ensembl(+0)->HES4(ENST00000304952.10)-ensembl_havana(+2) 50 50 50 50 diff --git a/tests/integrate/test_frameshift-prediction_03.out.txt b/tests/integrate/test_frameshift-prediction_03.out.txt index e69de29b..ce136ba3 100644 --- a/tests/integrate/test_frameshift-prediction_03.out.txt +++ b/tests/integrate/test_frameshift-prediction_03.out.txt @@ -0,0 +1,5 @@ +shared-id fusion chr-A pos-A direction-A pos-A-acceptor pos-A-donor chr-B pos-B direction-B pos-A-acceptor pos-A-donor genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads alignment-score mismatches n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps lr-A-slope lr-A-intercept lr-A-rvalue lr-A-pvalue lr-A-stderr lr-B-slope lr-B-intercept lr-B-rvalue lr-B-pvalue lr-B-stderr disco/split clips/score nodes/edge full-gene-dysregulation frameshift=0 frameshift=+1 frameshift=+2 splice-motif-edit-distance median-AS-A median-AS-B max-AS-A max-AS-B data-structure +1 chr1:1040604->chr1:999020 chr1 1040604 - 0 200 chr1 999020 - 200 0 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 
AGRN(ENST00000620552.4)-ensembl->HES4(ENST00000304952.10)-ensembl_havana 50 50 50 50 +2 chr1:1040604->chr1:999020 chr1 999020 - 200 0 chr1 1040604 - 0 200 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 AGRN(ENST00000620552.4)-ensembl->HES4(ENST00000304952.10)-ensembl_havana 50 50 50 50 +3 1:1040604->1:999020 1 1040604 - 0 200 1 999020 - 200 0 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 AGRN(ENST00000620552.4)-ensembl->HES4(ENST00000304952.10)-ensembl_havana 50 50 50 50 +4 1:1040604->1:999020 1 999020 - 200 0 1 1040604 - 0 200 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 AGRN(ENST00000620552.4)-ensembl->HES4(ENST00000304952.10)-ensembl_havana 50 50 50 50 From 117e5de2f9501293bd405034aa75d84be524da67 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 9 Jan 2018 15:46:55 +0100 Subject: [PATCH 18/25] complement for frameshift --- ...eshift-prediction_01-complementary.in.dbed | 17 +++++++++++++++ ...eshift-prediction_01-complementary.out.txt | 17 +++++++++++++++ tests/test_functional.py | 21 +++++++++++++++++++ 3 files changed, 55 insertions(+) create mode 100644 tests/integrate/test_frameshift-prediction_01-complementary.in.dbed create mode 100644 tests/integrate/test_frameshift-prediction_01-complementary.out.txt diff --git a/tests/integrate/test_frameshift-prediction_01-complementary.in.dbed b/tests/integrate/test_frameshift-prediction_01-complementary.in.dbed new file mode 100644 index 00000000..24dc2f6a --- /dev/null +++ b/tests/integrate/test_frameshift-prediction_01-complementary.in.dbed @@ -0,0 +1,17 @@ +chr-A pos-A direction-A pos-A-acceptor pos-A-donor chr-B pos-B direction-B pos-A-acceptor pos-A-donor 
genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads alignment-score mismatches n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps lr-A-slope lr-A-intercept lr-A-rvalue lr-A-pvalue lr-A-stderr lr-B-slope lr-B-intercept lr-B-rvalue lr-B-pvalue lr-B-stderr disco/split clips/score nodes/edge median-AS-A median-AS-B max-AS-A max-AS-B data-structure +chr1 1035203 + 0 200 chr1 999610 - 200 0 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 +chr1 999610 + 200 0 chr1 1035203 - 0 200 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 +1 1035203 + 0 200 1 999610 - 200 0 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 +1 999610 + 200 0 1 1035203 - 0 200 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 +chr1 1035203 - 0 200 chr1 999610 + 200 0 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 +chr1 999610 - 200 0 chr1 1035203 + 0 200 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 +1 1035203 - 0 200 1 999610 + 200 0 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 +1 999610 
- 200 0 1 1035203 + 0 200 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 +chr1 1035203 + 0 200 chr1 999610 + 200 0 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 +chr1 999610 + 200 0 chr1 1035203 + 0 200 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 +1 1035203 + 0 200 1 999610 + 200 0 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 +1 999610 + 200 0 1 1035203 + 0 200 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 +chr1 1035203 - 200 0 chr1 999610 - 0 200 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 +chr1 999610 - 0 200 chr1 1035203 - 200 0 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 +1 1035203 - 200 0 1 999610 - 0 200 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 +1 999610 - 0 200 1 1035203 - 200 0 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 diff --git a/tests/integrate/test_frameshift-prediction_01-complementary.out.txt 
b/tests/integrate/test_frameshift-prediction_01-complementary.out.txt new file mode 100644 index 00000000..707d2583 --- /dev/null +++ b/tests/integrate/test_frameshift-prediction_01-complementary.out.txt @@ -0,0 +1,17 @@ +shared-id fusion chr-A pos-A direction-A pos-A-acceptor pos-A-donor chr-B pos-B direction-B pos-A-acceptor pos-A-donor genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads alignment-score mismatches n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps lr-A-slope lr-A-intercept lr-A-rvalue lr-A-pvalue lr-A-stderr lr-B-slope lr-B-intercept lr-B-rvalue lr-B-pvalue lr-B-stderr disco/split clips/score nodes/edge full-gene-dysregulation frameshift=0 frameshift=+1 frameshift=+2 splice-motif-edit-distance median-AS-A median-AS-B max-AS-A max-AS-B data-structure +1 1:1035203->1:999610 1 1035203 + 0 200 1 999610 + 200 0 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 +1 chr1:1035203->chr1:999610 chr1 1035203 + 0 200 chr1 999610 - 200 0 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 +2 chr1:1035203->chr1:999610 chr1 1035203 + 0 200 chr1 999610 + 200 0 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 +3 chr1:1035203->chr1:999610 chr1 1035203 - 0 200 chr1 999610 + 200 0 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 +4 chr1:999610->chr1:1035203 chr1 1035203 - 200 0 chr1 999610 - 0 200 3052497 valid linear exonic 588 378 189 
21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 +5 chr1:1035203->chr1:999610 chr1 999610 + 200 0 chr1 1035203 + 0 200 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 +6 chr1:1035203->chr1:999610 chr1 999610 + 200 0 chr1 1035203 - 0 200 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 +7 chr1:1035203->chr1:999610 chr1 999610 - 200 0 chr1 1035203 + 0 200 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 +8 chr1:999610->chr1:1035203 chr1 999610 - 0 200 chr1 1035203 - 200 0 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 +9 1:1035203->1:999610 1 1035203 - 0 200 1 999610 + 200 0 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 +10 1:1035203->1:999610 1 999610 + 200 0 1 1035203 + 0 200 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 +10 1:1035203->1:999610 1 999610 - 200 0 1 1035203 + 0 200 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 +11 1:1035203->1:999610 1 999610 + 200 0 1 1035203 - 0 200 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 
0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 +12 1:1035203->1:999610 1 1035203 + 0 200 1 999610 - 200 0 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 +13 1:999610->1:1035203 1 1035203 - 200 0 1 999610 - 0 200 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 +14 1:999610->1:1035203 1 999610 - 0 200 1 1035203 - 200 0 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 diff --git a/tests/test_functional.py b/tests/test_functional.py index ffafa2cc..6cbba1e3 100755 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -299,6 +299,27 @@ def test_01(self): # example of in-frame fusion - strands are RNA strand # self.assertEqual(len(frameshift_annotation[1]), 0) # self.assertEqual(len(frameshift_annotation[2]), 0) + def test_01_complementary(self): + TEST_DIR, T_TEST_DIR = self.__get_temp_dirs() + + test_id = 'frameshift-prediction_01-complementary' + + input_file = TEST_DIR + "test_" + test_id + ".in.dbed" + test_file = TEST_DIR + "test_" + test_id + ".out.txt" + output_file = T_TEST_DIR + "test_" + test_id + ".out.txt" + + gtf_files = [TEST_DIR + 'frameshift_example.gtf', TEST_DIR + 'frameshift_example.no_chr_prefix.gtf'] + for gtf_file in gtf_files: + command = ["bin/dr-disco", + "integrate", + "--gtf", gtf_file, + input_file, + output_file] + + self.assertEqual(subprocess.call(command), 0, msg=" ".join([str(x) for x in command])) + + self.assertTrue(filecmp.cmp(test_file, output_file), msg="diff '" + test_file + "' '" + output_file + "':\n" + subprocess.Popen(['diff', test_file, output_file], stdout=subprocess.PIPE).stdout.read()) + def test_02(self): # 0, +2 TEST_DIR, 
T_TEST_DIR = self.__get_temp_dirs() From 2961a9e672223239231681c7ecc003c77e04e488 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 9 Jan 2018 15:50:01 +0100 Subject: [PATCH 19/25] removal of print statement --- drdisco/DetectFrameShifts.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drdisco/DetectFrameShifts.py b/drdisco/DetectFrameShifts.py index bc5a4f45..5fccd089 100644 --- a/drdisco/DetectFrameShifts.py +++ b/drdisco/DetectFrameShifts.py @@ -195,8 +195,7 @@ def evaluate(self, _from, _to, offset): _to ([chr, pos, strand]): acceptor position position """ - print _from , " --> " , _to - + from_l_fgd = [] to_l_fgd = [] From 808dc78cad5af501f1e49ba711f68f72d3d556e7 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 9 Jan 2018 15:52:17 +0100 Subject: [PATCH 20/25] flake --- drdisco/DetectOutput.py | 2 +- tests/test_functional.py | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/drdisco/DetectOutput.py b/drdisco/DetectOutput.py index 7b1f89d1..2cb623f5 100644 --- a/drdisco/DetectOutput.py +++ b/drdisco/DetectOutput.py @@ -491,7 +491,7 @@ def insert_in_index(index, entries, score): posA, posB = int(posAB[1].split('/')[0]), int(posAB[2].split('/')[0]) if params[0] not in done_breaks and n_split_reads > 0: - if e.donorA > e.donorB: + if e.donorA > e.donorB: # nice, use same thing to swap if necessary frame_shifts = dfs.evaluate([e.chrA, posA, e.RNAstrandA], [e.chrB, posB, e.RNAstrandB], 2) else: frame_shifts = dfs.evaluate([e.chrB, posB, e.RNAstrandB], [e.chrA, posA, e.RNAstrandA], 2) diff --git a/tests/test_functional.py b/tests/test_functional.py index 6cbba1e3..bc80e26c 100755 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -271,17 +271,17 @@ def test_01(self): # example of in-frame fusion - strands are RNA strand # both do have their DNA strand at minus!! 
: # - # <=(-)=| acceptor in negative strand at RNA - # =====(+)=====>| donor in positive strand at RNA - # - # donor acceptor - # fusions = ['chr1', 1035203, '+'], ['chr1', 999610, '-']) - # , (['1', 1035203, '+'], ['1', 999610, '-'])] # strands are at RNA level, and gene order is DONOR, ACCEPTOR + # <=(-)=| acceptor in negative strand at RNA + # =====(+)=====>| donor in positive strand at RNA + + # donor acceptor + # fusions = chr1', 1035203, '+'], ['chr1', 999610, '-']) + # 1', 1035203, '+'], ['1', 999610, '-'])] # strands are at RNA level, and gene order is DONOR, ACCEPTOR input_file = TEST_DIR + "test_" + test_id + ".in.dbed" test_file = TEST_DIR + "test_" + test_id + ".out.txt" output_file = T_TEST_DIR + "test_" + test_id + ".out.txt" - + gtf_files = [TEST_DIR + 'frameshift_example.gtf', TEST_DIR + 'frameshift_example.no_chr_prefix.gtf'] for gtf_file in gtf_files: command = ["bin/dr-disco", @@ -307,7 +307,7 @@ def test_01_complementary(self): input_file = TEST_DIR + "test_" + test_id + ".in.dbed" test_file = TEST_DIR + "test_" + test_id + ".out.txt" output_file = T_TEST_DIR + "test_" + test_id + ".out.txt" - + gtf_files = [TEST_DIR + 'frameshift_example.gtf', TEST_DIR + 'frameshift_example.no_chr_prefix.gtf'] for gtf_file in gtf_files: command = ["bin/dr-disco", @@ -353,11 +353,11 @@ def test_03(self): # +1, +2 -> 0 test_id = 'frameshift-prediction_03' # fusions = [(['chr1', 1040604, '+'], ['chr1', 999020, '-']), (['1', 1040604, '+'], ['1', 999020, '-'])] - + input_file = TEST_DIR + "test_" + test_id + ".in.dbed" test_file = TEST_DIR + "test_" + test_id + ".out.txt" output_file = T_TEST_DIR + "test_" + test_id + ".out.txt" - + gtf_files = [TEST_DIR + 'frameshift_example.gtf', TEST_DIR + 'frameshift_example.no_chr_prefix.gtf'] for gtf_file in gtf_files: command = ["bin/dr-disco", From 66cd6d1a5b293441f0bbf53857666fe957ff2fd9 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 9 Jan 2018 15:54:53 +0100 Subject: [PATCH 21/25] Changelog --- Changelog | 5 +++++ 1 
file changed, 5 insertions(+) diff --git a/Changelog b/Changelog index c313fa17..dd993e22 100644 --- a/Changelog +++ b/Changelog @@ -1,3 +1,8 @@ +2018-01-09 Youri Hoogstrate v0.15.0 + * Bugfix resulting in higher number of detected frame shifts + * `dr-disco integrate --fasta ` provides edit distance to + canonical splice junction motif (quick implementation) + 2017-12-20 Youri Hoogstrate v0.14.6 * New improvement to entropy filter From 15e3cc3ebf79f15642911f9a8dc2dbbe3cf41984 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 9 Jan 2018 16:07:29 +0100 Subject: [PATCH 22/25] fix order --- ...eshift-prediction_01-complementary.in.dbed | 32 +++++++++---------- ...eshift-prediction_01-complementary.out.txt | 32 +++++++++---------- 2 files changed, 32 insertions(+), 32 deletions(-) diff --git a/tests/integrate/test_frameshift-prediction_01-complementary.in.dbed b/tests/integrate/test_frameshift-prediction_01-complementary.in.dbed index 24dc2f6a..335e4da6 100644 --- a/tests/integrate/test_frameshift-prediction_01-complementary.in.dbed +++ b/tests/integrate/test_frameshift-prediction_01-complementary.in.dbed @@ -1,17 +1,17 @@ chr-A pos-A direction-A pos-A-acceptor pos-A-donor chr-B pos-B direction-B pos-A-acceptor pos-A-donor genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads alignment-score mismatches n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps lr-A-slope lr-A-intercept lr-A-rvalue lr-A-pvalue lr-A-stderr lr-B-slope lr-B-intercept lr-B-rvalue lr-B-pvalue lr-B-stderr disco/split clips/score nodes/edge median-AS-A median-AS-B max-AS-A max-AS-B data-structure -chr1 1035203 + 0 200 chr1 999610 - 200 0 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 -chr1 999610 + 200 0 chr1 1035203 - 0 200 3052497 valid 
linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 -1 1035203 + 0 200 1 999610 - 200 0 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 -1 999610 + 200 0 1 1035203 - 0 200 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 -chr1 1035203 - 0 200 chr1 999610 + 200 0 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 -chr1 999610 - 200 0 chr1 1035203 + 0 200 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 -1 1035203 - 0 200 1 999610 + 200 0 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 -1 999610 - 200 0 1 1035203 + 0 200 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 -chr1 1035203 + 0 200 chr1 999610 + 200 0 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 -chr1 999610 + 200 0 chr1 1035203 + 0 200 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 -1 1035203 + 0 200 1 999610 + 200 0 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 
0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 -1 999610 + 200 0 1 1035203 + 0 200 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 -chr1 1035203 - 200 0 chr1 999610 - 0 200 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 -chr1 999610 - 0 200 chr1 1035203 - 200 0 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 -1 1035203 - 200 0 1 999610 - 0 200 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 -1 999610 - 0 200 1 1035203 - 200 0 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 +chr1 1035203 + 0 200 chr1 999610 - 200 0 3052497 valid linear exonic 800 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 +chr1 999610 + 200 0 chr1 1035203 - 0 200 3052497 valid linear exonic 750 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 +1 1035203 + 0 200 1 999610 - 200 0 2992369 valid linear intronic 700 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 +1 999610 + 200 0 1 1035203 - 0 200 2992369 valid linear intronic 650 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 
0.1458 0.3179 2 50 50 50 50 +chr1 1035203 - 0 200 chr1 999610 + 200 0 3052497 valid linear exonic 600 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 +chr1 999610 - 200 0 chr1 1035203 + 0 200 3052497 valid linear exonic 550 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 +1 1035203 - 0 200 1 999610 + 200 0 2992369 valid linear intronic 500 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 +1 999610 - 200 0 1 1035203 + 0 200 2992369 valid linear intronic 450 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 +chr1 1035203 + 0 200 chr1 999610 + 200 0 3052497 valid linear exonic 400 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 +chr1 999610 + 200 0 chr1 1035203 + 0 200 3052497 valid linear exonic 350 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 +1 1035203 + 0 200 1 999610 + 200 0 2992369 valid linear intronic 300 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 +1 999610 + 200 0 1 1035203 + 0 200 2992369 valid linear intronic 250 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 +chr1 1035203 - 200 0 chr1 999610 - 0 200 3052497 valid linear exonic 200 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 +chr1 999610 - 0 200 chr1 
1035203 - 200 0 3052497 valid linear exonic 150 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 +1 1035203 - 200 0 1 999610 - 0 200 2992369 valid linear intronic 100 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 +1 999610 - 0 200 1 1035203 - 200 0 2992369 valid linear intronic 50 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 diff --git a/tests/integrate/test_frameshift-prediction_01-complementary.out.txt b/tests/integrate/test_frameshift-prediction_01-complementary.out.txt index 707d2583..f68d4a96 100644 --- a/tests/integrate/test_frameshift-prediction_01-complementary.out.txt +++ b/tests/integrate/test_frameshift-prediction_01-complementary.out.txt @@ -1,17 +1,17 @@ shared-id fusion chr-A pos-A direction-A pos-A-acceptor pos-A-donor chr-B pos-B direction-B pos-A-acceptor pos-A-donor genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads alignment-score mismatches n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps lr-A-slope lr-A-intercept lr-A-rvalue lr-A-pvalue lr-A-stderr lr-B-slope lr-B-intercept lr-B-rvalue lr-B-pvalue lr-B-stderr disco/split clips/score nodes/edge full-gene-dysregulation frameshift=0 frameshift=+1 frameshift=+2 splice-motif-edit-distance median-AS-A median-AS-B max-AS-A max-AS-B data-structure -1 1:1035203->1:999610 1 1035203 + 0 200 1 999610 + 200 0 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 -1 chr1:1035203->chr1:999610 chr1 1035203 + 0 200 chr1 999610 - 200 0 3052497 valid linear exonic 588 378 189 21 10000 
0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 -2 chr1:1035203->chr1:999610 chr1 1035203 + 0 200 chr1 999610 + 200 0 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 -3 chr1:1035203->chr1:999610 chr1 1035203 - 0 200 chr1 999610 + 200 0 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 -4 chr1:999610->chr1:1035203 chr1 1035203 - 200 0 chr1 999610 - 0 200 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 -5 chr1:1035203->chr1:999610 chr1 999610 + 200 0 chr1 1035203 + 0 200 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 -6 chr1:1035203->chr1:999610 chr1 999610 + 200 0 chr1 1035203 - 0 200 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 -7 chr1:1035203->chr1:999610 chr1 999610 - 200 0 chr1 1035203 + 0 200 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 -8 chr1:999610->chr1:1035203 chr1 999610 - 0 200 chr1 1035203 - 200 0 3052497 valid linear exonic 588 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 -9 1:1035203->1:999610 1 1035203 - 0 200 1 999610 + 200 0 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 
0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 -10 1:1035203->1:999610 1 999610 + 200 0 1 1035203 + 0 200 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 -10 1:1035203->1:999610 1 999610 - 200 0 1 1035203 + 0 200 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 -11 1:1035203->1:999610 1 999610 + 200 0 1 1035203 - 0 200 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 -12 1:1035203->1:999610 1 1035203 + 0 200 1 999610 - 200 0 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 -13 1:999610->1:1035203 1 1035203 - 200 0 1 999610 - 0 200 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 -14 1:999610->1:1035203 1 999610 - 0 200 1 1035203 - 200 0 2992369 valid linear intronic 151 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 +1 1:1035203->1:999610 1 1035203 + 0 200 1 999610 + 200 0 2992369 valid linear intronic 300 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 +1 chr1:1035203->chr1:999610 chr1 1035203 + 0 200 chr1 999610 - 200 0 3052497 valid linear exonic 800 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 +2 
1:1035203->1:999610 1 999610 + 200 0 1 1035203 - 0 200 2992369 valid linear intronic 650 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 +2 1:1035203->1:999610 1 999610 - 200 0 1 1035203 + 0 200 2992369 valid linear intronic 450 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 +3 1:1035203->1:999610 1 1035203 - 0 200 1 999610 + 200 0 2992369 valid linear intronic 500 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 +4 chr1:1035203->chr1:999610 chr1 999610 + 200 0 chr1 1035203 - 0 200 3052497 valid linear exonic 750 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 +5 1:1035203->1:999610 1 1035203 + 0 200 1 999610 - 200 0 2992369 valid linear intronic 700 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 +6 1:1035203->1:999610 1 999610 + 200 0 1 1035203 + 0 200 2992369 valid linear intronic 250 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 +7 chr1:1035203->chr1:999610 chr1 1035203 - 0 200 chr1 999610 + 200 0 3052497 valid linear exonic 600 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 +8 chr1:1035203->chr1:999610 chr1 999610 - 200 0 chr1 1035203 + 0 200 3052497 valid linear exonic 550 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 +9 chr1:1035203->chr1:999610 chr1 1035203 + 0 200 chr1 999610 + 200 0 3052497 valid linear exonic 
400 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 +10 chr1:1035203->chr1:999610 chr1 999610 + 200 0 chr1 1035203 + 0 200 3052497 valid linear exonic 350 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 +11 chr1:999610->chr1:1035203 chr1 1035203 - 200 0 chr1 999610 - 0 200 3052497 valid linear exonic 200 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 +12 chr1:999610->chr1:1035203 chr1 999610 - 0 200 chr1 1035203 - 200 0 3052497 valid linear exonic 150 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 +13 1:999610->1:1035203 1 1035203 - 200 0 1 999610 - 0 200 2992369 valid linear intronic 100 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 +14 1:999610->1:1035203 1 999610 - 0 200 1 1035203 - 200 0 2992369 valid linear intronic 50 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 From 7b4e0228360fca535932788a38ed4a9cec9e0af8 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 9 Jan 2018 16:17:15 +0100 Subject: [PATCH 23/25] deunique --- drdisco/DetectOutput.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drdisco/DetectOutput.py b/drdisco/DetectOutput.py index 2cb623f5..4a417ab0 100644 --- a/drdisco/DetectOutput.py +++ b/drdisco/DetectOutput.py @@ -439,11 +439,11 @@ def classify_intronic_exonic(): log.info("Classified " + str(k) + "/" + str(n) + " as valid") def integrate(self, output_table, gtf_file, fasta_file): - def insert_in_index(index, entries, score): + def 
insert_in_index(index, entries, score, i): if score not in index: index[score] = {} - key = entries[0].chrA + ':' + str(entries[0].posA) + '(' + entries[0].strandA + ')-' + entries[0].chrB + ':' + str(entries[0].posB) + '(' + entries[0].strandB + ')' + key = entries[0].chrA + ':' + str(entries[0].posA) + '(' + entries[0].strandA + ')-' + entries[0].chrB + ':' + str(entries[0].posB) + '(' + entries[0].strandB + ')_'+str(i) index[score][key] = entries with open(output_table, 'w') as fh_out: @@ -532,7 +532,7 @@ def insert(pos, e): # Reorder idx2 = {} - + i = 0 for e in intronic_linear: results = {} positions = [(e.chrA, e.posA, e.strandA), (e.chrB, e.posB, e.strandB)] @@ -571,12 +571,14 @@ def insert(pos, e): top_result = (r, penalty) if top_result[0]: - insert_in_index(idx2, [e, top_result[0]], e.score + top_result[0].score) + insert_in_index(idx2, [e, top_result[0]], e.score + top_result[0].score, i) else: - insert_in_index(idx2, [e], e.score) + insert_in_index(idx2, [e], e.score, i) + + i += 1 for e in remainder: - insert_in_index(idx2, [e], e.score) + insert_in_index(idx2, [e], e.score, i) log.info("Determining fusion gene names and generate output") # Generate output @@ -584,6 +586,7 @@ def insert(pos, e): exported = set([]) for score in sorted(idx2.keys(), reverse=True): for key in sorted(idx2[score].keys()): + print score,key added = 0 for entry in idx2[score][key]: if entry not in exported: From 2ea68fbc0c42b0bec2bad3754c863d5581f9ccd0 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 9 Jan 2018 16:31:02 +0100 Subject: [PATCH 24/25] deunique --- drdisco/DetectOutput.py | 15 ++++++++------- ...frameshift-prediction_01-complementary.out.txt | 14 +++++++------- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/drdisco/DetectOutput.py b/drdisco/DetectOutput.py index 4a417ab0..a9ab87d0 100644 --- a/drdisco/DetectOutput.py +++ b/drdisco/DetectOutput.py @@ -532,7 +532,7 @@ def insert(pos, e): # Reorder idx2 = {} - i = 0 + q = 0 for e in 
intronic_linear: results = {} positions = [(e.chrA, e.posA, e.strandA), (e.chrB, e.posB, e.strandB)] @@ -559,7 +559,7 @@ def insert(pos, e): results[e2] += 1 top_result = (None, 9999999999999) - for r in results: + for r in sorted(results.keys()): if results[r] >= 2: d1 = (r.posA - e.posA) d2 = (r.posB - e.posB) @@ -567,18 +567,20 @@ def insert(pos, e): shared_score = math.sqrt((pow(e.score, 2) + pow(r.score, 2)) * 0.5) penalty = 1.0 * sq_d / shared_score + if penalty < top_result[1]: top_result = (r, penalty) if top_result[0]: - insert_in_index(idx2, [e, top_result[0]], e.score + top_result[0].score, i) + insert_in_index(idx2, [e, top_result[0]], e.score + top_result[0].score, q) else: - insert_in_index(idx2, [e], e.score, i) + insert_in_index(idx2, [e], e.score, q) - i += 1 + q += 1 for e in remainder: - insert_in_index(idx2, [e], e.score, i) + insert_in_index(idx2, [e], e.score, q) + q +=1 log.info("Determining fusion gene names and generate output") # Generate output @@ -586,7 +588,6 @@ def insert(pos, e): exported = set([]) for score in sorted(idx2.keys(), reverse=True): for key in sorted(idx2[score].keys()): - print score,key added = 0 for entry in idx2[score][key]: if entry not in exported: diff --git a/tests/integrate/test_frameshift-prediction_01-complementary.out.txt b/tests/integrate/test_frameshift-prediction_01-complementary.out.txt index f68d4a96..0fcc81c3 100644 --- a/tests/integrate/test_frameshift-prediction_01-complementary.out.txt +++ b/tests/integrate/test_frameshift-prediction_01-complementary.out.txt @@ -1,14 +1,14 @@ shared-id fusion chr-A pos-A direction-A pos-A-acceptor pos-A-donor chr-B pos-B direction-B pos-A-acceptor pos-A-donor genomic-distance filter-status circRNA intronic/exonic score soft+hardclips n-split-reads n-discordant-reads alignment-score mismatches n-edges n-nodes-A n-nodes-B n-splice-junc-A n-splice-junc-B entropy-bp-edge entropy-all-edges bp-pos-stddev entropy-disco-bps lr-A-slope lr-A-intercept lr-A-rvalue lr-A-pvalue 
lr-A-stderr lr-B-slope lr-B-intercept lr-B-rvalue lr-B-pvalue lr-B-stderr disco/split clips/score nodes/edge full-gene-dysregulation frameshift=0 frameshift=+1 frameshift=+2 splice-motif-edit-distance median-AS-A median-AS-B max-AS-A max-AS-B data-structure -1 1:1035203->1:999610 1 1035203 + 0 200 1 999610 + 200 0 2992369 valid linear intronic 300 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 +1 1:1035203->1:999610 1 1035203 - 0 200 1 999610 + 200 0 2992369 valid linear intronic 500 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 1 chr1:1035203->chr1:999610 chr1 1035203 + 0 200 chr1 999610 - 200 0 3052497 valid linear exonic 800 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 2 1:1035203->1:999610 1 999610 + 200 0 1 1035203 - 0 200 2992369 valid linear intronic 650 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 -2 1:1035203->1:999610 1 999610 - 200 0 1 1035203 + 0 200 2992369 valid linear intronic 450 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 -3 1:1035203->1:999610 1 1035203 - 0 200 1 999610 + 200 0 2992369 valid linear intronic 500 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 -4 chr1:1035203->chr1:999610 chr1 999610 + 200 0 chr1 1035203 - 0 200 3052497 valid linear exonic 750 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 -5 1:1035203->1:999610 1 1035203 + 0 200 1 999610 - 200 0 2992369 valid linear intronic 
700 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 -6 1:1035203->1:999610 1 999610 + 200 0 1 1035203 + 0 200 2992369 valid linear intronic 250 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 +2 chr1:1035203->chr1:999610 chr1 999610 - 200 0 chr1 1035203 + 0 200 3052497 valid linear exonic 550 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 +3 1:1035203->1:999610 1 1035203 + 0 200 1 999610 + 200 0 2992369 valid linear intronic 300 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 +4 1:1035203->1:999610 1 999610 + 200 0 1 1035203 + 0 200 2992369 valid linear intronic 250 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 +5 chr1:1035203->chr1:999610 chr1 999610 + 200 0 chr1 1035203 - 0 200 3052497 valid linear exonic 750 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 +6 1:1035203->1:999610 1 1035203 + 0 200 1 999610 - 200 0 2992369 valid linear intronic 700 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 7 chr1:1035203->chr1:999610 chr1 1035203 - 0 200 chr1 999610 + 200 0 3052497 valid linear exonic 600 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 -8 chr1:1035203->chr1:999610 chr1 999610 - 200 0 chr1 1035203 + 0 200 3052497 valid linear exonic 550 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 
0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 +8 1:1035203->1:999610 1 999610 - 200 0 1 1035203 + 0 200 2992369 valid linear intronic 450 96 48 7 10000 0 1 1 1 0 0 0.869 0.869 0.2887 0.8124 2.1465 5.6435 0.9817 0 0.0571 1.9162 26.2818 0.99 0 0.0376 0.1458 0.3179 2 50 50 50 50 9 chr1:1035203->chr1:999610 chr1 1035203 + 0 200 chr1 999610 + 200 0 3052497 valid linear exonic 400 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 10 chr1:1035203->chr1:999610 chr1 999610 + 200 0 chr1 1035203 + 0 200 3052497 valid linear exonic 350 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 11 chr1:999610->chr1:1035203 chr1 1035203 - 200 0 chr1 999610 - 0 200 3052497 valid linear exonic 200 378 189 21 10000 0 14 7 9 2 4 0.7421 0.8197 0 0.7162 0.7344 19.9882 0.9657 0 0.0178 0.7194 21.976 0.9648 0 0.0177 0.1111 0.3214 1.1429 50 50 50 50 From 6bd77476717895194b0a240d0a4d0f0f72994dd6 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 9 Jan 2018 16:34:24 +0100 Subject: [PATCH 25/25] flake --- drdisco/DetectOutput.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drdisco/DetectOutput.py b/drdisco/DetectOutput.py index a9ab87d0..b6c8e68e 100644 --- a/drdisco/DetectOutput.py +++ b/drdisco/DetectOutput.py @@ -443,7 +443,7 @@ def insert_in_index(index, entries, score, i): if score not in index: index[score] = {} - key = entries[0].chrA + ':' + str(entries[0].posA) + '(' + entries[0].strandA + ')-' + entries[0].chrB + ':' + str(entries[0].posB) + '(' + entries[0].strandB + ')_'+str(i) + key = entries[0].chrA + ':' + str(entries[0].posA) + '(' + entries[0].strandA + ')-' + entries[0].chrB + ':' + str(entries[0].posB) + '(' + entries[0].strandB + ')_' + str(i) index[score][key] = entries with open(output_table, 'w') as fh_out: @@ -580,7 +580,7 @@ def insert(pos, 
e): for e in remainder: insert_in_index(idx2, [e], e.score, q) - q +=1 + q += 1 log.info("Determining fusion gene names and generate output") # Generate output