From e8f3dcb0ad754b1f1e9b62f557988f1fdf9d8f72 Mon Sep 17 00:00:00 2001
From: Shogo Ohta
Date: Sat, 21 Nov 2020 00:07:53 +0900
Subject: [PATCH 1/2] Implement filter for chimeras in relatively short ranges

---
 fusionfusion/short_range_chimera_filter.py | 125 +++++++++++++++++++++
 1 file changed, 125 insertions(+)
 create mode 100644 fusionfusion/short_range_chimera_filter.py

diff --git a/fusionfusion/short_range_chimera_filter.py b/fusionfusion/short_range_chimera_filter.py
new file mode 100644
index 0000000..7b04ca0
--- /dev/null
+++ b/fusionfusion/short_range_chimera_filter.py
@@ -0,0 +1,125 @@
+import collections
+import copy
+import itertools
+import pysam
+
+from . import annotationFunction
+
+class ShortRangeChimeraFilter:
+    def __init__(self, ref_gene_tb, ens_gene_tb):
+        self.ref_gene_tb = ref_gene_tb
+        self.ens_gene_tb = ens_gene_tb
+        self.interval = 500
+
+    def run(self, splicing_file, bam_file, output_file):
+        with open(splicing_file) as splicing_reader, \
+             pysam.AlignmentFile(bam_file, 'r') as bam_reader, \
+             pysam.AlignmentFile(output_file, 'w', header=bam_reader.header) as sam_writer:
+            for chr, start, end in self.load_splicing_junctions(splicing_reader):
+                pair_alns = collections.defaultdict(set)
+                for aln in self.load_alignments(bam_reader, chr, start, end):
+                    locus = (aln.reference_name, aln.reference_start)
+                    entry = (aln.query_name, aln.is_read1)
+                    if entry in pair_alns[locus]:
+                        self.write_alignment(sam_writer, aln, chr, start, end)
+                        pair_alns[locus].remove(entry)
+                    else:
+                        aln1, aln2 = self.split_alignment(aln, start)
+                        self.write_alignment(sam_writer, aln1, chr, start, end)
+                        self.write_alignment(sam_writer, aln2, chr, start, end)
+                        next_locus = (aln.next_reference_name, aln.next_reference_start)
+                        next_entry = (aln.query_name, not aln.is_read1)
+                        pair_alns[next_locus].add(next_entry)
+                # fetch and write pair reads
+                for aln in self.load_pair_alignments(bam_reader, pair_alns):
+                    self.write_alignment(sam_writer, aln, chr, start, end)
+
+    def write_alignment(self, sam_writer, aln, chr, start, end):
+        # Strip tags since they are unnecessary for succeeding processes
+        aln.set_tags([])
+        aln.query_name += '_{}:{}-{}'.format(chr, start, end+1)
+        sam_writer.write(aln)
+
+    def get_genes_at(self, chr, pos):
+        return set(annotationFunction.get_gene_info(chr, pos, self.ref_gene_tb, self.ens_gene_tb))
+
+    def load_splicing_junctions(self, splicing_reader):
+        for line in splicing_reader:
+            chr, start, end = line.split('\t')[:3]
+            start_genes = self.get_genes_at(chr, start)
+            end_genes = self.get_genes_at(chr, end)
+            if not(start_genes.intersection(end_genes)):
+                yield (chr, int(start)-1, int(end))
+
+    def load_alignments(self, bam_reader, chr, start, end):
+        def pairwise(it):
+            it1, it2 = itertools.tee(it)
+            next(it2, None)
+            return zip(it1, it2)
+
+        for aln in bam_reader.fetch(chr, start, start+1):
+            if not aln.is_proper_pair or aln.is_secondary: continue
+
+            for (_, curr_end), (next_start, _) in pairwise(aln.get_blocks()):
+                if curr_end == start and next_start == end:
+                    break
+            else:
+                continue
+            yield aln
+
+    def load_pair_alignments(self, bam_reader, pair_alns):
+        def chunked(alns, interval):
+            chrom = start = end = None
+            entries = set()
+            for (chr, pos), es in sorted(alns.items(), key=lambda x: x[0]):
+                if chrom == chr and pos <= start + interval:
+                    end = pos
+                    entries.update(es)
+                    continue
+                if entries:
+                    yield chrom, start, end, entries
+                chrom = chr
+                start = end = pos
+                entries = set(es)
+            if entries:
+                yield chrom, start, end, entries
+
+        for (chr, start, end, entries) in chunked(pair_alns, self.interval):
+            for aln in bam_reader.fetch(chr, start, end + 1):
+                locus = (aln.reference_name, aln.reference_start)
+                entry = (aln.query_name, aln.is_read1)
+                if not aln.is_secondary and entry in entries:
+                    yield aln
+
+    def split_alignment(self, aln, splicing_start):
+        elems = aln.cigartuples
+        blocks = aln.get_blocks()
+
+        elem_idx = 0
+        block_idx = 0
+        for op, len in elems:
+            if op == 0:
+                _, block_end = blocks[block_idx]
+                if block_end == splicing_start: break
+                block_idx += 1
+            elem_idx += 1
+
+        elems1 = elems[:elem_idx+1]
+        elems2 = list(itertools.dropwhile(lambda elem: elem[0] != 0, elems[elem_idx+1:]))
+        count_bases = lambda ops, elems: sum(e[1] for e in elems if e[0] in ops)
+
+        aln1 = copy.copy(aln)
+        aln2 = copy.copy(aln)
+        prim_flag, suppl_flag = aln.flag & ~0x100, aln.flag | 0x100
+        # Consider alignment with more matched bases as primary and the other as supplementary
+        if count_bases({0}, elems1) >= count_bases({0}, elems2):
+            aln1.flag, aln2.flag = prim_flag, suppl_flag
+        else:
+            aln1.flag, aln2.flag = suppl_flag, prim_flag
+        aln2.reference_start = blocks[block_idx+1][0]
+        # To reset CIGAR (and related properties), .cigartuples must be set to None first
+        aln1.cigartuples = None; aln1.cigartuples = elems1 + [(4, count_bases({0, 1, 4}, elems2))]
+        aln2.cigartuples = None; aln2.cigartuples = [(4, count_bases({0, 1, 4}, elems1))] + elems2
+        aln1.template_length, aln2.template_length = \
+            (aln.template_length, 0) if aln.template_length >= 0 else (0, aln.template_length)
+        return (aln1, aln2)

From f155e31ed0fbc344b8b358ecaf9c6ec6581feed3 Mon Sep 17 00:00:00 2001
From: Shogo Ohta
Date: Tue, 1 Dec 2020 17:50:58 +0900
Subject: [PATCH 2/2] Wire up new filter to fusionfusion driver

---
 fusionfusion/arg_parser.py |  4 +--
 fusionfusion/run.py        | 60 ++++++++++++++++++++++++++++++--------
 2 files changed, 50 insertions(+), 14 deletions(-)

diff --git a/fusionfusion/arg_parser.py b/fusionfusion/arg_parser.py
index 62adc64..ae8becd 100644
--- a/fusionfusion/arg_parser.py
+++ b/fusionfusion/arg_parser.py
@@ -13,8 +13,8 @@ def create_parser():
     parser.add_argument("--version", action = "version", version = "%(prog)s " + __version__)
     # parser.add_argument("--version", action = "version", version = "fusionfusion-0.5.0b1")
 
-    parser.add_argument("--star", metavar = "star.Chimeric.out.sam", default = None, type = str,
-                        help = "the path to the chimeric sam file by STAR")
+    parser.add_argument("--star", metavar = "star", default = None, type = str,
+                        help = "the path prefix of STAR result files")
 
     parser.add_argument("--ms2", metavar = "ms2.bam", default = None, type = str,
                         help = "the path to the bam file by Map splice2")
diff --git a/fusionfusion/run.py b/fusionfusion/run.py
index 5473af8..55be6b1 100644
--- a/fusionfusion/run.py
+++ b/fusionfusion/run.py
@@ -2,11 +2,13 @@
 
 from __future__ import print_function
 
-import sys, os, argparse, subprocess, shutil
+import sys, os, argparse, subprocess, shutil, pysam
+import annot_utils as annot
 from . import parseJunctionInfo
 from . import filterJunctionInfo
 from . import annotationFunction
 from . import utils
+from .short_range_chimera_filter import ShortRangeChimeraFilter
 
 # import config
 from .config import *
@@ -83,12 +85,12 @@ def cluster_filter_junction(inputFilePath, outputFilePrefix, args):
 
 def fusionfusion_main(args):
 
-    starBamFile = args.star
+    starFilePrefix = args.star
     ms2BamFile = args.ms2
     th2BamFile = args.th2
     output_dir = args.out
 
-    if starBamFile == None and ms2BamFile == None and th2BamFile == None:
+    if starFilePrefix == None and ms2BamFile == None and th2BamFile == None:
        print("At least one of --star, --ms2 or --th2 should be included", file = sys.stderr)
        sys.exit(1)
 
@@ -122,19 +124,53 @@ def fusionfusion_main(args):
 
     ####################
     # parsing chimeric reads from bam files
-    if starBamFile is not None:
-
-        parseJunctionInfo.parseJuncInfo_STAR(starBamFile, output_dir + "/star.chimeric.tmp.txt")
-
-
-        hOUT = open(output_dir + "/star.chimeric.txt", "w")
-        subprocess.check_call(["sort", "-k1,1", "-k2,2n", "-k4,4", "-k5,5n", output_dir + "/star.chimeric.tmp.txt"], stdout = hOUT)
-        hOUT.close()
+    if starFilePrefix is not None:
+
+        parseJunctionInfo.parseJuncInfo_STAR(starFilePrefix + ".Chimeric.out.sam",
+                                             output_dir + "/star.chimeric.tmp1.txt")
+
+        genome_id = args.genome_id
+        is_grc = args.grc
+        annot.gene.make_gene_info(output_dir + "/star.tmp.refGene.bed.gz", "refseq", genome_id, is_grc, False)
+        annot.gene.make_gene_info(output_dir + "/star.tmp.ensGene.bed.gz", "gencode", genome_id, is_grc, False)
+
+        with pysam.TabixFile(output_dir + "/star.tmp.refGene.bed.gz") as ref_gene_tb, \
+             pysam.TabixFile(output_dir + "/star.tmp.ensGene.bed.gz") as ens_gene_tb:
+            filt = ShortRangeChimeraFilter(ref_gene_tb, ens_gene_tb)
+            filt.run(
+                starFilePrefix + ".SJ.out.tab",
+                starFilePrefix + ".Aligned.sortedByCoord.out.bam",
+                output_dir + "/star.chimeric.tmp2.sam"
+            )
+        subprocess.check_call(["rm", output_dir + "/star.tmp.refGene.bed.gz"])
+        subprocess.check_call(["rm", output_dir + "/star.tmp.ensGene.bed.gz"])
+        subprocess.check_call(["rm", output_dir + "/star.tmp.refGene.bed.gz.tbi"])
+        subprocess.check_call(["rm", output_dir + "/star.tmp.ensGene.bed.gz.tbi"])
+
+        pysam.sort(
+            "-n",
+            "-o", output_dir + "/star.chimeric.tmp2.sorted.sam",
+            output_dir + "/star.chimeric.tmp2.sam"
+        )
+        parseJunctionInfo.parseJuncInfo_STAR(output_dir + "/star.chimeric.tmp2.sorted.sam",
+                                             output_dir + "/star.chimeric.tmp2.txt")
+
+        with open(output_dir + "/star.chimeric.txt", "w") as hOUT:
+            subprocess.check_call([
+                "bash", "-c",
+                "cat {input1} {input2} | sort -k1,1 -k2,2n -k4,4 -k5,5n".format(
+                    input1=output_dir + "/star.chimeric.tmp1.txt",
+                    input2=output_dir + "/star.chimeric.tmp2.txt"
+                )
+            ], stdout=hOUT)
 
         cluster_filter_junction(output_dir + "/star.chimeric.txt", output_dir + "/star", args)
 
         if debug_mode == False:
-            subprocess.check_call(["rm", output_dir + "/star.chimeric.tmp.txt"])
+            subprocess.check_call(["rm", output_dir + "/star.chimeric.tmp1.txt"])
+            subprocess.check_call(["rm", output_dir + "/star.chimeric.tmp2.txt"])
+            subprocess.check_call(["rm", output_dir + "/star.chimeric.tmp2.sam"])
+            subprocess.check_call(["rm", output_dir + "/star.chimeric.tmp2.sorted.sam"])
            subprocess.check_call(["rm", output_dir + "/star.chimeric.txt"])
 
    if ms2BamFile is not None:
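
A note for reviewers: the snippet below is a minimal sketch of how the new filter could be exercised on its own, outside the fusionfusion driver. It mirrors the wiring added to run.py above (annot_utils gene tables, STAR's SJ.out.tab and coordinate-sorted BAM). The "hg19" genome id, the GRC flag, and the sample.* / tmp.* file names are placeholders chosen for illustration, not part of the patch.

    import pysam
    import annot_utils as annot

    from fusionfusion.short_range_chimera_filter import ShortRangeChimeraFilter

    # Build the refGene/ensGene BED tables the filter uses for gene annotation
    # (genome id and GRC flag are assumed values here).
    annot.gene.make_gene_info("tmp.refGene.bed.gz", "refseq", "hg19", False, False)
    annot.gene.make_gene_info("tmp.ensGene.bed.gz", "gencode", "hg19", False, False)

    with pysam.TabixFile("tmp.refGene.bed.gz") as ref_gene_tb, \
         pysam.TabixFile("tmp.ensGene.bed.gz") as ens_gene_tb:
        filt = ShortRangeChimeraFilter(ref_gene_tb, ens_gene_tb)
        # SJ.out.tab and the coordinate-sorted, indexed BAM should come from the
        # same STAR run; the output is a SAM of re-split chimeric candidates.
        filt.run("sample.SJ.out.tab",
                 "sample.Aligned.sortedByCoord.out.bam",
                 "sample.short_range_chimera.sam")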