From e8f3dcb0ad754b1f1e9b62f557988f1fdf9d8f72 Mon Sep 17 00:00:00 2001
From: Shogo Ohta
Date: Sat, 21 Nov 2020 00:07:53 +0900
Subject: [PATCH 1/2] Implement filter for chimeras in relatively short ranges

---
 fusionfusion/short_range_chimera_filter.py | 125 +++++++++++++++++++++
 1 file changed, 125 insertions(+)
 create mode 100644 fusionfusion/short_range_chimera_filter.py

diff --git a/fusionfusion/short_range_chimera_filter.py b/fusionfusion/short_range_chimera_filter.py
new file mode 100644
index 0000000..7b04ca0
--- /dev/null
+++ b/fusionfusion/short_range_chimera_filter.py
@@ -0,0 +1,125 @@
+import collections
+import copy
+import itertools
+import pysam
+
+from . import annotationFunction
+
+class ShortRangeChimeraFilter:
+    def __init__(self, ref_gene_tb, ens_gene_tb):
+        self.ref_gene_tb = ref_gene_tb
+        self.ens_gene_tb = ens_gene_tb
+        self.interval = 500
+
+    def run(self, splicing_file, bam_file, output_file):
+        with open(splicing_file) as splicing_reader, \
+             pysam.AlignmentFile(bam_file, 'r') as bam_reader, \
+             pysam.AlignmentFile(output_file, 'w', header=bam_reader.header) as sam_writer:
+            for chr, start, end in self.load_splicing_junctions(splicing_reader):
+                pair_alns = collections.defaultdict(set)
+                for aln in self.load_alignments(bam_reader, chr, start, end):
+                    locus = (aln.reference_name, aln.reference_start)
+                    entry = (aln.query_name, aln.is_read1)
+                    if entry in pair_alns[locus]:
+                        self.write_alignment(sam_writer, aln, chr, start, end)
+                        pair_alns[locus].remove(entry)
+                    else:
+                        aln1, aln2 = self.split_alignment(aln, start)
+                        self.write_alignment(sam_writer, aln1, chr, start, end)
+                        self.write_alignment(sam_writer, aln2, chr, start, end)
+                        next_locus = (aln.next_reference_name, aln.next_reference_start)
+                        next_entry = (aln.query_name, not aln.is_read1)
+                        pair_alns[next_locus].add(next_entry)
+                # fetch and write pair reads
+                for aln in self.load_pair_alignments(bam_reader, pair_alns):
+                    self.write_alignment(sam_writer, aln, chr, start, end)
+
+    def write_alignment(self, sam_writer, aln, chr, start, end):
+        # Strip tags since they are unnecessary for succeeding processes
+        aln.set_tags([])
+        aln.query_name += '_{}:{}-{}'.format(chr, start, end+1)
+        sam_writer.write(aln)
+
+    def get_genes_at(self, chr, pos):
+        return set(annotationFunction.get_gene_info(chr, pos, self.ref_gene_tb, self.ens_gene_tb))
+
+    def load_splicing_junctions(self, splicing_reader):
+        for line in splicing_reader:
+            chr, start, end = line.split('\t')[:3]
+            start_genes = self.get_genes_at(chr, start)
+            end_genes = self.get_genes_at(chr, end)
+            if not(start_genes.intersection(end_genes)):
+                yield (chr, int(start)-1, int(end))
+
+    def load_alignments(self, bam_reader, chr, start, end):
+        def pairwise(it):
+            it1, it2 = itertools.tee(it)
+            next(it2, None)
+            return zip(it1, it2)
+
+        for aln in bam_reader.fetch(chr, start, start+1):
+            if not aln.is_proper_pair or aln.is_secondary: continue
+
+            for (_, curr_end), (next_start, _) in pairwise(aln.get_blocks()):
+                if curr_end == start and next_start == end:
+                    break
+            else:
+                continue
+            yield aln
+
+    def load_pair_alignments(self, bam_reader, pair_alns):
+        def chunked(alns, interval):
+            chrom = start = end = None
+            entries = set()
+            for (chr, pos), es in sorted(alns.items(), key=lambda x: x[0]):
+                if chrom == chr and pos <= start + interval:
+                    end = pos
+                    entries.update(es)
+                    continue
+                if entries:
+                    yield chrom, start, end, entries
+                chrom = chr
+                start = end = pos
+                entries = set(es)
+            if entries:
+                yield chrom, start, end, entries
+
+        for (chr, start, end, entries) in chunked(pair_alns, self.interval):
+            for aln in bam_reader.fetch(chr, start, end + 1):
+                locus = (aln.reference_name, aln.reference_start)
+                entry = (aln.query_name, aln.is_read1)
+                if not aln.is_secondary and entry in entries:
+                    yield aln
+
+    def split_alignment(self, aln, splicing_start):
+        elems = aln.cigartuples
+        blocks = aln.get_blocks()
+
+        elem_idx = 0
+        block_idx = 0
+        for op, len in elems:
+            if op == 0:
+                _, block_end = blocks[block_idx]
+                if block_end == splicing_start: break
+                block_idx += 1
+            elem_idx += 1
+
+        elems1 = elems[:elem_idx+1]
+        elems2 = list(itertools.dropwhile(lambda elem: elem[0] != 0, elems[elem_idx+1:]))
+        count_bases = lambda ops, elems: sum(e[1] for e in elems if e[0] in ops)
+
+        aln1 = copy.copy(aln)
+        aln2 = copy.copy(aln)
+        prim_flag, suppl_flag = aln.flag & ~0x100, aln.flag | 0x100
+        # Consider alignment with more matched bases as primary and the other as supplementary
+        if count_bases({0}, elems1) >= count_bases({0}, elems2):
+            aln1.flag, aln2.flag = prim_flag, suppl_flag
+        else:
+            aln1.flag, aln2.flag = suppl_flag, prim_flag
+        aln2.reference_start = blocks[block_idx+1][0]
+        # To reset CIGAR (and related properties), .cigartuples must be set to None first
+        aln1.cigartuples = None; aln1.cigartuples = elems1 + [(4, count_bases({0, 1, 4}, elems2))]
+        aln2.cigartuples = None; aln2.cigartuples = [(4, count_bases({0, 1, 4}, elems1))] + elems2
+        aln1.template_length, aln2.template_length = \
+            (aln.template_length, 0) if aln.template_length >= 0 else (0, aln.template_length)
+        return (aln1, aln2)

From f155e31ed0fbc344b8b358ecaf9c6ec6581feed3 Mon Sep 17 00:00:00 2001
From: Shogo Ohta
Date: Tue, 1 Dec 2020 17:50:58 +0900
Subject: [PATCH 2/2] Wire up new filter to fusionfusion driver

---
 fusionfusion/arg_parser.py |  4 +--
 fusionfusion/run.py        | 60 ++++++++++++++++++++++++++++++--------
 2 files changed, 50 insertions(+), 14 deletions(-)

diff --git a/fusionfusion/arg_parser.py b/fusionfusion/arg_parser.py
index 62adc64..ae8becd 100644
--- a/fusionfusion/arg_parser.py
+++ b/fusionfusion/arg_parser.py
@@ -13,8 +13,8 @@ def create_parser():
     parser.add_argument("--version", action = "version", version = "%(prog)s " + __version__)
     # parser.add_argument("--version", action = "version", version = "fusionfusion-0.5.0b1")
 
-    parser.add_argument("--star", metavar = "star.Chimeric.out.sam", default = None, type = str,
-                        help = "the path to the chimeric sam file by STAR")
+    parser.add_argument("--star", metavar = "star", default = None, type = str,
+                        help = "the path prefix of STAR result files")
 
     parser.add_argument("--ms2", metavar = "ms2.bam", default = None, type = str,
                         help = "the path to the bam file by Map splice2")
diff --git a/fusionfusion/run.py b/fusionfusion/run.py
index 5473af8..55be6b1 100644
--- a/fusionfusion/run.py
+++ b/fusionfusion/run.py
@@ -2,11 +2,13 @@
 
 from __future__ import print_function
 
-import sys, os, argparse, subprocess, shutil
+import sys, os, argparse, subprocess, shutil, pysam
+import annot_utils as annot
 from . import parseJunctionInfo
 from . import filterJunctionInfo
 from . import annotationFunction
 from . import utils
+from .short_range_chimera_filter import ShortRangeChimeraFilter
 
 # import config
 from .config import *
@@ -83,12 +85,12 @@ def cluster_filter_junction(inputFilePath, outputFilePrefix, args):
 
 def fusionfusion_main(args):
 
-    starBamFile = args.star
+    starFilePrefix = args.star
     ms2BamFile = args.ms2
     th2BamFile = args.th2
     output_dir = args.out
 
-    if starBamFile == None and ms2BamFile == None and th2BamFile == None:
+    if starFilePrefix == None and ms2BamFile == None and th2BamFile == None:
        print("At least one of --star, --ms2 or --th2 should be included", file = sys.stderr)
        sys.exit(1)
 
@@ -122,19 +124,53 @@ def fusionfusion_main(args):
 
     ####################
     # parsing chimeric reads from bam files
-    if starBamFile is not None:
-
-        parseJunctionInfo.parseJuncInfo_STAR(starBamFile, output_dir + "/star.chimeric.tmp.txt")
-
-
-        hOUT = open(output_dir + "/star.chimeric.txt", "w")
-        subprocess.check_call(["sort", "-k1,1", "-k2,2n", "-k4,4", "-k5,5n", output_dir + "/star.chimeric.tmp.txt"], stdout = hOUT)
-        hOUT.close()
+    if starFilePrefix is not None:
+
+        parseJunctionInfo.parseJuncInfo_STAR(starFilePrefix + ".Chimeric.out.sam",
+                                             output_dir + "/star.chimeric.tmp1.txt")
+
+        genome_id = args.genome_id
+        is_grc = args.grc
+        annot.gene.make_gene_info(output_dir + "/star.tmp.refGene.bed.gz", "refseq", genome_id, is_grc, False)
+        annot.gene.make_gene_info(output_dir + "/star.tmp.ensGene.bed.gz", "gencode", genome_id, is_grc, False)
+
+        with pysam.TabixFile(output_dir + "/star.tmp.refGene.bed.gz") as ref_gene_tb, \
+             pysam.TabixFile(output_dir + "/star.tmp.ensGene.bed.gz") as ens_gene_tb:
+            filt = ShortRangeChimeraFilter(ref_gene_tb, ens_gene_tb)
+            filt.run(
+                starFilePrefix + ".SJ.out.tab",
+                starFilePrefix + ".Aligned.sortedByCoord.out.bam",
+                output_dir + "/star.chimeric.tmp2.sam"
+            )
+        subprocess.check_call(["rm", output_dir + "/star.tmp.refGene.bed.gz"])
+        subprocess.check_call(["rm", output_dir + "/star.tmp.ensGene.bed.gz"])
+        subprocess.check_call(["rm", output_dir + "/star.tmp.refGene.bed.gz.tbi"])
+        subprocess.check_call(["rm", output_dir + "/star.tmp.ensGene.bed.gz.tbi"])
+
+        pysam.sort(
+            "-n",
+            "-o", output_dir + "/star.chimeric.tmp2.sorted.sam",
+            output_dir + "/star.chimeric.tmp2.sam"
+        )
+        parseJunctionInfo.parseJuncInfo_STAR(output_dir + "/star.chimeric.tmp2.sorted.sam",
+                                             output_dir + "/star.chimeric.tmp2.txt")
+
+        with open(output_dir + "/star.chimeric.txt", "w") as hOUT:
+            subprocess.check_call([
+                "bash", "-c",
+                "cat {input1} {input2} | sort -k1,1 -k2,2n -k4,4 -k5,5n".format(
+                    input1=output_dir + "/star.chimeric.tmp1.txt",
+                    input2=output_dir + "/star.chimeric.tmp2.txt"
+                )
+            ], stdout=hOUT)
 
         cluster_filter_junction(output_dir + "/star.chimeric.txt", output_dir + "/star", args)
 
         if debug_mode == False:
-            subprocess.check_call(["rm", output_dir + "/star.chimeric.tmp.txt"])
+            subprocess.check_call(["rm", output_dir + "/star.chimeric.tmp1.txt"])
+            subprocess.check_call(["rm", output_dir + "/star.chimeric.tmp2.txt"])
+            subprocess.check_call(["rm", output_dir + "/star.chimeric.tmp2.sam"])
+            subprocess.check_call(["rm", output_dir + "/star.chimeric.tmp2.sorted.sam"])
            subprocess.check_call(["rm", output_dir + "/star.chimeric.txt"])
 
    if ms2BamFile is not None:
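
A note for reviewers: the snippet below is a minimal sketch of how the new filter could be exercised on its own, outside the fusionfusion driver. It mirrors the wiring added to run.py above (annot_utils gene tables, STAR's SJ.out.tab and coordinate-sorted BAM). The "hg19" genome id, the GRC flag, and the sample.* / tmp.* file names are placeholders chosen for illustration, not part of the patch.

    import pysam
    import annot_utils as annot

    from fusionfusion.short_range_chimera_filter import ShortRangeChimeraFilter

    # Build the refGene/ensGene BED tables the filter uses for gene annotation
    # (genome id and GRC flag are assumed values here).
    annot.gene.make_gene_info("tmp.refGene.bed.gz", "refseq", "hg19", False, False)
    annot.gene.make_gene_info("tmp.ensGene.bed.gz", "gencode", "hg19", False, False)

    with pysam.TabixFile("tmp.refGene.bed.gz") as ref_gene_tb, \
         pysam.TabixFile("tmp.ensGene.bed.gz") as ens_gene_tb:
        filt = ShortRangeChimeraFilter(ref_gene_tb, ens_gene_tb)
        # SJ.out.tab and the coordinate-sorted, indexed BAM should come from the
        # same STAR run; the output is a SAM of re-split chimeric candidates.
        filt.run("sample.SJ.out.tab",
                 "sample.Aligned.sortedByCoord.out.bam",
                 "sample.short_range_chimera.sam")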