Skip to content

Commit

Permalink
Merge pull request #12 from chrovis-genomon/pr/short-range-chimera-fi…
Browse files Browse the repository at this point in the history
…lter

近距離の融合遺伝子を検出するためのフィルタの実装
  • Loading branch information
friend1ws authored Feb 3, 2021
2 parents 6d8af69 + f155e31 commit 731a714
Show file tree
Hide file tree
Showing 3 changed files with 175 additions and 14 deletions.
4 changes: 2 additions & 2 deletions fusionfusion/arg_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ def create_parser():
parser.add_argument("--version", action = "version", version = "%(prog)s " + __version__)
# parser.add_argument("--version", action = "version", version = "fusionfusion-0.5.0b1")

parser.add_argument("--star", metavar = "star.Chimeric.out.sam", default = None, type = str,
help = "the path to the chimeric sam file by STAR")
parser.add_argument("--star", metavar = "star", default = None, type = str,
help = "the path prefix of STAR result files")

parser.add_argument("--ms2", metavar = "ms2.bam", default = None, type = str,
help = "the path to the bam file by Map splice2")
Expand Down
60 changes: 48 additions & 12 deletions fusionfusion/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,13 @@

from __future__ import print_function

import sys, os, argparse, subprocess, shutil
import sys, os, argparse, subprocess, shutil, pysam
import annot_utils as annot
from . import parseJunctionInfo
from . import filterJunctionInfo
from . import annotationFunction
from . import utils
from .short_range_chimera_filter import ShortRangeChimeraFilter

# import config
from .config import *
Expand Down Expand Up @@ -94,12 +96,12 @@ def cluster_filter_junction(inputFilePath, outputFilePrefix, args):

def fusionfusion_main(args):

starBamFile = args.star
starFilePrefix = args.star
ms2BamFile = args.ms2
th2BamFile = args.th2
output_dir = args.out

if starBamFile == None and ms2BamFile == None and th2BamFile == None:
if starFilePrefix == None and ms2BamFile == None and th2BamFile == None:
print("At least one of --star, --ms2 or --th2 should be included", file = sys.stderr)
sys.exit(1)

Expand Down Expand Up @@ -133,19 +135,53 @@ def fusionfusion_main(args):

####################
# parsing chimeric reads from bam files
if starBamFile is not None:

parseJunctionInfo.parseJuncInfo_STAR(starBamFile, output_dir + "/star.chimeric.tmp.txt")


hOUT = open(output_dir + "/star.chimeric.txt", "w")
subprocess.check_call(["sort", "-k1,1", "-k2,2n", "-k4,4", "-k5,5n", output_dir + "/star.chimeric.tmp.txt"], stdout = hOUT)
hOUT.close()
if starFilePrefix is not None:

parseJunctionInfo.parseJuncInfo_STAR(starFilePrefix + ".Chimeric.out.sam",
output_dir + "/star.chimeric.tmp1.txt")

genome_id = args.genome_id
is_grc = args.grc
annot.gene.make_gene_info(output_dir + "/star.tmp.refGene.bed.gz", "refseq", genome_id, is_grc, False)
annot.gene.make_gene_info(output_dir + "/star.tmp.ensGene.bed.gz", "gencode", genome_id, is_grc, False)

with pysam.TabixFile(output_dir + "/star.tmp.refGene.bed.gz") as ref_gene_tb, \
pysam.TabixFile(output_dir + "/star.tmp.ensGene.bed.gz") as ens_gene_tb:
filt = ShortRangeChimeraFilter(ref_gene_tb, ens_gene_tb)
filt.run(
starFilePrefix + ".SJ.out.tab",
starFilePrefix + ".Aligned.sortedByCoord.out.bam",
output_dir + "/star.chimeric.tmp2.sam"
)
subprocess.check_call(["rm", output_dir + "/star.tmp.refGene.bed.gz"])
subprocess.check_call(["rm", output_dir + "/star.tmp.ensGene.bed.gz"])
subprocess.check_call(["rm", output_dir + "/star.tmp.refGene.bed.gz.tbi"])
subprocess.check_call(["rm", output_dir + "/star.tmp.ensGene.bed.gz.tbi"])

pysam.sort(
"-n",
"-o", output_dir + "/star.chimeric.tmp2.sorted.sam",
output_dir + "/star.chimeric.tmp2.sam"
)
parseJunctionInfo.parseJuncInfo_STAR(output_dir + "/star.chimeric.tmp2.sorted.sam",
output_dir + "/star.chimeric.tmp2.txt")

with open(output_dir + "/star.chimeric.txt", "w") as hOUT:
subprocess.check_call([
"bash", "-c",
"cat {input1} {input2} | sort -k1,1 -k2,2n -k4,4 -k5,5n".format(
input1=output_dir + "/star.chimeric.tmp1.txt",
input2=output_dir + "/star.chimeric.tmp2.txt"
)
], stdout=hOUT)

cluster_filter_junction(output_dir + "/star.chimeric.txt", output_dir + "/star", args)

if debug_mode == False:
subprocess.check_call(["rm", output_dir + "/star.chimeric.tmp.txt"])
subprocess.check_call(["rm", output_dir + "/star.chimeric.tmp1.txt"])
subprocess.check_call(["rm", output_dir + "/star.chimeric.tmp2.txt"])
subprocess.check_call(["rm", output_dir + "/star.chimeric.tmp2.sam"])
subprocess.check_call(["rm", output_dir + "/star.chimeric.tmp2.sorted.sam"])
subprocess.check_call(["rm", output_dir + "/star.chimeric.txt"])

if ms2BamFile is not None:
Expand Down
125 changes: 125 additions & 0 deletions fusionfusion/short_range_chimera_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
import collections
import copy
import itertools
import pysam

from . import annotationFunction

class ShortRangeChimeraFilter:
def __init__(self, ref_gene_tb, ens_gene_tb):
self.ref_gene_tb = ref_gene_tb
self.ens_gene_tb = ens_gene_tb
self.interval = 500

def run(self, splicing_file, bam_file, output_file):
with open(splicing_file) as splicing_reader, \
pysam.AlignmentFile(bam_file, 'r') as bam_reader, \
pysam.AlignmentFile(output_file, 'w', header=bam_reader.header) as sam_writer:
for chr, start, end in self.load_splicing_junctions(splicing_reader):
pair_alns = collections.defaultdict(set)
for aln in self.load_alignments(bam_reader, chr, start, end):
locus = (aln.reference_name, aln.reference_start)
entry = (aln.query_name, aln.is_read1)
if entry in pair_alns[locus]:
self.write_alignment(sam_writer, aln, chr, start, end)
pair_alns[locus].remove(entry)
else:
aln1, aln2 = self.split_alignment(aln, start)
self.write_alignment(sam_writer, aln1, chr, start, end)
self.write_alignment(sam_writer, aln2, chr, start, end)
next_locus = (aln.next_reference_name, aln.next_reference_start)
next_entry = (aln.query_name, not aln.is_read1)
pair_alns[next_locus].add(next_entry)
# fetch and write pair reads
for aln in self.load_pair_alignments(bam_reader, pair_alns):
self.write_alignment(sam_writer, aln, chr, start, end)

def write_alignment(self, sam_writer, aln, chr, start, end):
# Strip tags since they are unnecessary for succeeding processes
aln.set_tags([])
aln.query_name += '_{}:{}-{}'.format(chr, start, end+1)
sam_writer.write(aln)

def get_genes_at(self, chr, pos):
return set(annotationFunction.get_gene_info(chr, pos, self.ref_gene_tb, self.ens_gene_tb))

def load_splicing_junctions(self, splicing_reader):
for line in splicing_reader:
chr, start, end = line.split('\t')[:3]
start_genes = self.get_genes_at(chr, start)
end_genes = self.get_genes_at(chr, end)
if not(start_genes.intersection(end_genes)):
yield (chr, int(start)-1, int(end))

def load_alignments(self, bam_reader, chr, start, end):
def pairwise(it):
it1, it2 = itertools.tee(it)
next(it2, None)
return zip(it1, it2)

for aln in bam_reader.fetch(chr, start, start+1):
if not aln.is_proper_pair or aln.is_secondary: continue

for (_, curr_end), (next_start, _) in pairwise(aln.get_blocks()):
if curr_end == start and next_start == end:
break
else:
continue
yield aln

def load_pair_alignments(self, bam_reader, pair_alns):
def chunked(alns, interval):
chrom = start = end = None
entries = set()
for (chr, pos), es in sorted(alns.items(), key=lambda x: x[0]):
if chrom == chr and pos <= start + interval:
end = pos
entries.update(es)
continue
if entries:
yield chrom, start, end, entries
chrom = chr
start = end = pos
entries = set(es)
if entries:
yield chrom, start, end, entries

for (chr, start, end, entries) in chunked(pair_alns, self.interval):
for aln in bam_reader.fetch(chr, start, end + 1):
locus = (aln.reference_name, aln.reference_start)
entry = (aln.query_name, aln.is_read1)
if not aln.is_secondary and entry in entries:
yield aln

def split_alignment(self, aln, splicing_start):
elems = aln.cigartuples
blocks = aln.get_blocks()

elem_idx = 0
block_idx = 0
for op, len in elems:
if op == 0:
_, block_end = blocks[block_idx]
if block_end == splicing_start: break
block_idx += 1
elem_idx += 1

elems1 = elems[:elem_idx+1]
elems2 = list(itertools.dropwhile(lambda elem: elem[0] != 0, elems[elem_idx+1:]))
count_bases = lambda ops, elems: sum(e[1] for e in elems if e[0] in ops)

aln1 = copy.copy(aln)
aln2 = copy.copy(aln)
prim_flag, suppl_flag = aln.flag & ~0x100, aln.flag | 0x100
# Consider alignment with more matched bases as primary and the other as supplementary
if count_bases({0}, elems1) >= count_bases({0}, elems2):
aln1.flag, aln2.flag = prim_flag, suppl_flag
else:
aln1.flag, aln2.flag = suppl_flag, prim_flag
aln2.reference_start = blocks[block_idx+1][0]
# To reset CIGAR (and related properties), .cigartuples must be set to None first
aln1.cigartuples = None; aln1.cigartuples = elems1 + [(4, count_bases({0, 1, 4}, elems2))]
aln2.cigartuples = None; aln2.cigartuples = [(4, count_bases({0, 1, 4}, elems1))] + elems2
aln1.template_length, aln2.template_length = \
(aln.template_length, 0) if aln.template_length >= 0 else (0, aln.template_length)
return (aln1, aln2)

0 comments on commit 731a714

Please sign in to comment.