Implement a filter to detect short-range fusion genes #12

Merged
4 changes: 2 additions & 2 deletions fusionfusion/arg_parser.py
@@ -13,8 +13,8 @@ def create_parser():
parser.add_argument("--version", action = "version", version = "%(prog)s " + __version__)
# parser.add_argument("--version", action = "version", version = "fusionfusion-0.5.0b1")

parser.add_argument("--star", metavar = "star.Chimeric.out.sam", default = None, type = str,
help = "the path to the chimeric sam file by STAR")
parser.add_argument("--star", metavar = "star", default = None, type = str,
help = "the path prefix of STAR result files")

parser.add_argument("--ms2", metavar = "ms2.bam", default = None, type = str,
help = "the path to the bam file by Map splice2")
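With this change, --star takes the STAR output prefix instead of the chimeric SAM path, and run.py derives the concrete file names from it. A minimal sketch of the derived paths (the prefix value below is only an illustration):

```python
# Illustration only: the STAR file names run.py builds from the --star prefix.
star_prefix = "output/sample1"  # hypothetical value passed via --star

chimeric_sam = star_prefix + ".Chimeric.out.sam"               # chimeric alignments
splice_junction_tab = star_prefix + ".SJ.out.tab"              # splice junction table
aligned_bam = star_prefix + ".Aligned.sortedByCoord.out.bam"   # coordinate-sorted alignments
```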
60 changes: 48 additions & 12 deletions fusionfusion/run.py
@@ -2,11 +2,13 @@

from __future__ import print_function

import sys, os, argparse, subprocess, shutil
import sys, os, argparse, subprocess, shutil, pysam
import annot_utils as annot
from . import parseJunctionInfo
from . import filterJunctionInfo
from . import annotationFunction
from . import utils
from .short_range_chimera_filter import ShortRangeChimeraFilter

# import config
from .config import *
@@ -83,12 +85,12 @@ def cluster_filter_junction(inputFilePath, outputFilePrefix, args):

def fusionfusion_main(args):

starBamFile = args.star
starFilePrefix = args.star
ms2BamFile = args.ms2
th2BamFile = args.th2
output_dir = args.out

if starBamFile == None and ms2BamFile == None and th2BamFile == None:
if starFilePrefix == None and ms2BamFile == None and th2BamFile == None:
print("At least one of --star, --ms2 or --th2 should be included", file = sys.stderr)
sys.exit(1)

@@ -122,19 +124,53 @@ def fusionfusion_main(args):

####################
# parsing chimeric reads from bam files
if starBamFile is not None:

parseJunctionInfo.parseJuncInfo_STAR(starBamFile, output_dir + "/star.chimeric.tmp.txt")


hOUT = open(output_dir + "/star.chimeric.txt", "w")
subprocess.check_call(["sort", "-k1,1", "-k2,2n", "-k4,4", "-k5,5n", output_dir + "/star.chimeric.tmp.txt"], stdout = hOUT)
hOUT.close()
if starFilePrefix is not None:

parseJunctionInfo.parseJuncInfo_STAR(starFilePrefix + ".Chimeric.out.sam",
output_dir + "/star.chimeric.tmp1.txt")

genome_id = args.genome_id
is_grc = args.grc
annot.gene.make_gene_info(output_dir + "/star.tmp.refGene.bed.gz", "refseq", genome_id, is_grc, False)
annot.gene.make_gene_info(output_dir + "/star.tmp.ensGene.bed.gz", "gencode", genome_id, is_grc, False)

with pysam.TabixFile(output_dir + "/star.tmp.refGene.bed.gz") as ref_gene_tb, \
pysam.TabixFile(output_dir + "/star.tmp.ensGene.bed.gz") as ens_gene_tb:
filt = ShortRangeChimeraFilter(ref_gene_tb, ens_gene_tb)
filt.run(
starFilePrefix + ".SJ.out.tab",
starFilePrefix + ".Aligned.sortedByCoord.out.bam",
output_dir + "/star.chimeric.tmp2.sam"
)
subprocess.check_call(["rm", output_dir + "/star.tmp.refGene.bed.gz"])
subprocess.check_call(["rm", output_dir + "/star.tmp.ensGene.bed.gz"])
subprocess.check_call(["rm", output_dir + "/star.tmp.refGene.bed.gz.tbi"])
subprocess.check_call(["rm", output_dir + "/star.tmp.ensGene.bed.gz.tbi"])

pysam.sort(
"-n",
"-o", output_dir + "/star.chimeric.tmp2.sorted.sam",
output_dir + "/star.chimeric.tmp2.sam"
)
parseJunctionInfo.parseJuncInfo_STAR(output_dir + "/star.chimeric.tmp2.sorted.sam",
output_dir + "/star.chimeric.tmp2.txt")

with open(output_dir + "/star.chimeric.txt", "w") as hOUT:
subprocess.check_call([
"bash", "-c",
"cat {input1} {input2} | sort -k1,1 -k2,2n -k4,4 -k5,5n".format(
input1=output_dir + "/star.chimeric.tmp1.txt",
input2=output_dir + "/star.chimeric.tmp2.txt"
)
], stdout=hOUT)

cluster_filter_junction(output_dir + "/star.chimeric.txt", output_dir + "/star", args)

if debug_mode == False:
subprocess.check_call(["rm", output_dir + "/star.chimeric.tmp.txt"])
subprocess.check_call(["rm", output_dir + "/star.chimeric.tmp1.txt"])
subprocess.check_call(["rm", output_dir + "/star.chimeric.tmp2.txt"])
subprocess.check_call(["rm", output_dir + "/star.chimeric.tmp2.sam"])
subprocess.check_call(["rm", output_dir + "/star.chimeric.tmp2.sorted.sam"])
subprocess.check_call(["rm", output_dir + "/star.chimeric.txt"])

if ms2BamFile is not None:
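The new STAR branch produces two intermediate tables: star.chimeric.tmp1.txt from the original Chimeric.out.sam and star.chimeric.tmp2.txt from the reads rescued by ShortRangeChimeraFilter. They are concatenated and sorted on the breakpoint columns (`sort -k1,1 -k2,2n -k4,4 -k5,5n`) before cluster_filter_junction runs. A rough in-memory equivalent of that merge step, assuming columns 1-2 and 4-5 hold the two breakpoint chromosomes and positions as the sort keys imply:

```python
# Sketch only: an in-memory equivalent of `cat tmp1 tmp2 | sort -k1,1 -k2,2n -k4,4 -k5,5n`.
# The column layout (chr1, pos1, ..., chr2, pos2, ...) is assumed from the sort keys.
def merge_chimeric_tables(path1, path2, out_path):
    rows = []
    for path in (path1, path2):
        with open(path) as f:
            rows.extend(line.rstrip("\n").split("\t") for line in f if line.strip())
    rows.sort(key=lambda r: (r[0], int(r[1]), r[3], int(r[4])))
    with open(out_path, "w") as out:
        for r in rows:
            out.write("\t".join(r) + "\n")
```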
125 changes: 125 additions & 0 deletions fusionfusion/short_range_chimera_filter.py
@@ -0,0 +1,125 @@
import collections
import copy
import itertools
import pysam

from . import annotationFunction

class ShortRangeChimeraFilter:
def __init__(self, ref_gene_tb, ens_gene_tb):
self.ref_gene_tb = ref_gene_tb
self.ens_gene_tb = ens_gene_tb
self.interval = 500

def run(self, splicing_file, bam_file, output_file):
with open(splicing_file) as splicing_reader, \
pysam.AlignmentFile(bam_file, 'r') as bam_reader, \
pysam.AlignmentFile(output_file, 'w', header=bam_reader.header) as sam_writer:
for chr, start, end in self.load_splicing_junctions(splicing_reader):
pair_alns = collections.defaultdict(set)
for aln in self.load_alignments(bam_reader, chr, start, end):
locus = (aln.reference_name, aln.reference_start)
entry = (aln.query_name, aln.is_read1)
if entry in pair_alns[locus]:
self.write_alignment(sam_writer, aln, chr, start, end)
pair_alns[locus].remove(entry)
else:
aln1, aln2 = self.split_alignment(aln, start)
self.write_alignment(sam_writer, aln1, chr, start, end)
self.write_alignment(sam_writer, aln2, chr, start, end)
next_locus = (aln.next_reference_name, aln.next_reference_start)
next_entry = (aln.query_name, not aln.is_read1)
pair_alns[next_locus].add(next_entry)
# fetch and write pair reads
for aln in self.load_pair_alignments(bam_reader, pair_alns):
self.write_alignment(sam_writer, aln, chr, start, end)

def write_alignment(self, sam_writer, aln, chr, start, end):
# Strip tags since they are unnecessary for succeeding processes
aln.set_tags([])
aln.query_name += '_{}:{}-{}'.format(chr, start, end+1)
sam_writer.write(aln)

def get_genes_at(self, chr, pos):
return set(annotationFunction.get_gene_info(chr, pos, self.ref_gene_tb, self.ens_gene_tb))

def load_splicing_junctions(self, splicing_reader):
for line in splicing_reader:
chr, start, end = line.split('\t')[:3]
start_genes = self.get_genes_at(chr, start)
end_genes = self.get_genes_at(chr, end)
if not(start_genes.intersection(end_genes)):
yield (chr, int(start)-1, int(end))

def load_alignments(self, bam_reader, chr, start, end):
def pairwise(it):
it1, it2 = itertools.tee(it)
next(it2, None)
return zip(it1, it2)

for aln in bam_reader.fetch(chr, start, start+1):
if not aln.is_proper_pair or aln.is_secondary: continue

for (_, curr_end), (next_start, _) in pairwise(aln.get_blocks()):
if curr_end == start and next_start == end:
break
else:
continue
yield aln

def load_pair_alignments(self, bam_reader, pair_alns):
def chunked(alns, interval):
chrom = start = end = None
entries = set()
for (chr, pos), es in sorted(alns.items(), key=lambda x: x[0]):
if chrom == chr and pos <= start + interval:
end = pos
entries.update(es)
continue
if entries:
yield chrom, start, end, entries
chrom = chr
start = end = pos
entries = set(es)
if entries:
yield chrom, start, end, entries

for (chr, start, end, entries) in chunked(pair_alns, self.interval):
for aln in bam_reader.fetch(chr, start, end + 1):
locus = (aln.reference_name, aln.reference_start)
entry = (aln.query_name, aln.is_read1)
if not aln.is_secondary and entry in entries:
yield aln

def split_alignment(self, aln, splicing_start):
elems = aln.cigartuples
blocks = aln.get_blocks()

elem_idx = 0
block_idx = 0
for op, len in elems:
if op == 0:
_, block_end = blocks[block_idx]
if block_end == splicing_start: break
block_idx += 1
elem_idx += 1

elems1 = elems[:elem_idx+1]
elems2 = list(itertools.dropwhile(lambda elem: elem[0] != 0, elems[elem_idx+1:]))
count_bases = lambda ops, elems: sum(e[1] for e in elems if e[0] in ops)

aln1 = copy.copy(aln)
aln2 = copy.copy(aln)
prim_flag, suppl_flag = aln.flag & ~0x100, aln.flag | 0x100
# Consider alignment with more matched bases as primary and the other as supplementary
if count_bases({0}, elems1) >= count_bases({0}, elems2):
aln1.flag, aln2.flag = prim_flag, suppl_flag
else:
aln1.flag, aln2.flag = suppl_flag, prim_flag
aln2.reference_start = blocks[block_idx+1][0]
# To reset CIGAR (and related properties), .cigartuples must be set to None first
aln1.cigartuples = None; aln1.cigartuples = elems1 + [(4, count_bases({0, 1, 4}, elems2))]
aln2.cigartuples = None; aln2.cigartuples = [(4, count_bases({0, 1, 4}, elems1))] + elems2
aln1.template_length, aln2.template_length = \
(aln.template_length, 0) if aln.template_length >= 0 else (0, aln.template_length)
return (aln1, aln2)
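For reference, a minimal standalone driver for ShortRangeChimeraFilter, mirroring how run.py wires it up; the file names and genome build below are hypothetical. The filter takes each junction in SJ.out.tab whose two ends fall in different genes, splits spliced alignments spanning that junction into a primary and a supplementary record (e.g. a 50M200N50M read becomes 50M50S at the left block and 50S50M at the right block), and writes them together with their mates to a SAM file.

```python
# Sketch (not part of the diff): driving the filter outside of run.py.
# File names and the genome build are assumptions for illustration.
import pysam
import annot_utils as annot
from fusionfusion.short_range_chimera_filter import ShortRangeChimeraFilter

genome_id, is_grc = "hg38", False   # would normally come from --genome_id / --grc
annot.gene.make_gene_info("refGene.bed.gz", "refseq", genome_id, is_grc, False)
annot.gene.make_gene_info("ensGene.bed.gz", "gencode", genome_id, is_grc, False)

with pysam.TabixFile("refGene.bed.gz") as ref_tb, \
     pysam.TabixFile("ensGene.bed.gz") as ens_tb:
    filt = ShortRangeChimeraFilter(ref_tb, ens_tb)
    filt.run("sample1.SJ.out.tab",
             "sample1.Aligned.sortedByCoord.out.bam",
             "sample1.short_range_chimeric.sam")
```

run.py then name-sorts this SAM and feeds it to parseJunctionInfo.parseJuncInfo_STAR, so the rescued reads flow through the same downstream clustering as the ordinary chimeric output.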