Skip to content

Commit

Permalink
Merge pull request #14 from chrovis-genomon/pr/short-range-chimera-fi…
Browse files Browse the repository at this point in the history
…lter

近距離の融合遺伝子を検出するためのフィルタの実装(修正版)
  • Loading branch information
friend1ws authored Jul 2, 2021
2 parents 3486e77 + b38d88b commit 7e1b115
Show file tree
Hide file tree
Showing 8 changed files with 283 additions and 29 deletions.
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,14 @@ For STAR, our software uses the chimeric sam file
{output_prefix}.Chimeric.out.sam
```

Optionally, you can specify the following files to enable further analysis (Note that
the aligned bam must be sorted by coordinate):

```
{output_prefix}.SJ.out.tab
{output_prefix}.Aligned.sortedByCoord.out.bam
```

For MapSplice2, our software uses the read alignment file
```
alignments.sam (bam)
Expand Down Expand Up @@ -73,6 +81,8 @@ fusionfusion [-h] [--version] [--star star.Chimeric.out.sam]
[--min_allowed_contig_match_diff MIN_ALLOWED_CONTIG_MATCH_DIFF]
[--check_contig_size_other_breakpoint CHECK_CONTIG_SIZE_OTHER_BREAKPOINT]
[--filter_same_gene]
[--star_sj_tab star.SJ.out.tab]
[--star_aligned_bam star.Aligned.sortedByCoord.out.bam]
```
At least one of --star, --ms2, --th2 arguments should be specified.
The arguments of --out and --reference_genome are mandatory.
Expand Down
27 changes: 26 additions & 1 deletion fusionfusion/annotationFunction.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#! /usr/bin/env python

from __future__ import print_function
import sys, pysam, os, subprocess
import sys, pysam, os, subprocess, re
import annot_utils.gene
import annot_utils.exon

Expand Down Expand Up @@ -271,4 +271,29 @@ def merge_fusion_result(input_dir, output_file_path):
hOUT.close()

subprocess.check_call(["rm", "-rf", output_file_path + ".unsorted.tmp"])

trace_files = list(filter(os.path.exists, [
input_dir + "/star.chimeric.trace.txt",
# input_dir + "/ms2.chimeric.trace.txt",
# input_dir + "/th2.chimeric.trace.txt"
]))
if trace_files:
traced_file_path = re.sub(r'(?:\.txt)?$', '.traced.txt', output_file_path, count=1)
emit_traced(output_file_path, trace_files, traced_file_path)


def emit_traced(input_file, trace_files, output_file):
trace = {}
for trace_file in trace_files:
with open(trace_file) as t:
for line in t:
cols = line.rstrip().split('\t')
if cols[7] != '---':
trace[tuple(cols[0:7])] = cols[7]

with open(input_file) as r, open(output_file, 'w') as w:
for line in r:
cols = line.rstrip().split('\t')
source = trace.get(tuple(cols[0:7]), '---')
cols.append(source)
print('\t'.join(cols), file=w)
7 changes: 6 additions & 1 deletion fusionfusion/arg_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,12 @@ def create_parser():
parser.add_argument("--star", metavar = "star.Chimeric.out.sam", default = None, type = str,
help = "the path to the chimeric sam file by STAR")

parser.add_argument("--star_sj_tab", metavar="star.SJ.out.tab", default=None, type=str,
help="the path to the SJ.out.tab file by STAR")

parser.add_argument("--star_aligned_bam", metavar="star.Aligned.sortedByCoord.out.bam", default=None, type=str,
help="the path to the aligned bam file by STAR (which must be sorted by coordinate)")

parser.add_argument("--ms2", metavar = "ms2.bam", default = None, type = str,
help = "the path to the bam file by Map splice2")

Expand Down Expand Up @@ -87,4 +93,3 @@ def create_parser():


return parser

22 changes: 18 additions & 4 deletions fusionfusion/filterJunctionInfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,13 +131,15 @@ def filterPoolControl(input_file, output_file, control_file):
hout.close()


def extractSplicingPattern(inputFilePath, outputFilePath):
def extractSplicingPattern(inputFilePath, outputFilePath, traceFilePath):

# reference_genome = config.param_conf.get("alignment", "reference_genome")
reference_genome = param_conf.reference_genome

hIN = open(inputFilePath, 'r')
hOUT = open(outputFilePath, 'w')
if traceFilePath:
hTRACE = open(traceFilePath, 'w')

for line in hIN:
F = line.rstrip('\n').split('\t')
Expand Down Expand Up @@ -477,11 +479,24 @@ def extractSplicingPattern(inputFilePath, outputFilePath):
if F[2] == "+": contigSeq1 = seq_utils.reverseComplement(contigSeq1)
if F[5] == "+": contigSeq2 = seq_utils.reverseComplement(contigSeq2)

print('\t'.join(['\t'.join(F[0:6]), str(inSeq), str(len(F[8].split(';'))), str(contig1), str(contig2), contigSeq1, contigSeq2]), file = hOUT)

common_cols = F[0:6] + [str(inSeq)]
out_cols = common_cols + [
str(len(F[8].split(';'))), str(contig1), str(contig2),
contigSeq1, contigSeq2
]
print('\t'.join(out_cols), file=hOUT)
if traceFilePath:
sources = set()
for qname in F[7].split(';'):
m = re.search('@([^@]+)$', qname)
if m: sources.add(m.group(1))
trace_cols = common_cols + [','.join(sorted(sources, reverse=True)) if sources else '---']
print('\t'.join(trace_cols), file=hTRACE)

hIN.close()
hOUT.close()
if traceFilePath:
hTRACE.close()



Expand Down Expand Up @@ -625,4 +640,3 @@ def filterContigCheck(inputFilePath, outputFilePath, checkMatchFile):

hIN.close()
hOUT.close()

14 changes: 9 additions & 5 deletions fusionfusion/parseJunctionInfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,7 +343,7 @@ def parseJuncInfo_th2(inputFilePath, outputFilePath):
hOUT.close()


def getFusInfo_STAR(juncLine):
def getFusInfo_STAR(juncLine, source=None):

# abnormal_insert_size = config.param_conf.getint("parse_condition", "abnormal_insert_size")
# min_major_clip_size = config.param_conf.getint("parse_condition", "min_major_clip_size")
Expand Down Expand Up @@ -409,7 +409,11 @@ def getFusInfo_STAR(juncLine):
coverRegion_primary = cigar_utils.getCoverRegion(F[2], F[3], F[5])
readLength_primary = len(F[9])
endPos_primary = cigar_utils.getEndPos(pos_primary, F[5])
readID_primary = F[0] + ("/1" if flags[6] == "1" else "/2")
readID_primary = "{qname}/{read}{suffix}".format(
qname=F[0],
read=(1 if flags[6] == "1" else 2),
suffix=("@" + source if source else "")
)

tempMatch = cigarSRe_right.search(F[5])
if tempMatch is not None: right_clipping_primary = int(tempMatch.group(1))
Expand Down Expand Up @@ -574,7 +578,7 @@ def getFusInfo_STAR(juncLine):



def parseJuncInfo_STAR(inputFilePath, outputFilePath):
def parseJuncInfo_STAR(inputFilePath, outputFilePath, source=None):


hIN = open(inputFilePath, 'r')
Expand All @@ -589,7 +593,7 @@ def parseJuncInfo_STAR(inputFilePath, outputFilePath):

if tempID != F[0]:
if tempID != "" and len(tempLine) == 3:
tempFusInfo = getFusInfo_STAR(tempLine)
tempFusInfo = getFusInfo_STAR(tempLine, source)
if tempFusInfo is not None:
print(tempFusInfo, file = hOUT)

Expand All @@ -602,7 +606,7 @@ def parseJuncInfo_STAR(inputFilePath, outputFilePath):


if tempID != "" and len(tempLine) == 3:
tempFusInfo = getFusInfo_STAR(tempLine)
tempFusInfo = getFusInfo_STAR(tempLine, source)
if tempFusInfo is not None:
print(tempFusInfo, file = hOUT)

Expand Down
97 changes: 80 additions & 17 deletions fusionfusion/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,18 @@

from __future__ import print_function

import sys, os, argparse, subprocess, shutil
import sys, os, argparse, subprocess, shutil, pysam
import annot_utils as annot
from . import parseJunctionInfo
from . import filterJunctionInfo
from . import annotationFunction
from . import utils
from .short_range_chimera_filter import ShortRangeChimeraFilter

# import config
from .config import *

def cluster_filter_junction(inputFilePath, outputFilePrefix, args):
def cluster_filter_junction(inputFilePath, outputFilePrefix, args, generates_trace=False):

# debug_mode = config.param_conf.getboolean("debug", "debug_mode")
debug_mode = param_conf.debug
Expand All @@ -30,8 +32,10 @@ def cluster_filter_junction(inputFilePath, outputFilePrefix, args):
shutil.copyfile(outputFilePrefix + ".chimeric.clustered.filt1.txt",
outputFilePrefix + ".chimeric.clustered.filt2.txt")

traceFilePath = outputFilePrefix + ".chimeric.trace.txt" if generates_trace else None
filterJunctionInfo.extractSplicingPattern(outputFilePrefix + ".chimeric.clustered.filt2.txt",
outputFilePrefix + ".chimeric.clustered.splicing.txt")
outputFilePrefix + ".chimeric.clustered.splicing.txt",
traceFilePath)

if args.no_blat:
annotationFunction.filterAndAnnotation(
Expand Down Expand Up @@ -134,19 +138,70 @@ def fusionfusion_main(args):
####################
# parsing chimeric reads from bam files
if starBamFile is not None:

parseJunctionInfo.parseJuncInfo_STAR(starBamFile, output_dir + "/star.chimeric.tmp.txt")


hOUT = open(output_dir + "/star.chimeric.txt", "w")
subprocess.check_call(["sort", "-k1,1", "-k2,2n", "-k4,4", "-k5,5n", output_dir + "/star.chimeric.tmp.txt"], stdout = hOUT)
hOUT.close()

cluster_filter_junction(output_dir + "/star.chimeric.txt", output_dir + "/star", args)

if debug_mode == False:
subprocess.check_call(["rm", output_dir + "/star.chimeric.tmp.txt"])
subprocess.check_call(["rm", output_dir + "/star.chimeric.txt"])
generates_trace = False
starSjTab = args.star_sj_tab
starAlignedBam = args.star_aligned_bam
if starSjTab is None or starAlignedBam is None:
if starSjTab is not None or starAlignedBam is not None:
msg = "To enable further analysis using STAR's SJ.out.tab, both --star_sj_tab and --star_aligned_bam options must be specified."
print(msg, file=sys.stderr)

parseJunctionInfo.parseJuncInfo_STAR(starBamFile, output_dir + "/star.chimeric.tmp.txt")

with open(output_dir + "/star.chimeric.txt", "w") as hOUT:
subprocess.check_call([
"sort", "-k1,1", "-k2,2n", "-k4,4", "-k5,5n",
output_dir + "/star.chimeric.tmp.txt"
], stdout=hOUT)
else:
parseJunctionInfo.parseJuncInfo_STAR(starBamFile,
output_dir + "/star.chimeric.tmp1.txt",
source="Chimeric")

genome_id = args.genome_id
is_grc = args.grc
annot.gene.make_gene_info(output_dir + "/star.tmp.refGene.bed.gz", "refseq", genome_id, is_grc, False)
annot.gene.make_gene_info(output_dir + "/star.tmp.ensGene.bed.gz", "gencode", genome_id, is_grc, False)

with pysam.TabixFile(output_dir + "/star.tmp.refGene.bed.gz") as ref_gene_tb, \
pysam.TabixFile(output_dir + "/star.tmp.ensGene.bed.gz") as ens_gene_tb:
filt = ShortRangeChimeraFilter(ref_gene_tb, ens_gene_tb)
filt.run(starSjTab, starAlignedBam, output_dir + "/star.chimeric.tmp2.sam")
os.remove(output_dir + "/star.tmp.refGene.bed.gz")
os.remove(output_dir + "/star.tmp.ensGene.bed.gz")
os.remove(output_dir + "/star.tmp.refGene.bed.gz.tbi")
os.remove(output_dir + "/star.tmp.ensGene.bed.gz.tbi")

pysam.sort(
"-n",
"-o", output_dir + "/star.chimeric.tmp2.sorted.sam",
output_dir + "/star.chimeric.tmp2.sam"
)
parseJunctionInfo.parseJuncInfo_STAR(output_dir + "/star.chimeric.tmp2.sorted.sam",
output_dir + "/star.chimeric.tmp2.txt",
source="SJ")

with open(output_dir + "/star.chimeric.txt", "w") as hOUT:
subprocess.check_call([
"sh", "-c",
"cat {input1} {input2} | sort -k1,1 -k2,2n -k4,4 -k5,5n".format(
input1=output_dir + "/star.chimeric.tmp1.txt",
input2=output_dir + "/star.chimeric.tmp2.txt"
)
], stdout=hOUT)

generates_trace = True

cluster_filter_junction(output_dir + "/star.chimeric.txt", output_dir + "/star",
args, generates_trace=generates_trace)

if not debug_mode:
for fname in ["star.chimeric.tmp.txt", "star.chimeric.tmp1.txt",
"star.chimeric.tmp2.txt", "star.chimeric.tmp2.sam",
"star.chimeric.tmp2.sorted.sam", "star.chimeric.txt"]:
path = output_dir + '/' + fname
if os.path.exists(path):
os.remove(path)

if ms2BamFile is not None:

Expand Down Expand Up @@ -196,4 +251,12 @@ def fusionfusion_main(args):
annotationFunction.merge_fusion_result(output_dir,
output_dir + "/fusion_fusion.result.txt")


if not debug_mode:
for fname in [
"star.chimeric.trace.txt",
# "ms2.chimeric.trace.txt",
# "th2.chimeric.trace.txt"
]:
path = output_dir + '/' + fname
if os.path.exists(path):
os.remove(path)
10 changes: 9 additions & 1 deletion fusionfusion/seq_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,15 @@ def getSeq(reference, region_tuple):

seq = ""
for reg in region_tuple_sorted:
for line in pysam.faidx(reference, reg[0] + ":" + str(reg[1]) + "-" + str(reg[2]), split_lines=True):
lines = []
try:
lines = pysam.faidx(reference, reg[0] + ":" + str(reg[1]) + "-" + str(reg[2]), split_lines=True)
except pysam.utils.SamtoolsError:
# With latest versions of pysam, faidx raises an error if it gets fed with
# a contradictory range (such as `chr:start-end` where start > end).
# This try-except statement is for handling such cases.
pass
for line in lines:
if line[0] == ">": continue
seq += line

Expand Down
Loading

0 comments on commit 7e1b115

Please sign in to comment.