Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

近距離の融合遺伝子を検出するためのフィルタの実装(修正版) #14

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,14 @@ For STAR, our software uses the chimeric sam file
{output_prefix}.Chimeric.out.sam
```

Optionally, you can specify the following files to enable further analysis (Note that
the aligned bam must be sorted by coordinate):

```
{output_prefix}.SJ.out.tab
{output_prefix}.Aligned.sortedByCoord.out.bam
```

For MapSplice2, our software uses the read alignment file
```
alignments.sam (bam)
Expand Down Expand Up @@ -73,6 +81,8 @@ fusionfusion [-h] [--version] [--star star.Chimeric.out.sam]
[--min_allowed_contig_match_diff MIN_ALLOWED_CONTIG_MATCH_DIFF]
[--check_contig_size_other_breakpoint CHECK_CONTIG_SIZE_OTHER_BREAKPOINT]
[--filter_same_gene]
[--star_sj_tab star.SJ.out.tab]
[--star_aligned_bam star.Aligned.sortedByCoord.out.bam]
```
At least one of --star, --ms2, --th2 arguments should be specified.
The arguments of --out and --reference_genome are mandatory.
Expand Down
27 changes: 26 additions & 1 deletion fusionfusion/annotationFunction.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#! /usr/bin/env python

from __future__ import print_function
import sys, pysam, os, subprocess
import sys, pysam, os, subprocess, re
import annot_utils.gene
import annot_utils.exon

Expand Down Expand Up @@ -271,4 +271,29 @@ def merge_fusion_result(input_dir, output_file_path):
hOUT.close()

subprocess.check_call(["rm", "-rf", output_file_path + ".unsorted.tmp"])

trace_files = list(filter(os.path.exists, [
input_dir + "/star.chimeric.trace.txt",
# input_dir + "/ms2.chimeric.trace.txt",
# input_dir + "/th2.chimeric.trace.txt"
]))
if trace_files:
traced_file_path = re.sub(r'(?:\.txt)?$', '.traced.txt', output_file_path, count=1)
emit_traced(output_file_path, trace_files, traced_file_path)


def emit_traced(input_file, trace_files, output_file):
trace = {}
for trace_file in trace_files:
with open(trace_file) as t:
for line in t:
cols = line.rstrip().split('\t')
if cols[7] != '---':
trace[tuple(cols[0:7])] = cols[7]

with open(input_file) as r, open(output_file, 'w') as w:
for line in r:
cols = line.rstrip().split('\t')
source = trace.get(tuple(cols[0:7]), '---')
cols.append(source)
print('\t'.join(cols), file=w)
7 changes: 6 additions & 1 deletion fusionfusion/arg_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,12 @@ def create_parser():
parser.add_argument("--star", metavar = "star.Chimeric.out.sam", default = None, type = str,
help = "the path to the chimeric sam file by STAR")

parser.add_argument("--star_sj_tab", metavar="star.SJ.out.tab", default=None, type=str,
help="the path to the SJ.out.tab file by STAR")

parser.add_argument("--star_aligned_bam", metavar="star.Aligned.sortedByCoord.out.bam", default=None, type=str,
help="the path to the aligned bam file by STAR (which must be sorted by coordinate)")

parser.add_argument("--ms2", metavar = "ms2.bam", default = None, type = str,
help = "the path to the bam file by Map splice2")

Expand Down Expand Up @@ -87,4 +93,3 @@ def create_parser():


return parser

22 changes: 18 additions & 4 deletions fusionfusion/filterJunctionInfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,13 +131,15 @@ def filterPoolControl(input_file, output_file, control_file):
hout.close()


def extractSplicingPattern(inputFilePath, outputFilePath):
def extractSplicingPattern(inputFilePath, outputFilePath, traceFilePath):

# reference_genome = config.param_conf.get("alignment", "reference_genome")
reference_genome = param_conf.reference_genome

hIN = open(inputFilePath, 'r')
hOUT = open(outputFilePath, 'w')
if traceFilePath:
hTRACE = open(traceFilePath, 'w')

for line in hIN:
F = line.rstrip('\n').split('\t')
Expand Down Expand Up @@ -477,11 +479,24 @@ def extractSplicingPattern(inputFilePath, outputFilePath):
if F[2] == "+": contigSeq1 = seq_utils.reverseComplement(contigSeq1)
if F[5] == "+": contigSeq2 = seq_utils.reverseComplement(contigSeq2)

print('\t'.join(['\t'.join(F[0:6]), str(inSeq), str(len(F[8].split(';'))), str(contig1), str(contig2), contigSeq1, contigSeq2]), file = hOUT)

common_cols = F[0:6] + [str(inSeq)]
out_cols = common_cols + [
str(len(F[8].split(';'))), str(contig1), str(contig2),
contigSeq1, contigSeq2
]
print('\t'.join(out_cols), file=hOUT)
if traceFilePath:
sources = set()
for qname in F[7].split(';'):
m = re.search('@([^@]+)$', qname)
if m: sources.add(m.group(1))
trace_cols = common_cols + [','.join(sorted(sources, reverse=True)) if sources else '---']
print('\t'.join(trace_cols), file=hTRACE)

hIN.close()
hOUT.close()
if traceFilePath:
hTRACE.close()



Expand Down Expand Up @@ -625,4 +640,3 @@ def filterContigCheck(inputFilePath, outputFilePath, checkMatchFile):

hIN.close()
hOUT.close()

14 changes: 9 additions & 5 deletions fusionfusion/parseJunctionInfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,7 +343,7 @@ def parseJuncInfo_th2(inputFilePath, outputFilePath):
hOUT.close()


def getFusInfo_STAR(juncLine):
def getFusInfo_STAR(juncLine, source=None):

# abnormal_insert_size = config.param_conf.getint("parse_condition", "abnormal_insert_size")
# min_major_clip_size = config.param_conf.getint("parse_condition", "min_major_clip_size")
Expand Down Expand Up @@ -409,7 +409,11 @@ def getFusInfo_STAR(juncLine):
coverRegion_primary = cigar_utils.getCoverRegion(F[2], F[3], F[5])
readLength_primary = len(F[9])
endPos_primary = cigar_utils.getEndPos(pos_primary, F[5])
readID_primary = F[0] + ("/1" if flags[6] == "1" else "/2")
readID_primary = "{qname}/{read}{suffix}".format(
qname=F[0],
read=(1 if flags[6] == "1" else 2),
suffix=("@" + source if source else "")
)

tempMatch = cigarSRe_right.search(F[5])
if tempMatch is not None: right_clipping_primary = int(tempMatch.group(1))
Expand Down Expand Up @@ -574,7 +578,7 @@ def getFusInfo_STAR(juncLine):



def parseJuncInfo_STAR(inputFilePath, outputFilePath):
def parseJuncInfo_STAR(inputFilePath, outputFilePath, source=None):


hIN = open(inputFilePath, 'r')
Expand All @@ -589,7 +593,7 @@ def parseJuncInfo_STAR(inputFilePath, outputFilePath):

if tempID != F[0]:
if tempID != "" and len(tempLine) == 3:
tempFusInfo = getFusInfo_STAR(tempLine)
tempFusInfo = getFusInfo_STAR(tempLine, source)
if tempFusInfo is not None:
print(tempFusInfo, file = hOUT)

Expand All @@ -602,7 +606,7 @@ def parseJuncInfo_STAR(inputFilePath, outputFilePath):


if tempID != "" and len(tempLine) == 3:
tempFusInfo = getFusInfo_STAR(tempLine)
tempFusInfo = getFusInfo_STAR(tempLine, source)
if tempFusInfo is not None:
print(tempFusInfo, file = hOUT)

Expand Down
97 changes: 80 additions & 17 deletions fusionfusion/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,18 @@

from __future__ import print_function

import sys, os, argparse, subprocess, shutil
import sys, os, argparse, subprocess, shutil, pysam
import annot_utils as annot
from . import parseJunctionInfo
from . import filterJunctionInfo
from . import annotationFunction
from . import utils
from .short_range_chimera_filter import ShortRangeChimeraFilter

# import config
from .config import *

def cluster_filter_junction(inputFilePath, outputFilePrefix, args):
def cluster_filter_junction(inputFilePath, outputFilePrefix, args, generates_trace=False):

# debug_mode = config.param_conf.getboolean("debug", "debug_mode")
debug_mode = param_conf.debug
Expand All @@ -30,8 +32,10 @@ def cluster_filter_junction(inputFilePath, outputFilePrefix, args):
shutil.copyfile(outputFilePrefix + ".chimeric.clustered.filt1.txt",
outputFilePrefix + ".chimeric.clustered.filt2.txt")

traceFilePath = outputFilePrefix + ".chimeric.trace.txt" if generates_trace else None
filterJunctionInfo.extractSplicingPattern(outputFilePrefix + ".chimeric.clustered.filt2.txt",
outputFilePrefix + ".chimeric.clustered.splicing.txt")
outputFilePrefix + ".chimeric.clustered.splicing.txt",
traceFilePath)

if args.no_blat:
annotationFunction.filterAndAnnotation(
Expand Down Expand Up @@ -134,19 +138,70 @@ def fusionfusion_main(args):
####################
# parsing chimeric reads from bam files
if starBamFile is not None:

parseJunctionInfo.parseJuncInfo_STAR(starBamFile, output_dir + "/star.chimeric.tmp.txt")


hOUT = open(output_dir + "/star.chimeric.txt", "w")
subprocess.check_call(["sort", "-k1,1", "-k2,2n", "-k4,4", "-k5,5n", output_dir + "/star.chimeric.tmp.txt"], stdout = hOUT)
hOUT.close()

cluster_filter_junction(output_dir + "/star.chimeric.txt", output_dir + "/star", args)

if debug_mode == False:
subprocess.check_call(["rm", output_dir + "/star.chimeric.tmp.txt"])
subprocess.check_call(["rm", output_dir + "/star.chimeric.txt"])
generates_trace = False
starSjTab = args.star_sj_tab
starAlignedBam = args.star_aligned_bam
if starSjTab is None or starAlignedBam is None:
if starSjTab is not None or starAlignedBam is not None:
msg = "To enable further analysis using STAR's SJ.out.tab, both --star_sj_tab and --star_aligned_bam options must be specified."
print(msg, file=sys.stderr)

parseJunctionInfo.parseJuncInfo_STAR(starBamFile, output_dir + "/star.chimeric.tmp.txt")

with open(output_dir + "/star.chimeric.txt", "w") as hOUT:
subprocess.check_call([
"sort", "-k1,1", "-k2,2n", "-k4,4", "-k5,5n",
output_dir + "/star.chimeric.tmp.txt"
], stdout=hOUT)
else:
parseJunctionInfo.parseJuncInfo_STAR(starBamFile,
output_dir + "/star.chimeric.tmp1.txt",
source="Chimeric")

genome_id = args.genome_id
is_grc = args.grc
annot.gene.make_gene_info(output_dir + "/star.tmp.refGene.bed.gz", "refseq", genome_id, is_grc, False)
annot.gene.make_gene_info(output_dir + "/star.tmp.ensGene.bed.gz", "gencode", genome_id, is_grc, False)

with pysam.TabixFile(output_dir + "/star.tmp.refGene.bed.gz") as ref_gene_tb, \
pysam.TabixFile(output_dir + "/star.tmp.ensGene.bed.gz") as ens_gene_tb:
filt = ShortRangeChimeraFilter(ref_gene_tb, ens_gene_tb)
filt.run(starSjTab, starAlignedBam, output_dir + "/star.chimeric.tmp2.sam")
os.remove(output_dir + "/star.tmp.refGene.bed.gz")
os.remove(output_dir + "/star.tmp.ensGene.bed.gz")
os.remove(output_dir + "/star.tmp.refGene.bed.gz.tbi")
os.remove(output_dir + "/star.tmp.ensGene.bed.gz.tbi")

pysam.sort(
"-n",
"-o", output_dir + "/star.chimeric.tmp2.sorted.sam",
output_dir + "/star.chimeric.tmp2.sam"
)
parseJunctionInfo.parseJuncInfo_STAR(output_dir + "/star.chimeric.tmp2.sorted.sam",
output_dir + "/star.chimeric.tmp2.txt",
source="SJ")

with open(output_dir + "/star.chimeric.txt", "w") as hOUT:
subprocess.check_call([
"sh", "-c",
"cat {input1} {input2} | sort -k1,1 -k2,2n -k4,4 -k5,5n".format(
input1=output_dir + "/star.chimeric.tmp1.txt",
input2=output_dir + "/star.chimeric.tmp2.txt"
)
], stdout=hOUT)

generates_trace = True

cluster_filter_junction(output_dir + "/star.chimeric.txt", output_dir + "/star",
args, generates_trace=generates_trace)

if not debug_mode:
for fname in ["star.chimeric.tmp.txt", "star.chimeric.tmp1.txt",
"star.chimeric.tmp2.txt", "star.chimeric.tmp2.sam",
"star.chimeric.tmp2.sorted.sam", "star.chimeric.txt"]:
path = output_dir + '/' + fname
if os.path.exists(path):
os.remove(path)

if ms2BamFile is not None:

Expand Down Expand Up @@ -196,4 +251,12 @@ def fusionfusion_main(args):
annotationFunction.merge_fusion_result(output_dir,
output_dir + "/fusion_fusion.result.txt")


if not debug_mode:
for fname in [
"star.chimeric.trace.txt",
# "ms2.chimeric.trace.txt",
# "th2.chimeric.trace.txt"
]:
path = output_dir + '/' + fname
if os.path.exists(path):
os.remove(path)
10 changes: 9 additions & 1 deletion fusionfusion/seq_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,15 @@ def getSeq(reference, region_tuple):

seq = ""
for reg in region_tuple_sorted:
for line in pysam.faidx(reference, reg[0] + ":" + str(reg[1]) + "-" + str(reg[2]), split_lines=True):
lines = []
try:
lines = pysam.faidx(reference, reg[0] + ":" + str(reg[1]) + "-" + str(reg[2]), split_lines=True)
except pysam.utils.SamtoolsError:
# With latest versions of pysam, faidx raises an error if it gets fed with
# a contradictory range (such as `chr:start-end` where start > end).
# This try-except statement is for handling such cases.
pass
for line in lines:
if line[0] == ">": continue
seq += line

Expand Down
Loading