-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathctat-vif
executable file
·320 lines (232 loc) · 11.2 KB
/
ctat-vif
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import subprocess
import tempfile
import logging
import sys
VERSION = "1.5.0"
if sys.version_info[0] < 3:
print("This script requires Python 3")
exit(1)
logging.basicConfig(level=logging.INFO,
format='%(asctime)s : %(levelname)s : %(message)s',
datefmt='%H:%M:%S')
logger = logging.getLogger(__name__)
import argparse
import json
utildir = os.path.join(os.path.dirname(__file__), "util")
sys.path.insert(0, utildir)
def is_in_path(path):
try:
if (
subprocess.run(
["which", path], stdout=subprocess.PIPE, stderr=subprocess.STDOUT
).returncode
!= 0
):
return False
except:
return False
return True
def main():
parser = argparse.ArgumentParser(
description="Virus insertion finder"
)
parser.add_argument("--sample_id", type=str, help="sample id", required=True)
parser.add_argument("--left", type=str, required=False, help="Path to one of the two paired samples")
parser.add_argument("--right", help="Path to one of the two paired samples")
parser.add_argument(
"--genome_lib_dir",
default=os.environ.get("CTAT_GENOME_LIB"),
help="Genome lib directory - see http://FusionFilter.github.io for details. Uses env variable CTAT_GENOME_LIB as default",
)
parser.add_argument("--viral_fasta", help="Viral fasta (default: under --genome_lib_dir, VIF/virus_db.fasta")
parser.add_argument("--star_index_plus_virus", type=str, help="STAR index containing both host and viral genomes, default: under --genome_lib_dir, VIF/hg_plus_viraldb.fasta.star.idx")
parser.add_argument("--cpu", type=int, help="Number of CPUs for multi-threaded steps")
parser.add_argument(
"--min_reads", type=int, help="Filter insertion sites candidates that do not have at least 'min_reads' for use with chimeric contig construction (2nd phase)",
default=5
)
parser.add_argument("--star_init_two_pass_mode", help="STAR init 2-pass mapping mode", choices=['Basic', 'None'],
default='None')
parser.add_argument("--star_validate_two_pass_mode", help="STAR validate 2-pass mapping mode", choices=['Basic', 'None'],
default='None')
parser.add_argument(
"--no_remove_duplicates", action="store_true", help="do NOT remove duplicates"
)
parser.add_argument(
"--no_clean_reads", action='store_true', help='do NOT clean reads (trimmomatic and polyA-stripper)'
)
parser.add_argument(
"--star_init_only", action="store_true", help="Only perform initial STAR chimeric junction analysis"
)
parser.add_argument(
"--cromwell", help="Optional path to cromwell jar file",
)
parser.add_argument(
"-O",
"--outputdir",
dest="outputdir",
required=True,
type=str,
help="Output directory",
)
parser.add_argument("--version", action='store_true', help="show version tag: {}".format(VERSION))
parser.add_argument("--max_hits", default=50, type=int, help='maximum number of multimapping allowed for chimeric reads (1=unique chim reads only)')
parser.add_argument("--min_flank_frac_uniq", default=0.0, type=float, help="minimum fraction of unique kmers in flanking sequence (k=5)")
## kickstart to phase 2
parser.add_argument("--kickstart_p2_hg_unmapped_left_fq", type=str, default=None, help="kickstart to phase 2 using unmapped left /1 reads ")
parser.add_argument("--kickstart_p2_hg_unmapped_right_fq", type=str, default=None, help="kickstart to phase 2 using unmapped right /2 reads ")
## kickstart to phase 3
parser.add_argument("--kickstart_p3_human_virus_chimJ", type=str, default=None, help="kickstart to phase 3 using human/virus chimeric reads file; requires p2 inputs")
parser.add_argument("--kickstart_p3_human_virus_bam", type=str, default=None, help="kickstart to phase 3 using human & virus alignments bam file; requires p2 inputs")
parser.add_argument("--kickstart_p3_human_virus_bai", type=str, default=None, help="kicstart to phase 3 using human & virus bam indexed bai file; requires p2 inputs")
args = parser.parse_args()
if args.version:
exit(VERSION)
if not args.genome_lib_dir:
exit("\n\tError, must specify --genome_lib_dir or have env var CTAT_GENOME_LIB set accordingly")
left = args.left
right = args.right
ctat_genome_lib_path = os.path.abspath(args.genome_lib_dir)
if (not left) and (not args.kickstart_p2_hg_unmapped_left_fq):
exit("Error, need to specify --left or --kickstart_p2_hg_unmapped_left_fq")
if args.kickstart_p3_human_virus_chimJ and not (args.kickstart_p3_human_virus_bam and args.kickstart_p3_human_virus_bai):
exit("Must specify kickstart_p3_human_virus_bam and kickstart_p3_human_virus_bam when kickstart_p3_human_virus_chimJ is specified")
if args.kickstart_p3_human_virus_chimJ and not args.kickstart_p2_hg_unmapped_left_fq:
exit("Must specify kickstart_p2_hg_unmapped_left_fq when kickstart_p3_human_virus_chimJ is used")
viral_fasta = args.viral_fasta
if not viral_fasta:
viral_fasta = os.path.join(ctat_genome_lib_path, "VIF/virus_db.fasta")
if not os.path.exists(viral_fasta):
exit("Error, cannot locate {}".format(viral_fasta))
star_index_plus_virus = args.star_index_plus_virus
if not star_index_plus_virus:
star_index_plus_virus = os.path.join(ctat_genome_lib_path, "VIF/hg_plus_viraldb.fasta.star.idx")
if not os.path.exists(star_index_plus_virus):
exit("Error, cannot locate {}".format(star_index_plus_virus))
cromwell = args.cromwell
remove_duplicates = not args.no_remove_duplicates
star_init_only = args.star_init_only
min_reads = args.min_reads
max_hits = args.max_hits
min_flank_frac_uniq = args.min_flank_frac_uniq
cpu = args.cpu
output_directory = os.path.abspath(args.outputdir)
star_validate_two_pass_mode = args.star_validate_two_pass_mode
star_init_two_pass_mode = args.star_init_two_pass_mode
if ctat_genome_lib_path is None or not os.path.exists(ctat_genome_lib_path):
exit("Missing path to CTAT_GENOME_LIB in $CTAT_GENOME_LIB.")
if left is not None:
left = os.path.abspath(left)
if right is not None:
right = os.path.abspath(right)
base_dir = os.path.dirname(os.path.realpath(__file__))
util_dir = os.path.join(base_dir, "util")
ctat_genome_lib_path = os.path.abspath(ctat_genome_lib_path)
if not os.path.exists(ctat_genome_lib_path):
exit("CTAT_GENOME_LIB at {} not found".format(ctat_genome_lib_path))
gtf = os.path.join(ctat_genome_lib_path, "ref_annot.gtf")
ref_fasta = os.path.join(ctat_genome_lib_path, "ref_genome.fa")
if not os.path.exists(gtf):
exit('{} not found'.format(gtf))
if not os.path.exists(ref_fasta):
exit('{} not found'.format(ref_fasta))
input_json = {}
input_json['sample_id'] = args.sample_id
input_json["ref_genome_gtf"] = gtf
input_json["ref_genome_fasta"] = ref_fasta
input_json['autodetect_cpu'] = False
input_json['star_cpu'] = cpu
input_json['star_init_two_pass_mode'] = star_init_two_pass_mode
input_json['star_validate_two_pass_mode'] = star_validate_two_pass_mode
star_index_human_only = ref_fasta + ".star.idx"
input_json['star_index_human_only_dirpath'] = star_index_human_only
input_json['star_index_human_plus_virus_dirpath'] = star_index_plus_virus
input_json['viral_fasta'] = viral_fasta
input_json['clean_reads'] = not args.no_clean_reads
if left is not None:
input_json["left"] = left
if right is not None:
input_json["right"] = right
input_json['min_reads'] = min_reads
input_json['max_hits'] = max_hits
input_json['min_flank_frac_uniq'] = min_flank_frac_uniq
input_json['star_init_only'] = star_init_only
input_json["remove_duplicates"] = remove_duplicates
if cpu is None:
cpu = min(16, os.cpu_count())
input_json["star_cpu"] = cpu
input_json["util_dir"] = util_dir
input_json["docker"] = "a/b" # hack to make cromwell happy
## kicstart options:
if args.kickstart_p2_hg_unmapped_left_fq:
input_json["hg_unmapped_left_fq"] = args.kickstart_p2_hg_unmapped_left_fq
if args.kickstart_p2_hg_unmapped_right_fq:
input_json["hg_unmapped_right_fq"] = args.kickstart_p2_hg_unmapped_right_fq
if args.kickstart_p3_human_virus_chimJ:
input_json["human_virus_chimJ"] = args.kickstart_p3_human_virus_chimJ
input_json["human_virus_bam"] = args.kickstart_p3_human_virus_bam
input_json["human_virus_bai"] = args.kickstart_p3_human_virus_bai
#_, json_file = tempfile.mkstemp(suffix=".json")
#_, meta_file = tempfile.mkstemp(suffix=".json")
json_file = os.path.abspath("ctat-vif.input.json")
meta_file = os.path.abspath("ctat-vif.meta.json")
wdl_dir = os.path.join(base_dir, "WDL")
cromwell_jar = None
if cromwell is None:
for f in os.listdir(wdl_dir):
if f.startswith('cromwell-') and f.endswith('.jar'):
cromwell_jar = os.path.join(wdl_dir, f)
break
else:
cromwell_jar = os.path.abspath(cromwell)
if cromwell_jar is None or not os.path.exists(cromwell_jar):
exit(
"Cromwell jar file not found. Please run download_cromwell.sh to download."
)
cromwell_conf = os.path.join(wdl_dir, "cromwell.conf")
if not os.path.exists(cromwell_conf):
exit("Cromwell configuration not found")
if output_directory is not None and not os.path.exists(output_directory):
os.makedirs(output_directory)
input_json_with_prefix = {}
for key in input_json:
input_json_with_prefix["ctat_vif.{}".format(key)] = input_json[key]
with open(json_file, "wt") as out:
out.write(json.dumps(input_json_with_prefix, indent=4)
)
cromwell_args = [
"java",
"-Dconfig.file={}".format(cromwell_conf),
"-jar",
cromwell_jar,
"run",
"-i",
json_file,
'-m',
meta_file,
os.path.join(wdl_dir, "ctat_VIF.wdl"),
]
logger.info("CMD: {}".format(" ".join(cromwell_args)))
subprocess.run(cromwell_args, cwd=output_directory)
#os.remove(json_file)
with open(meta_file, 'rt', encoding='UTF-8') as f:
metadata = json.load(f)
#os.remove(meta_file)
if 'outputs' in metadata:
outputs = metadata['outputs']
for key in outputs:
path = outputs[key]
if path is not None and isinstance(path, str) and os.path.exists(path):
name = os.path.basename(path)
dst = name if output_directory is None else os.path.join(output_directory, name)
if os.path.exists(dst):
os.remove(dst)
os.symlink(path, dst)
if __name__ == "__main__":
if "--version" in sys.argv:
print(f"ctat-vif v{VERSION}")
sys.exit(0)
main()