-
Notifications
You must be signed in to change notification settings - Fork 18
/
ctat_mutations
executable file
·569 lines (457 loc) · 18.7 KB
/
ctat_mutations
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import subprocess
import tempfile
import sys
VERSION = "ctat_mutations-4.3.0"
#VERSION = "ctat_mutations-BLEEDING_EDGE-postv4.3.0"
if sys.version_info[0] < 3:
print("This script requires Python 3")
exit(1)
import argparse
import json
import logging
logging.basicConfig(level=logging.INFO,
format='%(asctime)s : %(levelname)s : %(message)s',
datefmt='%H:%M:%S')
logger = logging.getLogger(__name__)
"""
_____ _______ _______ __ __ _ _ _______ _______ _____ ____ _ _
/ ____|__ __|/\|__ __| | \/ | | | |__ __|/\|__ __|_ _/ __ \| \ | |
| | | | / \ | | ______ | \ / | | | | | | / \ | | | || | | | \| |
| | | | / /\ \ | | |______| | |\/| | | | | | | / /\ \ | | | || | | | . ` |
| |____ | |/ ____ \| | | | | | |__| | | |/ ____ \| | _| || |__| | |\ |
\_____| |_/_/ \_\_| |_| |_|\____/ |_/_/ \_\_| |_____\____/|_| \_|
"""
def is_in_path(path):
try:
if (
subprocess.run(
["which", path], stdout=subprocess.PIPE, stderr=subprocess.STDOUT
).returncode
!= 0
):
return False
except:
return False
return True
def main():
parser = argparse.ArgumentParser(
description="Performs mutation detection in RNA-Seq"
)
parser.add_argument("--left", help="Path to one of the two paired samples")
parser.add_argument("--right", help="Path to one of the two paired samples")
parser.add_argument("--version", action='store_true', help="show version: {}".format(VERSION))
parser.add_argument(
"--bam",
help="Previously aligned bam file. When VCF is provided, the output from ApplyBQSR should be provided as the bam input.",
)
parser.add_argument(
"--vcf",
help="Previously generated vcf file to annotate and filter. When provided, the output from ApplyBQSR should be provided as the bam input.",
)
parser.add_argument("--sample_id", required=True, help="Sample id")
parser.add_argument(
"--extra_fasta", help="Extra genome to use in alignment and variant calling"
)
parser.add_argument(
"--genome_lib_dir",
default=os.environ.get("CTAT_GENOME_LIB"),
help="Genome lib directory - see http://FusionFilter.github.io for details. Uses env variable CTAT_GENOME_LIB as default",
)
parser.add_argument(
"--intervals",
default=None,
type=str,
help="intervals file defining target regions for variant calling"
)
parser.add_argument(
"--HC_xtra_args",
help="extra arguments to provide to GATK HaplotypeCaller",
default=None,
type=str
)
parser.add_argument(
"--variant_ready_bam",
action="store_true",
help="Skip initial read processing involving setting read groups, duplicate marking, splitNcigar, and base recalibration... go immediately on to calling variants",
)
parser.add_argument(
"--no_bqsr",
action="store_true",
help="Skip base quality score recalibration of bam files before variant calling.",
)
parser.add_argument(
"--filter_ready_vcf",
action="store_true",
help="a fully annotated vcf is provided, so move directly on to filtering/boosting"
)
parser.add_argument(
"--sequencing_platform",
default="ILLUMINA",
help="Sequencing platform used to generate the sample",
)
parser.add_argument(
"--boosting_method",
default="none",
choices=[
"none",
"AdaBoost",
"LR",
"NGBoost",
"RF",
"SGBoost",
"SVM_RBF",
"SVML",
"XGBoost",
],
help="Variant calling boosting method",
)
parser.add_argument(
"--boosting_alg_type",
default="classifier",
choices=["classifier", "regressor"],
help="Boosting algorithm type: classifier or regressor",
)
parser.add_argument(
"--boosting_score_threshold",
default=0.05,
type=float,
help="Minimum score threshold for boosted variant selection",
)
parser.add_argument(
"--boosting_attributes",
default="AC,ALT,BaseQRankSum,DJ,DP,ED,Entropy,ExcessHet,FS,Homopolymer,LEN,MLEAF,MMF,QUAL,REF,RPT,RS,ReadPosRankSum,SAO,SOR,TCR,TDM,VAF,VMMF",
help="Variant attributes on which to perform boosting",
)
parser.add_argument(
"--no_include_read_var_pos_annotations",
action="store_true",
help="Do not include variant position in read annotations (debugging/troubleshooting purposes only) Removes: VPR,VAF,VMMF,PctExtPos",
)
parser.add_argument(
"--cpu", type=int, help="Number of CPUs for multi-threaded steps (e.g. STAR, variant position annotation)",
)
parser.add_argument(
"--variant_scatter_count",
type=int,
default=6,
help="Number of parallel variant caller jobs",
)
parser.add_argument(
"--no_annotate_variants", action='store_true', help='skips all variant annotations, just focuses on getting the basic vcf file'
)
parser.add_argument(
"--no_cravat", action="store_true", help="Skips CRAVAT annotations, also skips partitioning of cancer-related variants given they rely on cravat annotations",
)
parser.add_argument(
"--no_filter_cancer_variants", action="store_true", help="no filtering of cancer variants to generate separate cancer.* outputs",
default= False
)
parser.add_argument(
"--no_mark_duplicates", action="store_true", help="Do not mark duplicates"
)
parser.add_argument(
"--gatk",
default="gatk",
help="Optional path to GATK when not running in a docker container",
)
parser.add_argument(
"--cromwell", help="Optional path to cromwell jar file",
)
parser.add_argument(
"-O",
"--outputdir",
dest="outputdir",
required=True,
help="Output directory",
)
parser.add_argument(
"--star_limitBAMsortRAM",
dest="star_limitBAMsortRAM",
required=False,
type=str,
help="value for STAR star_limitBAMsortRAM parameter"
)
parser.add_argument(
"--is_long_reads",
action='store_true',
default=False,
help="long reads (eg. PacBio) are input as --left ")
parser.add_argument("--is_single_cells",
action='store_true',
default=False,
help="single cell transcriptome reads used - note read name formatting required as: cellbarcode^UMI^readname to be recognized.")
# parser.add_argument(
# "--star_memory",
# help="Memory for STAR alignment step"
# )
# parser.add_argument(
# "--mark_duplicates_memory",
# help="Memory for mark duplicates step"
# )
# parser.add_argument(
# "--split_n_cigar_reads_memory",
# help="Memory for split n cigar reads step"
# )
args = parser.parse_args()
if args.version:
exit(VERSION)
left = args.left
right = args.right
bam = args.bam
vcf = args.vcf
ctat_genome_lib_path = args.genome_lib_dir
variant_ready_bam = args.variant_ready_bam
filter_ready_vcf = args.filter_ready_vcf
no_bqsr = args.no_bqsr
sequencing_platform = args.sequencing_platform
boosting_method = args.boosting_method
variant_scatter_count = args.variant_scatter_count
boosting_alg_type = args.boosting_alg_type
boosting_score_threshold = args.boosting_score_threshold
boosting_attributes = args.boosting_attributes
no_include_read_var_pos_annotations = args.no_include_read_var_pos_annotations
no_cravat = args.no_cravat
no_filter_cancer_variants = args.no_filter_cancer_variants
sample_id = args.sample_id
gatk_path = args.gatk
cromwell = args.cromwell
no_mark_duplicates = args.no_mark_duplicates
extra_fasta = args.extra_fasta
cpu = args.cpu
hc_xtra_args = args.HC_xtra_args
output_directory = args.outputdir
# mark_duplicates_memory = args.mark_duplicates_memory
# star_memory = args.star_memory
# split_n_cigar_reads_memory = args.split_n_cigar_reads_memory
if left is None and bam is None and not (vcf is not None and filter_ready_vcf):
exit("Either FASTQ files or a previously aligned bam or annotated vcf file must be provided.")
if ctat_genome_lib_path is None or not os.path.exists(ctat_genome_lib_path):
exit("Missing path to CTAT_GENOME_LIB in $CTAT_GENOME_LIB.")
if left is not None:
left = os.path.abspath(left)
if right is not None:
right = os.path.abspath(right)
if bam is not None:
bam = os.path.abspath(bam)
if vcf is not None:
vcf = os.path.abspath(vcf)
vcf_index = vcf + ".tbi"
if not os.path.exists(vcf_index):
exit("provided vcf must be indexed")
ctat_genome_lib_path = os.path.abspath(ctat_genome_lib_path)
if not os.path.exists(ctat_genome_lib_path):
exit("CTAT_GENOME_LIB at {} not found".format(ctat_genome_lib_path))
gtf = os.path.join(ctat_genome_lib_path, "ref_annot.gtf")
ref_dict = os.path.join(ctat_genome_lib_path, "ref_genome.dict")
ref_fasta = os.path.join(ctat_genome_lib_path, "ref_genome.fa")
ref_fasta_index = os.path.join(ctat_genome_lib_path, "ref_genome.fa.fai")
star_reference = os.path.join(ctat_genome_lib_path, "ref_genome.fa.star.idx")
mm2_genome_idx = os.path.join(ctat_genome_lib_path, "ref_genome.fa.mm2")
mm2_splice_bed = os.path.join(ctat_genome_lib_path, "ref_annot.gtf.mm2.splice.bed")
intervals_file = args.intervals
if intervals_file:
intervals_file = os.path.abspath(intervals_file)
ctat_mutation_lib_path = os.path.abspath(
os.path.join(ctat_genome_lib_path, "ctat_mutation_lib")
)
if not os.path.exists(ctat_mutation_lib_path):
exit("ctat_mutation_lib at {} not found".format(ctat_mutation_lib_path))
db_snp_vcf = os.path.join(ctat_mutation_lib_path, "dbsnp.vcf.gz")
db_snp_vcf_index = os.path.join(ctat_mutation_lib_path, "dbsnp.vcf.gz.tbi")
gnomad_vcf = os.path.join(ctat_mutation_lib_path, "gnomad-lite.vcf.gz")
gnomad_vcf_index = os.path.join(ctat_mutation_lib_path, "gnomad-lite.vcf.gz.csi")
rna_editing_vcf = os.path.join(ctat_mutation_lib_path, "RNAediting.library.vcf.gz")
rna_editing_vcf_index = os.path.join(
ctat_mutation_lib_path, "RNAediting.library.vcf.gz.csi"
)
cosmic_vcf = os.path.join(ctat_mutation_lib_path, "cosmic.vcf.gz")
cosmic_vcf_index = os.path.join(ctat_mutation_lib_path, "cosmic.vcf.gz.csi")
repeat_mask_bed = os.path.join(ctat_mutation_lib_path, "repeats_ucsc_gb.bed.gz")
ref_splice_adj_regions_bed = os.path.join(
ctat_mutation_lib_path, "ref_annot.splice_adj.bed.gz"
)
ref_bed = os.path.join(ctat_mutation_lib_path, "refGene.sort.bed.gz")
cravat_lib_directory = os.path.join(ctat_mutation_lib_path, "cravat")
properties = os.path.join(ctat_mutation_lib_path, "properties.json")
if not os.path.exists(properties):
exit(
"Cannot find properties.json file in {}/ctat_mutation_lib".format(
ctat_genome_lib_path
)
)
with open(properties, "rt") as pr:
property_dict = json.load(pr)
genome_version = property_dict["Genome_version"]
input_json = {}
input_json["genome_version"] = genome_version
input_json["sample_id"] = sample_id
if left is not None:
input_json["left"] = left
if right is not None:
input_json["right"] = right
if bam is not None:
input_json["bam"] = bam
if not os.path.exists(bam):
raise ValueError("Bam not found")
for bai_path in [bam + ".bai", os.path.splitext(bam)[0] + ".bai"]:
if os.path.exists(bai_path):
input_json["bai"] = bai_path
break
if input_json.get("bai") is None:
raise ValueError("No bam index found")
if vcf is not None:
input_json["vcf"] = vcf
input_json["vcf_index"] = vcf_index
if intervals_file is not None:
input_json["intervals"] = intervals_file
if extra_fasta is not None:
input_json["extra_fasta"] = extra_fasta
if no_mark_duplicates:
input_json["mark_duplicates"] = False
if cpu is None:
cpu = min(16, os.cpu_count())
if hc_xtra_args is not None:
input_json["haplotype_caller_xtra_args"] = hc_xtra_args
if args.is_long_reads:
if not os.path.exists(mm2_genome_idx) or not os.path.exists(mm2_splice_bed):
logger.critical("Error, missing ctat genome lib minimap2 resources. Be sure to run the ctat mutation lib prep step (see wiki docs)")
sys.exit(4)
input_json["is_long_reads"] = True
input_json["mm2_genome_idx"] = mm2_genome_idx
input_json["mm2_splice_bed"] = mm2_splice_bed
if args.is_single_cells:
input_json["singlecell_mode"] = True
input_json["star_cpu"] = cpu
input_json["variant_filtration_cpu"] = cpu
input_json["variant_annotation_cpu"] = cpu
input_json["gtf"] = gtf
input_json["ref_dict"] = ref_dict
input_json["ref_fasta"] = ref_fasta
input_json["ref_fasta_index"] = ref_fasta_index
input_json["db_snp_vcf"] = db_snp_vcf
input_json["db_snp_vcf_index"] = db_snp_vcf_index
input_json["gnomad_vcf"] = gnomad_vcf
input_json["gnomad_vcf_index"] = gnomad_vcf_index
input_json["rna_editing_vcf"] = rna_editing_vcf
input_json["rna_editing_vcf_index"] = rna_editing_vcf_index
input_json["cosmic_vcf"] = cosmic_vcf
input_json["cosmic_vcf_index"] = cosmic_vcf_index
input_json["repeat_mask_bed"] = repeat_mask_bed
input_json["ref_splice_adj_regions_bed"] = ref_splice_adj_regions_bed
input_json["ref_bed"] = ref_bed
if args.no_annotate_variants:
input_json["annotate_variants"] = False
if no_cravat:
input_json["incl_cravat"] = False
no_filter_cancer_variants = True # relies largely on cravat
elif cravat_lib_directory is not None:
input_json["cravat_lib_dir"] = cravat_lib_directory
input_json["star_reference_dir"] = star_reference
input_json["filter_cancer_variants"] = not no_filter_cancer_variants
input_json["apply_bqsr"] = not no_bqsr
input_json["variant_ready_bam"] = variant_ready_bam
input_json["filter_ready_vcf"] = filter_ready_vcf
input_json["boosting_alg_type"] = boosting_alg_type
input_json["boosting_method"] = boosting_method
input_json["boosting_attributes"] = boosting_attributes.split(",")
input_json["boosting_score_threshold"] = boosting_score_threshold
input_json["sequencing_platform"] = sequencing_platform
input_json["docker"] = "a/b" # hack to make cromwell happy
input_json["variant_scatter_count"] = variant_scatter_count
if args.star_limitBAMsortRAM:
input_json["star_limitBAMsortRAM"] = args.star_limitBAMsortRAM
if boosting_method == "none":
# disable the time-consuming annotations that aren't essential.
input_json[
"include_read_var_pos_annotations"
] = False
input_json[
"incl_blat_ED"
] = False
else:
input_json[
"include_read_var_pos_annotations"
] = not no_include_read_var_pos_annotations
if gatk_path is not None:
input_json["gatk_path"] = gatk_path
if not is_in_path(gatk_path):
exit("GATK not found")
if not is_in_path("pblat"):
exit("pblat not found")
# if mark_duplicates_memory is not None:
# input_json['mark_duplicates_memory'] = mark_duplicates_memory
#
# if split_n_cigar_reads_memory is not None:
# input_json['split_n_cigar_reads_memory'] = split_n_cigar_reads_memory
#
# if star_memory is not None:
# input_json['star_memory'] = star_memory
_, json_file = tempfile.mkstemp(suffix=".json")
_, meta_file = tempfile.mkstemp(suffix=".json")
base_dir = os.path.dirname(os.path.realpath(__file__))
scripts_dir = os.path.join(base_dir, "src")
plugins_dir = os.path.join(base_dir, "plugins")
input_json["plugins_path"] = plugins_dir
input_json["scripts_path"] = scripts_dir
wdl_dir = os.path.join(base_dir, "WDL")
cromwell_jar = None
if cromwell is None:
for f in os.listdir(wdl_dir):
if f.startswith('cromwell-') and f.endswith('.jar'):
cromwell_jar = os.path.join(wdl_dir, f)
break
else:
cromwell_jar = os.path.abspath(cromwell)
if cromwell_jar is None or not os.path.exists(cromwell_jar):
exit(
"Cromwell jar file not found. Please run download_cromwell.sh to download."
)
cromwell_conf = os.path.join(wdl_dir, "local_provider_config.inc.conf")
if not os.path.exists(cromwell_conf):
exit("Cromwell configuration not found")
if not os.path.exists(output_directory):
os.makedirs(output_directory)
tmpdir = os.path.join(output_directory, "__tmpdir")
if not os.path.exists(tmpdir):
os.makedirs(tmpdir)
input_json_with_prefix = {}
for key in input_json:
input_json_with_prefix["ctat_mutations.{}".format(key)] = input_json[key]
with open(json_file, "wt") as out:
out.write(json.dumps(input_json_with_prefix))
cromwell_args = [
"java",
"-Djava.io.tmpdir={}".format(tmpdir),
"-Dconfig.file={}".format(cromwell_conf),
"-jar",
cromwell_jar,
"run",
"-i",
json_file,
'-m',
meta_file,
os.path.join(wdl_dir, "ctat_mutations.wdl"),
]
logger.info("CMD: " + " ".join(cromwell_args))
subprocess.run(cromwell_args, cwd=output_directory)
os.remove(json_file)
with open(meta_file, 'rt', encoding='UTF-8') as f:
metadata = json.load(f)
os.remove(meta_file)
if 'outputs' in metadata:
outputs = metadata['outputs']
for key in outputs:
path = outputs[key]
if path is not None and isinstance(path, str) and os.path.exists(path):
name = os.path.basename(path)
dst = name if output_directory is None else os.path.join(output_directory, name)
if os.path.lexists(dst):
os.remove(dst)
os.symlink(path, dst)
if __name__ == "__main__":
if "--version" in sys.argv:
exit(VERSION)
main()