-
Notifications
You must be signed in to change notification settings - Fork 0
/
wes_automator_preprocess.py
executable file
·101 lines (84 loc) · 4.56 KB
/
wes_automator_preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#!/usr/bin/env python
"""Len Taing 2021 (TGBTG)
WES automator config file processor
Problem: we're manually tailoring wes automator config files that come from
the software team to add things like commit string, wes image, etc.
NOTE: WILL Happily CLOBBER values so be careful!
"""
import os
import sys
import time
import re
from optparse import OptionParser
import ruamel.yaml
from ruamel.yaml.scalarstring import SingleQuotedScalarString
def checkInstanceName(instance_name):
    """Check that instance_name is a valid instance name.

    A valid name is a string that matches, in its entirety, the regex
    '(?:[a-z](?:[-a-z0-9]{0,61}[a-z0-9])?)': it starts with a lowercase
    letter, continues with lowercase letters, digits or hyphens, does not
    end with a hyphen, and is at most 63 characters long.

    Returns the regex match object (truthy) when the name is valid, and
    None (falsy) otherwise -- including when instance_name is not a string.
    """
    if not isinstance(instance_name, str):
        return None
    prog = re.compile(r'(?:[a-z](?:[-a-z0-9]{0,61}[a-z0-9])?)')
    # fullmatch, not match: match() only anchors the start of the string, and
    # since the tail group is optional it would accept anything that merely
    # begins with a lowercase letter (e.g. 'aBC_' or an over-long name).
    return prog.fullmatch(instance_name)
def main():
    """Apply command-line overrides to every WES automator config in a directory.

    For each '*.yaml' file found in --directory this:
      1. loads the file with ruamel.yaml's round-trip loader,
      2. overwrites (clobbers!) the config key named after each command-line
         option with that option's value,
      3. warns if the config's instance_name is missing or invalid,
      4. sets google_bucket_path, replacing the {CIMAC_ID} wildcard with the
         file's base name (minus any '_processed' marker),
      5. writes the result to '<base name>_processed.yaml' in the same
         directory.

    Files whose top level is not a YAML mapping (e.g. empty files) are
    skipped with a warning.

    Exits with status -1, after printing the help text, when any of
    cimac_center, google_bucket_path, wes_commit, image or wes_ref_snapshot
    is not given.
    """
    usage = "USAGE: %prog -c [cores] -d [disk size] -l [cimac center] -s [somatic_caller] -b [google bucket path] -w [wes_commit] -i [wes_image] -r [wes_ref_snapshot] -t [trim_soft_clip]"
    optparser = OptionParser(usage=usage)
    optparser.add_option("-l", "--cimac_center", help="cimac center")
    optparser.add_option("-b", "--google_bucket_path", help="google bucket path, can use {CIMAC_ID} wildcard")
    optparser.add_option("-w", "--wes_commit", help="wes commit string")
    optparser.add_option("-i", "--image", help="wes image")
    optparser.add_option("-r", "--wes_ref_snapshot", help="wes reference snapshot")
    # type="int" lets optparse reject a non-numeric value with a usage error
    # up front, instead of a ValueError traceback inside the file loop.
    optparser.add_option("-c", "--cores", type="int", help="num. cores (default: 64)", default=64)
    optparser.add_option("-d", "--disk_size", type="int", help="disk size in GiB (default: 500)", default=500)
    optparser.add_option("-s", "--somatic_caller", help="somatic_caller (default: tnscope)", default="tnscope")
    optparser.add_option("-p", "--sentieon_path", help="sentieon_path (default: /home/taing/sentieon/sentieon-genomics-201808.05/bin/)", default="/home/taing/sentieon/sentieon-genomics-201808.05/bin/")
    optparser.add_option("-t", "--trim_soft_clip", help="trim soft clip (default: false)", action="store_true", default=False)
    optparser.add_option("-n", "--tumor_only", help="tumor_only run (default: false)", action="store_true", default=False)
    optparser.add_option("-f", "--directory", help="directory (default: '.')", default=".")
    options, _ = optparser.parse_args(sys.argv)

    required = ('cimac_center', 'google_bucket_path', 'wes_commit', 'image',
                'wes_ref_snapshot')
    if not all(getattr(options, name) for name in required):
        print("\nERROR: Please define cimac_center, google_bucket_path, wes_commit, image, and wes_ref_snapshot. These are required fields.\n")
        optparser.print_help()
        sys.exit(-1)

    config_dir = options.directory
    # Sorted so that repeated runs visit the files in the same order
    # (os.listdir returns them in arbitrary order).
    files = sorted(f for f in os.listdir(config_dir) if f.endswith('.yaml'))

    # Every option except 'directory' is written into the config under the
    # option's own name.
    #ref: https://stackoverflow.com/questions/1753460/python-optparse-values-instance
    overrides = {k: v for (k, v) in vars(options).items() if k != 'directory'}
    # Keys stored as plain ints/bools; every other value is stored as a
    # single-quoted string.
    direct_keys = ('cores', 'disk_size', 'trim_soft_clip', 'tumor_only')

    for fname in files:
        # PARSE the yaml file
        # NOTE(review): round_trip_load/round_trip_dump are ruamel.yaml's
        # legacy function API; newer releases steer users to the YAML()
        # class instead -- confirm against the installed ruamel.yaml version.
        with open(os.path.join(config_dir, fname)) as config_f:
            config = ruamel.yaml.round_trip_load(config_f.read())

        # An empty file loads as None and a list/scalar document cannot take
        # key assignments; skip those instead of crashing part-way through.
        if not isinstance(config, dict):
            print("Warning: %s does not contain a YAML mapping; skipping." % fname)
            continue

        #override the params
        for (key, value) in overrides.items():
            if key in direct_keys:
                #direct values, e.g. ints, bools (already converted by optparse)
                config[key] = value
            else:
                #put the string in quotes
                #ref: https://stackoverflow.com/questions/39262556/preserve-quotes-and-also-add-data-with-quotes-in-ruamel
                config[key] = SingleQuotedScalarString(value)

        #CHECK that instance_name matches this
        #regex '(?:[a-z](?:[-a-z0-9]{0,61}[a-z0-9])?)'
        # A missing or non-string instance_name is reported the same way as
        # an invalid one rather than aborting the whole run.
        instance_name = config.get('instance_name')
        if not isinstance(instance_name, str) or not checkInstanceName(instance_name):
            print("Warning: instance_name in %s is invalid." % fname)

        #special case for google_bucket_path, which may contain the
        #{CIMAC_ID} wildcard.
        #assume that the name of the file is the CIMAC ID
        #otherwise we can use config['instance_name'] or
        #config['samples']['tumor']
        iid = fname[:-len('.yaml')]
        #remove 'processed' from the name, just in case we re-ran
        iid = iid.replace("_processed", "")
        config['google_bucket_path'] = options.google_bucket_path.replace("{CIMAC_ID}", iid)

        #write new config
        new_fname = "%s_processed.yaml" % iid
        with open(os.path.join(config_dir, new_fname), "w") as out:
            ruamel.yaml.round_trip_dump(config, out)
# Script entry point: run the config processor only when this file is
# executed directly, not when it is imported as a module.
if __name__ == '__main__':
    main()