-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathntedit_run_pipeline.smk
270 lines (237 loc) · 12.7 KB
/
ntedit_run_pipeline.smk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
#!/usr/bin/env snakemake -s
# Snakefile for ntedit pipeline
import os
# Read parameters from config or set default values
draft=config["draft"]
reads_prefix_full=config["reads"] if "reads" in config else ""
reads_prefix=os.path.basename(reads_prefix_full)
reads_dirname=os.path.dirname(reads_prefix_full) if os.path.dirname(reads_prefix_full) != "" else "."
k=config["kmer"]
# ancestry parameters
genomes = config["genomes"] if "genomes" in config else ""
genome_prefix = ".".join([os.path.basename(os.path.realpath(genome)).removesuffix(".fa").removesuffix(".fasta").removesuffix(".fna") for genome in genomes])
if draft is None or reads_prefix is None or k is None:
raise ValueError("You must specify draft, reads, and k in your command. See 'snakemake -s ntedit_run_pipeline.smk help' for more information.")
# Check that draft file exists
if not os.path.isfile(draft):
raise ValueError("Draft file does not exist. Please check that the file name is correct and that the file is in the current working directory.")
# Check that reads files exist
if [f"{reads_dirname}/{file}" for file in os.listdir(reads_dirname) if file.startswith(reads_prefix) and file.endswith((".fq", ".fq.gz", ".fastq", ".fastq.gz", ".fa", ".fa.gz", ".fasta", ".fasta.gz"))] == []:
if genomes == "":
raise ValueError("Reads files do not exist. Please check that the prefix is correct and that the files are in the current working directory.")
reads_files = [(f"{reads_dirname}/{file}", file) for file in os.listdir(reads_dirname) \
if file.startswith(reads_prefix) and \
file.endswith((".fq", ".fq.gz", ".fastq", ".fastq.gz", ".fa", ".fa.gz", ".fasta", ".fasta.gz"))]
reads_files = [file if reads_dirname == "." else full_path for full_path, file in reads_files]
# Check that k is an integer
try:
k = int(k)
except ValueError:
raise ValueError("k must be an integer.")
# Check that k is greater than 0
if k <= 0:
raise ValueError("k must be greater than 0.")
# Common parameters
t = config["threads"] if "threads" in config else 4
b = config["b_prefix"] + "_" if "b_prefix" in config and config["b_prefix"] != "" else ""
# ntStat parameters
solid = config["solid"] if "solid" in config else False
cutoff = config["cutoff"] if "cutoff" in config else 2
# ntEdit parameters
z = config["z_param"] if "z_param" in config else 100
i = config["i_param"] if "i_param" in config else 5
d = config["d_param"] if "d_param" in config else 5
x = config["x_param"] if "x_param" in config else 5.000
y = config["y_param"] if "y_param" in config else 9.000
cap = config["cap"] if "cap" in config else (int(k * 3 / 2) if k is not None else None)
m = config["m_param"] if "m_param" in config else 0
v = config["verbose"] if "verbose" in config else 0
a = config["a_param"] if "a_param" in config else 0
j = config["j_param"] if "j_param" in config else 3
s = config["s_param"] if "s_param" in config else 0
X = config["X_param"] if "X_param" in config else -1
Y = config["Y_param"] if "Y_param" in config else -1
p = config["p_param"] if "p_param" in config else 1
q = config["q_param"] if "q_param" in config else 255
l = config["l_vcf"] if "l_vcf" in config else ""
e = config["e_param"] if "e_param" in config else 0.01
if l != "":
if not os.path.isfile(l):
raise ValueError("VCF file not found. Please check that the file name is correct.")
if X != -1 or Y != -1:
if X == -1:
X = 0.5
if Y == -1:
Y = 0.5
# time command
mac_time_command = "command time -l -o"
linux_time_command = "command time -v -o"
time_command = mac_time_command if os.uname().sysname == "Darwin" else linux_time_command
rule all:
input:
f"{b}ntedit_k{k}_edited.fa"
rule help:
shell:
"""
echo ""
echo "Usage: snakemake -s ntedit_run_pipeline.smk [target]"
echo ""
echo "Targets:"
echo " ntedit run ntedit to polish draft genome assembly, REQUIRED"
echo " ntedit_cbf run ntedit using a counting Bloom filter to polish draft genome assembly, REQUIRED"
echo ""
echo "Options:"
echo " draft draft genome assembly. Must be specified with exact FILE NAME. Ex: draft=myDraft.fa (FASTA, Multi-FASTA, and/or gzipped compatible), REQUIRED"
echo " reads prefix of reads file(s). All files in the working directory with the specified prefix will be used for polishing (fastq, fasta, gz, bz, zip), REQUIRED"
echo " time logs time and memory usage to file for main steps (Set to 1 to enable logging)"
echo " k k-mer size, REQUIRED"
echo " t number of threads [default=4]"
echo " b output file prefix, OPTIONAL"
echo ""
echo "Options specific to ntStat:"
echo " solid output the solid k-mers (non-erroneous k-mers), True = yes, False = no [default=False]"
echo " cutoff the minimum coverage of k-mers in output bloom filter, [default=2, ignored if solid=True]"
echo ""
echo "Options specific to ntEdit:"
echo " z minimum contig length [default=100]"
echo " i maximum number of insertion bases to try, range 0-5, [default=5]"
echo " d maximum number of deletions bases to try, range 0-10, [default=5]"
echo " x k/x ratio for the number of k-mers that should be missing, [default=5.000]"
echo " y k/y ratio for the number of edited k-mers that should be present, [default=9.000]"
echo " j controls size of k-mer subset. When checking subset of k-mers, check every jth k-mer, [default=3]"
echo " cap cap for the number of base insertions that can be made at one position, [default=k*1.5]"
echo " X ratio of number of k-mers in the k subset that should be missing in order to attempt fix (higher=stringent), [default=0.5]"
echo " Y ratio of number of k-mers in the k subset that should be present to accept an edit (higher=stringent), [default=0.5]"
echo " m mode of editing, range 0-2, [default=0]"
echo " 0: best substitution, or first good indel"
echo " 1: best substitution, or best indel"
echo " 2: best edit overall (suggestion that you reduce i and d for performance)"
echo " a Soft masks missing k-mer positions having no fix (-v 1 = yes, default = 0, no)"
echo " s SNV mode. Overrides draft k-mer checks, forcing reassessment at each position (1 = yes, default = 0, no. EXPERIMENTAL)"
echo " v verbose mode (1 = yes, default = 0, no)"
echo " p, minimum k-mer coverage threshold (CBF only) [default=minimum of counting Bloom filter\n"
echo " counts, cannot be larger than 255]\n"
echo " q, maximum k-mer coverage threshold (CBF only) [default=255, largest possible value]\n"
echo ""
echo "Example: Polishing myDraft.fa with myReads1.fq and myReads2.fq"
echo " snakemake -s ntedit_run_pipeline.smk ntedit draft=myDraft.fa reads=myReads cutoff=2 or"
echo " snakemake -s ntedit_run_pipeline.smk ntedit draft=myDraft.fa reads=myReads solid=true"
echo ""
echo "Make sure your read files all have the same prefix, as indicated by 'reads=<prefix>'. The Snakefile will use all files in the current working directory with this prefix for polishing."
echo "To ensure that the pipeline runs correctly, make sure that the following tools are in your PATH: ntedit, ntstat"
echo "You must either specify the cutoff parameter or define solid=true in your command to set it automatically"
echo "If one of X/Y is set, ntEdit will use those parameters instead. Otherwise, it uses x/y by default."
"""
rule ntedit:
input:
bloom_filter=f"{reads_prefix}_k{k}.bf"
output:
edited_draft=f"{b}ntedit_k{k}_edited.fa"
params:
prefix = f"{b}ntedit_k{k}",
benchmark = f"{time_command} ntedit_{reads_prefix}.time",
ratio = f"-X {X} -Y {Y}" if X != -1 or Y != -1 else "",
vcf = f"-l {l}" if l != "" else ""
shell:
"{params.benchmark} ntedit -r {input.bloom_filter} -f {draft} -b {params.prefix} -t {t} -z {z} -i {i} -d {d} -x {x} -y {y} -c {cap} -m {m} -v {v} -a {a} -j {j} {params.ratio} -s {s} {params.vcf}"
rule ntstat:
input:
hist=f"{reads_prefix}_k{k}.hist",
reads_files = reads_files
output:
bloom_filter=f"{reads_prefix}_k{k}.bf"
params:
benchmark = f"{time_command} ntstat_{reads_prefix}_k{k}.time",
min_cutoff = f"-cmin 0" if solid else f"-cmin {cutoff}"
shell:
"{params.benchmark} ntstat filter -k {k} {params.min_cutoff} -t {t} -e {e} -f {input.hist} -o {output.bloom_filter} {input.reads_files}"
rule ntcard:
input:
reads_files = reads_files
output:
hist=f"{reads_prefix}_k{k}.hist"
params:
benchmark = f"{time_command} ntcard_{reads_prefix}_k{k}.time"
shell:
"{params.benchmark} ntcard -k {k} -t {t} -p {reads_prefix} {input.reads_files}"
rule ntedit_cbf:
input:
bloom_filter=f"{reads_prefix}_k{k}.cbf"
output:
edited_draft=f"{b}ntedit_k{k}_cbf_p{p}_q{q}_edited.fa"
params:
prefix = f"{b}ntedit_k{k}_cbf_p{p}_q{q}",
benchmark = f"{time_command} ntedit_{reads_prefix}_k{k}_cbf_p{p}_q{q}.time",
ratio = f"-X {X} -Y {Y}" if X != -1 or Y != -1 else "",
vcf = f"-l {l}" if l != "" else ""
shell:
"{params.benchmark} ntedit -r {input.bloom_filter} -f {draft} -b {params.prefix} -p {p} -q {q} -t {t} -z {z} -i {i} -d {d} -x {x} -y {y} -c {cap} -m {m} -v {v} -a {a} -j {j} {params.ratio} -s {s} {params.vcf}"
rule ntstat_cbf:
input:
hist=f"{reads_prefix}_k{k}.hist",
reads_files = reads_files
output:
bloom_filter=f"{reads_prefix}_k{k}.cbf"
params:
benchmark = f"{time_command} ntstat_{reads_prefix}_k{k}_cbf.time",
min_cutoff = f"-cmin 0" if solid else f"-cmin {cutoff}"
shell:
"{params.benchmark} ntstat filter -k {k} {params.min_cutoff} -t {t} -e {e} --counts -f {input.hist} -o {output.bloom_filter} {input.reads_files}"
rule ntedit_snv_genome:
input: f"{genome_prefix}_ntedit_k{k}_variants.vcf"
rule ntedit_snv_reads:
input: f"{reads_prefix}_ntedit_k{k}_variants.vcf"
rule ntedit_snv_reads_cbf:
input: f"{reads_prefix}_ntedit_k{k}_cbf_variants.vcf"
rule ntedit_snv_cbf:
input:
bloom_filter = expand("{{prefix}}_k{k}.cbf", k=k)
output:
out_vcf=expand("{{prefix}}_ntedit_k{k}_cbf_variants.vcf", k=k)
params:
prefix = expand("{{prefix}}_ntedit_k{k}_cbf", k=k),
benchmark = expand("{time_command} ntedit_{{prefix}}.time", time_command=time_command),
ratio = f"-X {X} -Y {Y}" if X != -1 or Y != -1 else "",
vcf = f"-l {l}" if l != "" else ""
shell:
"{params.benchmark} ntedit -r {input.bloom_filter} -f {draft} -b {params.prefix} -t {t} -z {z} -y {y} -v {v} -a {a} -j {j} {params.ratio} -s 1 {params.vcf}"
rule ntedit_snv:
input:
bloom_filter = expand("{{prefix}}_k{k}.bf", k=k)
output:
out_vcf=expand("{{prefix}}_ntedit_k{k}_variants.vcf", k=k)
params:
prefix = expand("{{prefix}}_ntedit_k{k}", k=k),
benchmark = expand("{time_command} ntedit_{{prefix}}.time", time_command=time_command),
ratio = f"-X {X} -Y {Y}" if X != -1 or Y != -1 else "",
vcf = f"-l {l}" if l != "" else ""
shell:
"{params.benchmark} ntedit -r {input.bloom_filter} -f {draft} -b {params.prefix} -t {t} -z {z} -y {y} -v {v} -a {a} -j {j} {params.ratio} -s 1 {params.vcf}"
rule ntedit_genome_bf:
input:
ntcard_f_stats = f"{genome_prefix}.k{k}.tsv",
genomes = genomes
output:
f"{genome_prefix}_k{k}.bf"
params:
options = f"-k {k} -t {t}",
benchmark = f"{time_command} make_bf_{genome_prefix}_k{k}.time"
run:
with open(input.ntcard_f_stats, 'r', encoding="utf8") as fin:
for line in fin:
_, f, num = line.strip().split("\t")
if f == "F0":
num_elements = num
break
shell("{params.benchmark} ntedit_make_genome_bf --genome {input.genomes} {params.options} --num_elements {num_elements} -o {output}")
rule genomes_ntcard:
input:
genomes = genomes
output:
hist=f"{genome_prefix}.k{k}.hist",
f_stats=f"{genome_prefix}.k{k}.tsv"
params:
benchmark = f"{time_command} ntcard_{genome_prefix}_k{k}.time",
options = f"-t {t} -k {k}"
shell:
"{params.benchmark} ntcard {params.options} -o {output.hist} {input.genomes} &> {output.f_stats}"