"""
Snakemake workflow for ST microbiome data analysis pipeline
Spatial Transcriptomics meets Microbiome
"""
import os
# ----------------------------------------------------------
# Define the pipeline configuration file
configfile: 'config.yaml'
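# A minimal sketch of the expected config.yaml, covering only the keys this Snakefile
# actually reads (results, file_list, kraken_db, kraken_threads); the paths and thread
# count below are illustrative placeholders, not defaults:
#
#   results: /path/to/results
#   file_list: samples.tsv
#   kraken_db: /path/to/kraken2_db
#   kraken_threads: 8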
# ----------------------------------------------------------
# Define variables
OD = config['results']
# ----------------------------------------------------------
# From input FileList create SAMPLES array and temp BAM directory
# and create symbolic links to all BAM files
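# Illustrative layout of the file list, assuming one sample per line with two
# whitespace-separated columns (sample name, path to the source BAM); the paths
# below are hypothetical examples:
#
#   sampleA   /data/spaceranger/sampleA/outs/possorted_genome_bam.bam
#   sampleB   /data/spaceranger/sampleB/outs/possorted_genome_bam.bam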
file_list = config['file_list']
SAMPLES = []
INDIR = os.path.join(OD, 'bam')
BAMDIR = INDIR
if not os.path.exists(BAMDIR):
    os.makedirs(BAMDIR)
with open(config['file_list']) as f:
    for line in f:
        (sample, src) = line.split()
        SAMPLES.append(sample)
        dest = os.path.join(BAMDIR, sample + '.bam')
        if os.path.exists(src):
            if not os.path.islink(dest):
                os.symlink(src, dest)
        else:
            raise SystemExit('File does not exist: ' + src)
#print(SAMPLES)
# ----------------------------------------------------------
# Declare local rules, i.e. rules that are not submitted to the cluster
localrules: all, profiling
# ----------------------------------------------------------
"""
main rule
"""
rule all:
    input:
        os.path.join(OD, 'multiqc_QC', 'multiqc_report.html'),
        expand(os.path.join(OD, '{sample}', '{sample}_profiling-output.txt'), sample=SAMPLES)
# ----------------------------------------------------------
"""
extract unmapped reads (SAM flag 4) from the possorted_genome_bam.bam file;
the rule processes all .bam files in the input directory INDIR
"""
rule samtools_view:
    input:
        os.path.join(INDIR, '{sample}.bam')
    output:
        temp(os.path.join(OD, '{sample}', '{sample}_unm.bam'))
    threads: 4
    resources:
        mem_mb=10000
    shell:
        """
        samtools view \
            --output-fmt BAM \
            --bam \
            --require-flags 4 \
            --threads {threads} \
            {input} > {output}
        """
# ----------------------------------------------------------
"""
sort extracted reads
"""
rule samtools_sort:
    input:
        os.path.join(OD, '{sample}', '{sample}_unm.bam')
    output:
        temp(os.path.join(OD, '{sample}', '{sample}_unm_srt.bam'))
    threads: 4
    resources:
        mem_mb=10000
    shell:
        """
        samtools sort -n \
            --output-fmt BAM \
            --threads {threads} \
            {input} > {output}
        """
# ----------------------------------------------------------
"""
convert the name-sorted BAM to R1 and R2 fastq files (in the original format) with 10x bamtofastq;
the fastq output directory is marked temp() so Snakemake removes it after the concatenate rules have run
"""
rule bamtofastq:
    input:
        os.path.join(OD, '{sample}', '{sample}_unm_srt.bam')
    output:
        temp(os.path.join(OD, '{sample}', '{sample}_bamtofastq.done')),
        temp(directory(os.path.join(OD, '{sample}', 'fastq')))
    params:
        dir = os.path.join(OD, '{sample}')
    threads: 4
    resources:
        mem_mb=10000
    shell:
        """
        rm -rf {params.dir}/fastq && \
        mkdir -p {params.dir} && \
        bamtofastq \
            --nthreads={threads} \
            {input} \
            {params.dir}/fastq \
        && touch {output[0]}
        """
# ----------------------------------------------------------
"""
concatenating multiple fastq files into one R1
"""
rule concatenate1:
    input:
        os.path.join(OD, '{sample}', '{sample}_bamtofastq.done'),
        os.path.join(OD, '{sample}', 'fastq')
    output:
        temp(os.path.join(OD, '{sample}', '{sample}_R1.fastq.gz'))
    params:
        dir = os.path.join(OD, '{sample}', 'fastq')
    threads: 1
    resources:
        mem_mb=10000
    shell:
        """
        rm -f {output}
        for name in {params.dir}/*/*_R1_*.fastq.gz; do
            cat $name >> {output}
        done
        """
# ----------------------------------------------------------
"""
concatenating multiple fastq files into one R2
"""
rule concatenate2:
    input:
        os.path.join(OD, '{sample}', '{sample}_bamtofastq.done'),
        os.path.join(OD, '{sample}', 'fastq')
    output:
        temp(os.path.join(OD, '{sample}', '{sample}_R2.fastq.gz'))
    params:
        dir = os.path.join(OD, '{sample}', 'fastq')
    threads: 1
    resources:
        mem_mb=10000
    shell:
        """
        rm -f {output}
        for name in {params.dir}/*/*_R2_*.fastq.gz; do
            cat $name >> {output}
        done
        """
# ----------------------------------------------------------
"""
extract the spatial barcode (BC) & UMI from R1 and append them to the R2 read names with umi_tools
"""
rule umi:
    input:
        fq1 = os.path.join(OD, '{sample}', '{sample}_R1.fastq.gz'),
        fq2 = os.path.join(OD, '{sample}', '{sample}_R2.fastq.gz')
    output:
        fq1 = temp(os.path.join(OD, '{sample}', '{sample}_R1_extracted.fastq.gz')),
        fq2 = temp(os.path.join(OD, '{sample}', '{sample}_R2_extracted.fastq.gz'))
    params:
        # umi_tools pattern syntax: 16 x C = spatial/cell barcode, 12 x N = UMI
        bc = 'CCCCCCCCCCCCCCCCNNNNNNNNNNNN'
    threads: 1
    resources:
        mem_mb=10000
    shell:
        """
        umi_tools extract \
            --bc-pattern={params.bc} \
            --stdin {input.fq1} \
            --stdout {output.fq1} \
            --read2-in {input.fq2} \
            --read2-out={output.fq2}
        """
# ----------------------------------------------------------
"""
Quality control on R2 fastq files with cutadapt
--front (-g)  trims adapters from the 5' end (TSO adapter sequence and polyA)
--minimum-length (-m)  removes reads shorter than the minimum read length of 31
--times (-n)  removes up to 2 adapters per read
"""
rule cutadapt:
    input:
        os.path.join(OD, '{sample}', '{sample}_R2_extracted.fastq.gz')
    output:
        fq = temp(os.path.join(OD, '{sample}', '{sample}_trimTSO.fq.gz')),
        report = temp(os.path.join(OD, '{sample}', '{sample}_cutadapt.txt'))
    params:
        adaptor = 'AAGCAGTGGTATCAACGCAGAGTACATGGG',
        # the shell strips the backslashes, so cutadapt sees ^A{30}: an anchored run of 30 A's
        polyA = r'^A\{30\}'
    threads: 6
    resources:
        mem_mb=10000
    shell:
        """
        cutadapt \
            --front {params.adaptor} \
            --front {params.polyA} \
            --times 2 \
            --minimum-length 31 \
            --cores {threads} \
            --output {output.fq} \
            {input} > {output.report}
        """
# ----------------------------------------------------------
"""
fastp
--length_required (-l)  minimum read length = 31 (kraken k-mer length)
--disable_adapter_trimming (-A)  disables automatic adapter trimming
--trim_poly_x  trims polyX sequences from the 3' end
--cut_right  enables trimmomatic-style sliding-window quality trimming;
NOTE without --cut_right the window/quality settings have no effect and fastp warns:
"WARNING: you specified the options for cutting by quality, but forgot to enable any of cut_front/cut_tail/cut_right. This will have no effect."
"""
rule fastp:
    input:
        os.path.join(OD, '{sample}', '{sample}_trimTSO.fq.gz')
    output:
        html = os.path.join(OD, '{sample}', '{sample}_fastp.html'),
        fq = os.path.join(OD, '{sample}', '{sample}_trim.fq.gz'),
        json = temp(os.path.join(OD, '{sample}', '{sample}_fastp.json'))
    threads: 6
    resources:
        mem_mb=10000
    shell:
        """
        fastp --in1 {input} \
            --cut_right \
            --cut_right_window_size 4 \
            --cut_right_mean_quality 20 \
            --disable_adapter_trimming \
            --length_required 31 \
            --thread {threads} \
            --dont_eval_duplication \
            --trim_poly_x \
            --out1 {output.fq} \
            --html {output.html} \
            --json {output.json}
        """
# ----------------------------------------------------------
"""
Kraken2 profiling
Step 1: classify reads with kraken2 (with the KrakenUniq-style --report-minimizer-data option);
memory usage scales with the database size; --memory-mapping avoids loading the whole database into RAM
"""
rule classify:
    input:
        os.path.join(OD, '{sample}', '{sample}_trim.fq.gz')
    output:
        txt = protected(os.path.join(OD, '{sample}', '{sample}_kraken-output.txt')),
        report = protected(os.path.join(OD, '{sample}', '{sample}_kraken-report.txt'))
    params:
        db = config['kraken_db']
    threads: config['kraken_threads']
    resources:
        mem_mb=100000
    shell:
        """
        kraken2 --db {params.db} \
            --memory-mapping \
            --confidence 0.1 \
            --threads {threads} \
            --use-names \
            --gzip-compressed \
            --report-minimizer-data \
            --output {output.txt} \
            --report {output.report} \
            {input}
        """
# ----------------------------------------------------------
"""
Kraken2 profiling
Step 2: reformat the kraken output as input for the downstream R package
"""
rule profiling:
    input:
        os.path.join(OD, '{sample}', '{sample}_kraken-output.txt')
    output:
        f = temp(os.path.join(OD, '{sample}', '{sample}_filtered-output.txt')),
        p = os.path.join(OD, '{sample}', '{sample}_profiling-output.txt')
    shell:
        """
        ### extract only classified reads & remove all human reads
        sed -e '/^U/d' -e '/sapiens/d' {input} > {output.f}
        ### extract spatial BC & UMIs into separate tabs
        sed -i 's/\_/\t/g' {output.f}
        ### extract taxid into separate tab
        sed -i -e 's/ (taxid /\t/g' -e 's/)//g' {output.f}
        ### extract only CB, UMI, taxid columns
        awk -F \"\t\" '{{ print $3,\"\t\",$4,\"\t\",$6 }}' {output.f} > {output.p}
        """
# ----------------------------------------------------------
"""
MultiQC report for cutadapt, fastp and kraken,
then remove the temporary BAM (symlink) directory
"""
rule multiqc:
    input:
        expand(os.path.join(OD, '{sample}', '{sample}_fastp.html'), sample=SAMPLES),
        expand(os.path.join(OD, '{sample}', '{sample}_fastp.json'), sample=SAMPLES),
        expand(os.path.join(OD, '{sample}', '{sample}_cutadapt.txt'), sample=SAMPLES),
        expand(os.path.join(OD, '{sample}', '{sample}_kraken-report.txt'), sample=SAMPLES)
    output:
        os.path.join(OD, 'multiqc_QC', 'multiqc_report.html')
    params:
        indir = OD,
        outdir = os.path.join(OD, 'multiqc_QC')
    threads: 1
    resources:
        mem_mb=10000
    shell:
        """
        multiqc \
            --force \
            --module fastp \
            --module kraken \
            --module cutadapt \
            --outdir {params.outdir} \
            {params.indir} \
            && rm -rf {BAMDIR}
        """