# dada2_workflow_1.2.R
# Updated: 18-Oct-2017
# This script was modified by JM from GG's original, based on the dada2 tutorial found here:
# http://benjjneb.github.io/dada2/tutorial.html
# For more information on running the pipeline, see the README at:
# https://github.com/ggloor/miseq_bin
# This workflow is set up to run on cjelli; you will need to modify the paths for your own machine
#-------------------------------------------------------
# Before running
#-------------------------------------------------------
# 1) Demultiplex your samples (assign each read to a sample based on the barcode) using demultiplex_dada2.pl
# 2) Start R and run the commands below interactively
# OR source this script with:
# nohup Rscript dada2_workflow_1.2.R &
#-------------------------------------------------------
# Setup
#-------------------------------------------------------
# Paths to the reference taxonomy and the demultiplexed reads
taxpath<-"/Volumes/longlunch/seq/annotationDB/dada2/silva_nr_v123_train_set.fa.gz" #cjelli
#taxpath<-"/Volumes/data/annotationDB/dada2/silva_nr_v123_train_set.fa.gz" #agrajag
reads<-"demultiplex_reads"
# Load needed libraries
library(dada2); packageVersion("dada2")
# Dump the R session info for reproducibility
writeLines(capture.output(sessionInfo()), "RsessionInfo_dada2.txt")
#-------------------------------------------------------
# list the files (uncomment to check)
#list.files(reads)
# Get the filenames with relative path
# sort to ensure same order of fwd/rev reads
fnFs <- sort(list.files(reads, pattern="-R1.fastq", full.names=TRUE))
fnRs <- sort(list.files(reads, pattern="-R2.fastq", full.names=TRUE))
# Get sample names only (remove path, and everything after the first "-")
# Assuming filenames have format: SAMPLENAME-XXX.fastq
sample.names <- sapply(strsplit(basename(fnFs), "-"), `[`, 1)
### Old method
# Sort ensures forward/reverse reads are in same order
#fnFs1 <- sort(list.files(reads, pattern="-R1.fastq"))
#fnRs2 <- sort(list.files(reads, pattern="-R2.fastq"))
# Extract sample names, assuming filenames have format: SAMPLENAME-XXX.fastq
#sample.names <- sapply(strsplit(fnFs1, "-"), `[`, 1)
# Specify the full path to the fnFs and fnRs
#fnFs <- file.path(reads, fnFs1)
#fnRs <- file.path(reads, fnRs2)
#-------------------------------------------------------
# Check read quality
#-------------------------------------------------------
# Check a few samples spread across the run (here samples 1, 10 and 20);
# adjust the indices if you have fewer samples, or see the sketch after
# this block for checking a random subset instead
pdf("qualprofiles.pdf")
plotQualityProfile(fnFs[[1]])
plotQualityProfile(fnRs[[1]])
plotQualityProfile(fnFs[[10]])
plotQualityProfile(fnRs[[10]])
plotQualityProfile(fnFs[[20]])
plotQualityProfile(fnRs[[20]])
dev.off()
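# Optional alternative: check a random subset of samples rather than fixed
# indices. A minimal sketch (the file name "qualprofiles_random.pdf" and the
# subset size of 4 are our own choices); uncomment to use:
#set.seed(100)
#idx <- sample(seq_along(fnFs), min(4, length(fnFs)))
#pdf("qualprofiles_random.pdf")
#for(i in idx) {
#    print(plotQualityProfile(fnFs[[i]]))  # print() is needed inside a loop
#    print(plotQualityProfile(fnRs[[i]]))
#}
#dev.off()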
#-------------------------------------------------------
# Filter reads based on QC
#-------------------------------------------------------
# Make filenames for the filtered fastq files
filtFs <- paste0(reads, "/", sample.names, "-F-filt.fastq.gz")
filtRs <- paste0(reads, "/", sample.names, "-R-filt.fastq.gz")
# the truncation length must be equal to or shorter than the read length!!
# that means 187 and 178 for V4 with paired 2x220 reads and 8-mer barcodes
# that means 183 and 174 for V4 with paired 2x220 reads and 12-mer barcodes
# DO NOT trim from the 5' end since primers and barcodes have already been trimmed off
for(i in seq_along(fnFs)) {
    fastqPairedFilter(c(fnFs[i], fnRs[i]), c(filtFs[i], filtRs[i]),
                      truncLen=c(183,174),
                      maxN=0,
                      maxEE=c(2,2),
                      compress=TRUE, verbose=TRUE)
}
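# Optional alternative: newer dada2 releases provide filterAndTrim(), which
# wraps this per-sample loop and can run multithreaded. A minimal sketch,
# assuming dada2 >= 1.4, with parameters mirroring the loop above:
#out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs,
#                     truncLen=c(183,174), maxN=0, maxEE=c(2,2),
#                     compress=TRUE, verbose=TRUE, multithread=TRUE)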
# Example parameters. For paired reads, use a vector, e.g. c(2,2):
#truncQ=2, #truncate reads at the first base with a quality score of 2 or less
#truncLen=130, #truncate after 130 bases
#trimLeft=10, #remove 10 bases from the 5' end of the sequence
#maxN=0, #don't allow any Ns in the sequence
#maxEE=2, #the maximum number of expected errors allowed per read
#rm.phix=TRUE, #remove lingering PhiX (the sequencing control DNA), as some is likely present
# filtered reads are output to demultiplex_reads
#----------------------------------------------------------------
# IF YOUR PROGRAM CRASHES YOU CAN USE THIS AS A RESTART POINT,
# as long as your filtered reads were written out.
# SKIP THIS otherwise and go straight to dereplication.
# Use the commands below to load the data, then continue from the dereplication step
#---------------------------------------------------------------------------
###Load needed libraries and paths
#library(dada2)
#
#taxpath<-"/Volumes/longlunch/seq/annotationDB/dada2/silva_nr_v123_train_set.fa.gz"
#reads<-"demultiplex_reads"
#
##get the filenames with relative path
##sort to ensure same order
#filtFs <- sort(list.files(reads, pattern="-F-filt.fastq.gz", full.names=TRUE))
#filtRs <- sort(list.files(reads, pattern="-R-filt.fastq.gz", full.names=TRUE))
##get sample names only (remove path, and everything after the first "-")
#sample.names <- sapply(strsplit(basename(filtFs), "-"), `[`, 1)
#-------------------------------------------------------
#-------------------------------------------------------
# Dereplication
#-------------------------------------------------------
# Dereplication combines all identical sequencing reads into "unique sequences", each with a corresponding "abundance": the number of reads with that unique sequence
# Dereplication substantially reduces computation time by eliminating redundant comparisons.
derepFs <- derepFastq(filtFs, verbose=TRUE)
derepRs <- derepFastq(filtRs, verbose=TRUE)
# Name the derep-class objects by the sample names
names(derepFs) <- sample.names
names(derepRs) <- sample.names
#-------------------------------------------------------
# Sample inference - SLOW!!
#-------------------------------------------------------
# Perform joint sample inference and error-rate estimation (takes a few minutes).
# This is the dada2 magic function that determines which reads are likely
# true biological sequences and which are likely derived from PCR or sequencing error.
dadaFs <- dada(derepFs, err=inflateErr(tperr1,3), selfConsist = TRUE)
dadaRs <- dada(derepRs, err=inflateErr(tperr1,3), selfConsist = TRUE)
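# Optional alternative: newer dada2 releases can learn the error rates
# directly from the filtered reads with learnErrors(), rather than starting
# from the packaged tperr1 matrix inflated by inflateErr(). A minimal sketch,
# assuming dada2 >= 1.4; uncomment to use in place of the two calls above:
#errF <- learnErrors(filtFs, multithread=TRUE)
#errR <- learnErrors(filtRs, multithread=TRUE)
#plotErrors(errF, nominalQ=TRUE)  # sanity-check the fitted error model
#dadaFs <- dada(derepFs, err=errF, multithread=TRUE)
#dadaRs <- dada(derepRs, err=errR, multithread=TRUE)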
# overlap the ends of the forward and reverse reads
mergers <- mergePairs(dadaFs, derepFs, dadaRs, derepRs, verbose=TRUE)
# make the sequence table; samples are rows
# (a mock-community sample, if one is named "Mock", is excluded here)
seqtab <- makeSequenceTable(mergers[names(mergers) != "Mock"])
# summarize the output by length
table(nchar(colnames(seqtab)))
# remove chimeras and save in seqtab.nochim
seqtab.nochim <- removeBimeraDenovo(seqtab, verbose=TRUE)
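# Sanity check (our addition): chimeras are often many unique sequences but
# should account for a small fraction of total reads; a value far below 1
# here suggests a problem upstream
sum(seqtab.nochim)/sum(seqtab)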
#let's write the table, just in case
#samples are rows
write.table(seqtab.nochim, file="dada2_nochim_temp.txt", sep="\t", col.names=NA, quote=F)
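# Optional: track read counts per sample through the pipeline, as in the
# dada2 tutorial. A minimal sketch; it assumes the objects above are still in
# memory (drop a "Mock" sample from dadaFs/mergers first if one is present):
#getN <- function(x) sum(getUniques(x))
#track <- cbind(denoisedF=sapply(dadaFs, getN),
#               merged=sapply(mergers, getN),
#               nonchim=rowSums(seqtab.nochim))
#write.table(track, file="read_tracking.txt", sep="\t", col.names=NA, quote=F)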
#-------------------------------------------------------
# Assign taxonomy
#-------------------------------------------------------
# NOTE: This is an APPROXIMATE taxonomy and may not be the best method for your data
# There are many ways/databases to assign taxonomy, we are only using one.
taxa <- assignTaxonomy(seqtab.nochim, taxpath)
colnames(taxa) <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus")
#get the taxonomy string
#merge columns 1 to 6 to get the full taxonomy (to genus)
un.tax <- unname(taxa)
tax.vector <- apply(un.tax, 1, function(x){paste(x[1:6], collapse=":")})
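# Optional alternative: Silva often leaves lower ranks as NA, which yields
# strings like "Bacteria:...:NA:NA". A minimal sketch that labels those
# levels explicitly; uncomment to use in place of the line above:
#tax.vector <- apply(un.tax, 1, function(x){
#    x <- x[1:6]
#    x[is.na(x)] <- "unclassified"
#    paste(x, collapse=":")
#})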
#add taxonomy to the table
seqtab.nochim.tax<-rbind(seqtab.nochim, tax.vector)
#transpose the table so samples are columns
t.seqtab.nochim.tax<-t(seqtab.nochim.tax)
# move the rownames (the ASU sequences) to a separate vector and replace them with arbitrary ASU numbers
# NOTE: in this case ASUs are not the traditional "97% identical" sequence units, since dada2 only collapses identical (100%) sequences
otu.seqs<-rownames(t.seqtab.nochim.tax)
otu.num<-paste("ASU", seq(from = 0, to = nrow(t.seqtab.nochim.tax)-1), sep="_")
rownames(t.seqtab.nochim.tax)<-otu.num
# Get the tables!
# These are what you will use for all your downstream analysis
write.table(t.seqtab.nochim.tax, file="dada2_nochim_tax.txt", sep="\t", col.names=NA, quote=F)
write.table(otu.seqs, file="otu_seqs.txt", sep="\t", row.names=otu.num, col.names=F, quote=F)
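# Optional: many downstream tools expect the ASU sequences as FASTA. A
# minimal sketch using base R (the file name "otu_seqs.fasta" is our own
# choice); uncomment to use:
#writeLines(paste0(">", otu.num, "\n", otu.seqs), "otu_seqs.fasta")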