forked from ngs-docs/angus
-
Notifications
You must be signed in to change notification settings - Fork 0
/
drosophila_htseq.R
144 lines (104 loc) · 5.68 KB
/
drosophila_htseq.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# Before you begin, download all of the *_htseq_counts.txt files -- to do so, launch an Amazon EC2 instance, create an EBS volume from snapshot
# snap-642349cb (which has all the pre-computed files), and attach and mount your volume on your instance. Then, you can copy the cout files
# generated by htseq to your local computer to open with R.
samples <- c("ORE_wt_rep1","ORE_wt_rep2","ORE_sdE3_rep1","ORE_sdE3_rep2","SAM_wt_rep1","SAM_wt_rep2","SAM_sdE3_rep1","SAM_sdE3_rep2","HYB_wt_rep1","HYB_wt_rep2","HYB_sdE3_rep1","HYB_sdE3_rep2")
#A function to read one of the count files produced by HTSeq
read.sample <- function(sample.name) {
file.name <- paste(sample.name, "_htseq_counts.txt", sep="")
result <- read.delim(file.name, col.names=c("gene", "count"), sep="\t", colClasses=c("character", "numeric"))
}
#Read the first sample
sample.1 <- read.sample(samples[1])
#Read the second sample
sample.2 <- read.sample(samples[2])
#Let's make sure the first and second samples have the same number of rows and the same genes in each row
nrow(sample.1) == nrow(sample.2)
all(sample.1$gene == sample.2$gene)
#Now let's combine them all into one dataset
all.data <- sample.1
all.data <- cbind(sample.1, sample.2$count)
for (c in 3:length(samples)) {
temp.data <- read.sample(samples[c])
all.data <- cbind(all.data, temp.data$count)
}
#We now have a data frame with all the data in it:
head(all.data)
#But you'll notice that the column names are not very informative. Let's fix that
colnames(all.data)[2:ncol(all.data)] <- samples
#Now look:
head(all.data)
#Let's look at the bottom of the data table
tail(all.data)
#There are some rows that we don't really want to include in our analysis... let's remove them (there are 5 in total)
all.data <- all.data[1:(nrow(all.data)-5),]
tail(all.data)
#Now we're ready to start working with DESeq!
#If you don't already have DESeq installed on your computer, you will need to install it:
source("http://bioconductor.org/biocLite.R")
biocLite("DESeq")
#Now that the library is installed, we need to load it into memory so we can access its functions:
library("DESeq")
#First, we need to get rid of the first column, because DESeq insists on having only numbers; we can save the information in it as the row names
raw.deseq.data <- all.data[,2:ncol(all.data)]
rownames(raw.deseq.data) <- all.data$gene
head(raw.deseq.data)
#Create metadata
wing.design <- data.frame(
row.names=samples,
background=c(rep("ORE", 4), rep("SAM", 4), rep("HYB", 4)),
genotype=rep( c("wt", "wt", "sdE3", "sdE3"), 3 ),
libType=rep("paired-end", 12)
)
#Double check it...
wing.design
deseq.data <- newCountDataSet(raw.deseq.data, wing.design)
#Note that DESeq also has a function newCountDataSetFromHTSeqCount that can automatically handle merging all the raw data files together,
# but I wanted to show how we could do this ourselves as an exercise
#Now we have to do the "normalization" step and estimate the dispersion for each gene
deseq.data <- estimateSizeFactors(deseq.data)
deseq.data <- estimateDispersions(deseq.data)
#Let's make sure the dispersion estimates look reasonable
plotDispEsts(deseq.data)
#In this case it looks ok, although the number of points on the graph is relatively modest compared to most RNA-seq studies, since
# we have intentionally excluded all non-X genes
#Note that if the dispersion estimates don't look good you may need to tweak the parameters for the estimateDispersions() function
# (e.g., maybe try using fitType="local")
#Now let's fit some models...
fit.full <- fitNbinomGLMs(deseq.data, count ~ background + genotype + background:genotype)
fit.nointeraction <- fitNbinomGLMs(deseq.data, count ~ background + genotype)
fit.background <- fitNbinomGLMs(deseq.data, count ~ background)
fit.genotype <- fitNbinomGLMs(deseq.data, count ~ genotype)
fit.null <- fitNbinomGLMs(deseq.data, count ~ 1)
#And do some pairwise comparisons to see which ones fit best for each gene...
#E.g., which genes show an interaction between background and genotype?
pvals.interaction <- nbinomGLMTest(fit.full, fit.nointeraction)
padj.interaction <- p.adjust(pvals.interaction, method="BH")
fit.full[(padj.interaction <= 0.05) & !is.na(padj.interaction),]
#For which genes is the full model a better fit than the null model?
pvals.fullnull <- nbinomGLMTest(fit.full, fit.null)
padj.fullnull <- p.adjust(pvals.fullnull, method="BH")
fit.full[(padj.fullnull <= 0.05) & !is.na(padj.fullnull),]
##########################################################################################
#Some exploratory plotting -- essentially identical to what's in the DESeq documentation...
cdsFullBlind = estimateDispersions( deseq.data, method = "blind" )
vsdFull = varianceStabilizingTransformation( cdsFullBlind )
library("RColorBrewer")library("gplots")
# Note: if you get an error message when you try to run the previous two lines,
# you may need to install the libraries, like this:
install.packages("RColorBrewer")
install.packages("gplots")
# After the libraries installed, don't forget to load them by running the library() calls again
select = order(rowMeans(counts(deseq.data)), decreasing=TRUE)[1:30]hmcol = colorRampPalette(brewer.pal(9, "GnBu"))(100)
# Heatmap of count table -- transformed counts
heatmap.2(exprs(vsdFull)[select,], col = hmcol, trace="none", margin=c(10, 6))
# Heatmap of count table -- untransformed counts
heatmap.2(counts(deseq.data)[select,], col = hmcol, trace="none", margin=c(10,6))
# Heatmap of sample-to-sample distances
dists = dist( t( exprs(vsdFull) ) )mat = as.matrix( dists )rownames(mat) = colnames(mat) = with(pData(cdsFullBlind), paste(background, genotype, sep=" : "))
heatmap.2(mat, trace="none", col = rev(hmcol), margin=c(13, 13))
# PCA
print(plotPCA(vsdFull, intgroup=c("background", "genotype")))