drosophila_htseq.R

# Before you begin, download all of the *_htseq_counts.txt files -- to do so, launch an Amazon EC2 instance, create an EBS volume from snapshot
# snap-642349cb (which has all the pre-computed files), and attach and mount your volume on your instance. Then, you can copy the cout files
# generated by htseq to your local computer to open with R.

samples <- c("ORE_wt_rep1","ORE_wt_rep2","ORE_sdE3_rep1","ORE_sdE3_rep2","SAM_wt_rep1","SAM_wt_rep2","SAM_sdE3_rep1","SAM_sdE3_rep2","HYB_wt_rep1","HYB_wt_rep2","HYB_sdE3_rep1","HYB_sdE3_rep2")

#A function to read one of the count files produced by HTSeq
read.sample <- function(sample.name) {
	file.name <- paste(sample.name, "_htseq_counts.txt", sep="")
	result <- read.delim(file.name, col.names=c("gene", "count"), sep="\t", colClasses=c("character", "numeric"))
}

#Read the first sample
sample.1 <- read.sample(samples[1])

#Read the second sample
sample.2 <- read.sample(samples[2])

#Let's make sure the first and second samples have the same number of rows and the same genes in each row
nrow(sample.1) == nrow(sample.2)
all(sample.1$gene == sample.2$gene)

#Now let's combine them all into one dataset
all.data <- sample.1
all.data <- cbind(sample.1, sample.2$count)
for (c in 3:length(samples)) {
	temp.data <- read.sample(samples[c])
	all.data <- cbind(all.data, temp.data$count)
}

#We now have a data frame with all the data in it:
head(all.data)

#But you'll notice that the column names are not very informative. Let's fix that
colnames(all.data)[2:ncol(all.data)] <- samples

#Now look:
head(all.data)

#Let's look at the bottom of the data table
tail(all.data)

#There are some rows that we don't really want to include in our analysis... let's remove them (there are 5 in total)
all.data <- all.data[1:(nrow(all.data)-5),]

tail(all.data)

#Now we're ready to start working with DESeq!
#If you don't already have DESeq installed on your computer, you will need to install it:
source("http://bioconductor.org/biocLite.R")
biocLite("DESeq")

#Now that the library is installed, we need to load it into memory so we can access its functions:
library("DESeq")

#First, we need to get rid of the first column, because DESeq insists on having only numbers; we can save the information in it as the row names
raw.deseq.data <- all.data[,2:ncol(all.data)]
rownames(raw.deseq.data) <- all.data$gene

head(raw.deseq.data)

#Create metadata
wing.design <- data.frame(
	row.names=samples,
	background=c(rep("ORE", 4), rep("SAM", 4), rep("HYB", 4)),
	genotype=rep( c("wt", "wt", "sdE3", "sdE3"), 3 ),
	libType=rep("paired-end", 12)
)
#Double check it...
wing.design 


deseq.data <- newCountDataSet(raw.deseq.data, wing.design)

#Note that DESeq also has a function newCountDataSetFromHTSeqCount that can automatically handle merging all the raw data files together,
# but I wanted to show how we could do this ourselves as an exercise

#Now we have to do the "normalization" step and estimate the dispersion for each gene
deseq.data <- estimateSizeFactors(deseq.data)
deseq.data <- estimateDispersions(deseq.data)

#Let's make sure the dispersion estimates look reasonable
plotDispEsts(deseq.data)

#In this case it looks ok, although the number of points on the graph is relatively modest compared to most RNA-seq studies, since
# we have intentionally excluded all non-X genes
#Note that if the dispersion estimates don't look good you may need to tweak the parameters for the estimateDispersions() function
# (e.g., maybe try using fitType="local")

#Now let's fit some models...

fit.full <- fitNbinomGLMs(deseq.data, count ~ background + genotype + background:genotype)
fit.nointeraction <- fitNbinomGLMs(deseq.data, count ~ background + genotype)
fit.background <- fitNbinomGLMs(deseq.data, count ~ background)
fit.genotype <- fitNbinomGLMs(deseq.data, count ~ genotype)
fit.null <- fitNbinomGLMs(deseq.data, count ~ 1)

#And do some pairwise comparisons to see which ones fit best for each gene...
#E.g., which genes show an interaction between background and genotype?

pvals.interaction <- nbinomGLMTest(fit.full, fit.nointeraction)
padj.interaction <- p.adjust(pvals.interaction, method="BH")
fit.full[(padj.interaction <= 0.05) & !is.na(padj.interaction),]

#For which genes is the full model a better fit than the null model?
pvals.fullnull <- nbinomGLMTest(fit.full, fit.null)
padj.fullnull <- p.adjust(pvals.fullnull, method="BH")
fit.full[(padj.fullnull <= 0.05) & !is.na(padj.fullnull),]


##########################################################################################
#Some exploratory plotting -- essentially identical to what's in the DESeq documentation...

cdsFullBlind = estimateDispersions( deseq.data, method = "blind" )
vsdFull = varianceStabilizingTransformation( cdsFullBlind )

library("RColorBrewer")library("gplots")
# Note: if you get an error message when you try to run the previous two lines,
# you may need to install the libraries, like this:
install.packages("RColorBrewer")
install.packages("gplots")
# After the libraries installed, don't forget to load them by running the library() calls again

select = order(rowMeans(counts(deseq.data)), decreasing=TRUE)[1:30]hmcol = colorRampPalette(brewer.pal(9, "GnBu"))(100)

# Heatmap of count table -- transformed counts
heatmap.2(exprs(vsdFull)[select,], col = hmcol, trace="none", margin=c(10, 6))

# Heatmap of count table -- untransformed counts
heatmap.2(counts(deseq.data)[select,], col = hmcol, trace="none", margin=c(10,6))

# Heatmap of sample-to-sample distances
dists = dist( t( exprs(vsdFull) ) )mat = as.matrix( dists )rownames(mat) = colnames(mat) = with(pData(cdsFullBlind), paste(background, genotype, sep=" : "))
heatmap.2(mat, trace="none", col = rev(hmcol), margin=c(13, 13))


# PCA
print(plotPCA(vsdFull, intgroup=c("background", "genotype")))