From 4ae686174ecfd7d287f9211d301579c6599ec6d4 Mon Sep 17 00:00:00 2001
From: Kelly Street <kstreet@berkeley.edu>
Date: Mon, 20 Jun 2016 10:54:41 -0700
Subject: [PATCH] added text to GAM portion of slingshot vignette, added gam
 package to DESCRIPTION

---
 DESCRIPTION             |  1 +
 vignettes/slingshot.Rmd | 31 +++++++++++++++++--------------
 2 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index b89bdbe..bffde00 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -18,6 +18,7 @@ Suggests:
   slingshot,
   SummarizedExperiment,
   MAST,
+  gam,
   BiocStyle,
   knitr,
   rmarkdown,
diff --git a/vignettes/slingshot.Rmd b/vignettes/slingshot.Rmd
index 1fcc307..8aaa69e 100644
--- a/vignettes/slingshot.Rmd
+++ b/vignettes/slingshot.Rmd
@@ -57,7 +57,7 @@ data('full_pca')
 
 ## Examine dimensionality reduction
 # plot3d(pcaX, aspect = 'iso')
-
+pairs(pcaX[,1:3], asp = 1)
 ```
 
 # Step 1: Assign clusters to lineages with `get_lineages`
@@ -122,38 +122,41 @@ The output of `get_curves` is a list with one element per curve. Each element is
 
 # Step 3: Find temporally expressed genes
 
+Typically, the next step will be to find genes that change their expression as a function of developmental time. This can be done using the full genes-by-samples data matrix, but we will use the subset consisting of the 1,000 most variable genes.
+
 ```{r genedata}
 data('var_genes')
-# pst1 <- crv[[1]]$pseudotime
-# pst2 <- crv[[2]]$pseudotime
+```
 
+For a quick analysis, we will regress each gene on the two pseudotime vectors we have generated, using a general additive model (GAM). This allows us to detect non-linear patterns in gene expression over developmental time.
 
-gam.pval <- gam.fit <- vector("list",length(crv))
+```{r fitgam}
+gam.pval <- vector("list",length(crv))
 for(l in 1:length(crv)){
   t <- crv[[l]]$pseudotime
   y <- vargenes[,! is.na(t)]
   t <- t[! is.na(t)]
-  gam.res <- apply(y,1,function(z)
-  {
-    d <- data.frame(z=z, pt=t)
-    tmp <- gam(z ~ lo(pt), data=d)
+  gam.pval[[l]] <- apply(y,1,function(z){
+    d <- data.frame(z=z, t=t)
+    tmp <- gam(z ~ lo(t), data=d)
     p <- summary(tmp)[4][[1]][1,5]
-    f <- tmp$fitted.value
-    list(p=p,f=f)
+    p
   })
-  gam.pval[[l]] <- unlist(lapply(gam.res, function(z) z$p))
-  gam.fit[[l]] <- lapply(gam.res, function(z) z$f)
 }
+```
+
+We can then pick out the top genes for each lineage and visualize their expression over developmental time with a heatmap.
 
+```{r heatmaps}
 topgenes1 <- names(sort(gam.pval[[1]], decreasing = FALSE))[1:100]
 heatdata1 <- vargenes[rownames(vargenes) %in% topgenes1, order(crv[[1]]$pseudotime, na.last = NA)]
 heatclus1 <- clus[order(crv[[1]]$pseudotime, na.last = NA)]
-# plotHeatmap(heatdata1, clusterSamples=FALSE, sampleData = data.frame(Cluster = heatclus1))
+plotHeatmap(heatdata1, clusterSamples=FALSE, sampleData = data.frame(Cluster = heatclus1))
 
 topgenes2 <- names(sort(gam.pval[[2]], decreasing = FALSE))[1:100]
 heatdata2 <- vargenes[rownames(vargenes) %in% topgenes2, order(crv[[2]]$pseudotime, na.last = NA)]
 heatclus2 <- clus[order(crv[[2]]$pseudotime, na.last = NA)]
-# plotHeatmap(heatdata2, clusterSamples=FALSE, sampleData = data.frame(Cluster = heatclus2))
+plotHeatmap(heatdata2, clusterSamples=FALSE, sampleData = data.frame(Cluster = heatclus2))
 ```