carpentries-incubator · ailithewing · May 16, 2022 · May 16, 2022 · May 16, 2022 · May 17, 2022
diff --git a/Gemfile b/Gemfile
@@ -8,3 +8,5 @@ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
 ruby '>=2.5.8'
 
 gem 'github-pages', group: :jekyll_plugins
+
+gem "webrick", "~> 1.7"
diff --git a/_episodes/01-introduction-to-high-dimensional-data.md b/_episodes/01-introduction-to-high-dimensional-data.md
@@ -445,6 +445,27 @@ library("minfi")
 {: .language-r}
 
 
+
+~~~
+Warning: package 'S4Vectors' was built under R version 4.1.1
+~~~
+{: .warning}
+
+
+
+~~~
+Warning: package 'GenomeInfoDb' was built under R version 4.1.1
+~~~
+{: .warning}
+
+
+
+~~~
+Warning: package 'MatrixGenerics' was built under R version 4.1.1
+~~~
+{: .warning}
+
+
 ~~~
 browseVignettes("minfi")
 ~~~

diff --git a/_episodes/02-high-dimensional-regression.md b/_episodes/02-high-dimensional-regression.md
@@ -48,6 +48,33 @@ Let's read in the data for this episode:
 ~~~
 library("here")
 library("minfi")
+~~~
+{: .language-r}
+
+
+
+~~~
+Warning: package 'S4Vectors' was built under R version 4.1.1
+~~~
+{: .warning}
+
+
+
+~~~
+Warning: package 'GenomeInfoDb' was built under R version 4.1.1
+~~~
+{: .warning}
+
+
+
+~~~
+Warning: package 'MatrixGenerics' was built under R version 4.1.1
+~~~
+{: .warning}
+
+
+
+~~~
 methylation <- readRDS(here("data/methylation.rds"))
 ~~~
 {: .language-r}
@@ -262,6 +289,12 @@ for low-dimensional data may be very slow when applied to high-dimensional data.
 Ideally when performing regression, we want to identify cases like this,
 where there is a clear assocation, and we probably "don't need" statistics:
 
+
+~~~
+Warning: package 'ggplot2' was built under R version 4.1.2
+~~~
+{: .warning}
+
 <img src="../fig/rmd-02-example1-1.png" title="An example of a strong linear association between a continuous phenotype (age) on the x-axis and a feature of interest (DNA methylation at a given locus) on the y-axis. A strong linear relationship with a positive slope exists between the two." alt="An example of a strong linear association between a continuous phenotype (age) on the x-axis and a feature of interest (DNA methylation at a given locus) on the y-axis. A strong linear relationship with a positive slope exists between the two." width="432" style="display: block; margin: auto;" />
 
 or equivalently for a discrete covariate:
@@ -359,6 +392,19 @@ the coefficients and the associated hypothesis tests in this model:
 
 ~~~
 library("broom")
+~~~
+{: .language-r}
+
+
+
+~~~
+Warning: package 'broom' was built under R version 4.1.2
+~~~
+{: .warning}
+
+
+
+~~~
 tidy(lm_age_methyl1)
 ~~~
 {: .language-r}
@@ -1003,7 +1049,7 @@ with a lot of features!
 
 
 ~~~
-p_raw <- coef_df$p.value
+p_raw <- toptab_age$P.Value
 p_fwer <- p.adjust(p_raw, method = "bonferroni")
 library("ggplot2")
 ggplot() +

diff --git a/_episodes/03-regression-regularisation.md b/_episodes/03-regression-regularisation.md
@@ -56,6 +56,33 @@ First, let's read in the data from the last lesson.
 ~~~
 library("here")
 library("minfi")
+~~~
+{: .language-r}
+
+
+
+~~~
+Warning: package 'S4Vectors' was built under R version 4.1.1
+~~~
+{: .warning}
+
+
+
+~~~
+Warning: package 'GenomeInfoDb' was built under R version 4.1.1
+~~~
+{: .warning}
+
+
+
+~~~
+Warning: package 'MatrixGenerics' was built under R version 4.1.1
+~~~
+{: .warning}
+
+
+
+~~~
 methylation <- readRDS(here("data/methylation.rds"))
 
 ## here, we transpose the matrix to have features as rows and samples as columns
@@ -213,7 +240,7 @@ different combinations of slope and intercept accomplish this objective.
 Mathematically, we can write the sum of squared residuals as
 
 $$
-    \sum_{i=1}^N (\hat{y}_i - y_i)^2
+    \sum_{i=1}^N ( y_i-X_i\beta)^2
 $$
 
 where $\hat{y}_i$ is the predicted y value for each input data
@@ -310,11 +337,15 @@ rely on the input data being scaled like this.
 
 ~~~
 ## centre and scale the data
-methyl_mat <- scale(methyl_mat, center = TRUE, scale = TRUE)
+methyl_mat_sc <- scale(methyl_mat, center = TRUE, scale = TRUE)
 
 coef_horvath <- coef_horvath[1:20, ]
 features <- coef_horvath$CpGmarker
-horvath_mat <- methyl_mat[, features]
+horvath_mat_sc <- methyl_mat_sc[, features]
+
+## Extract scales and centres for later
+methyl_mat_sc_scales <- attr(methyl_mat_sc, "scaled:scale")[1:20]
+methyl_mat_sc_centres <- attr(methyl_mat_sc, "scaled:center")[1:20]
 
 ## Generate an index to split the data
 set.seed(42)
@@ -338,9 +369,9 @@ train_ind <- sample(nrow(methyl_mat), 25)
 > >    
 > >    
 > >    ~~~
-> >    train_mat <- horvath_mat[train_ind, ]
+> >    train_mat_sc <- horvath_mat_sc[train_ind, ]
 > >    train_age <- age[train_ind]
-> >    test_mat <- horvath_mat[-train_ind, ]
+> >    test_mat_sc <- horvath_mat_sc[-train_ind, ]
 > >    test_age <- age[-train_ind]
 > >    ~~~
 > >    {: .language-r}
@@ -349,7 +380,7 @@ train_ind <- sample(nrow(methyl_mat), 25)
 > >    
 > >    
 > >    ~~~
-> >    fit_horvath <- lm(train_age ~ ., data = as.data.frame(train_mat))
+> >    fit_horvath <- lm(train_age ~ ., data = as.data.frame(train_mat_sc))
 > >    ~~~
 > >    {: .language-r}
 > > 
@@ -383,7 +414,7 @@ higher than the error on the training data!
 mse <- function(true, prediction) {
     mean((true - prediction)^2)
 }
-pred_lm <- predict(fit_horvath, newdata = as.data.frame(test_mat))
+pred_lm <- predict(fit_horvath, newdata = as.data.frame(test_mat_sc))
 err_lm <- mse(test_age, pred_lm)
 err_lm
 ~~~
@@ -410,7 +441,7 @@ abline(coef = 0:1, lty = "dashed")
 
 <img src="../fig/rmd-03-test-plot-lm-1.png" title="Alt" alt="Alt" width="432" style="display: block; margin: auto;" />
 
-# Ridge regression
+# What is regularisation? (using ridge regression as an example)
 
 One way to tackle these many correlated variables with lots of noise is
 *regularisation*.
@@ -436,7 +467,7 @@ coefficients, $\beta$.
 This is also sometimes called the $L^2$ norm. This is defined as
 
 $$
-    \left\lVert \beta\right\lVert_2 = \sqrt{\sum_{j=1}^p \beta_j^2}
+    \left\lVert \beta\right\lVert_2 = \sqrt{\sum_{j=1}^p \beta_j^2} 
 $$
 
 To control this, we specify that the solution for the equation above
@@ -445,7 +476,7 @@ we try to minimise a function that includes our $L^2$ norm scaled by a
 factor that is usually written $\lambda$.
 
 $$
-    \left(\sum_{i=1}^N y_i - X_i\beta\right) + \lambda \left\lVert \beta \right\lVert_2
+     \sum_{i=1}^N \biggl( y_i - X_i\beta\biggr)^2  + \lambda \left\lVert \beta \right\lVert_2 ^2
 $$
 
 Another way of thinking about this is that when finding the best model, we're
@@ -469,6 +500,26 @@ regularised and ordinary least squares.
 
 ~~~
 library("glmnet")
+~~~
+{: .language-r}
+
+
+
+~~~
+Warning: package 'glmnet' was built under R version 4.1.2
+~~~
+{: .warning}
+
+
+
+~~~
+## glmnet() performs scaling by default, supply un-scaled data:
+horvath_mat <- methyl_mat[, features] # select the first 20 sites as before
+train_mat <- horvath_mat[train_ind, ] # use the same individuals as selected before
+test_mat <- horvath_mat[-train_ind, ]
+
+
+
 ridge_fit <- glmnet(x = train_mat, y = train_age, alpha = 0)
 plot(ridge_fit, xvar = "lambda")
 abline(h = 0, lty = "dashed")
@@ -609,7 +660,8 @@ abline(v = log(chosen_lambda), lty = "dashed")
 > >    
 > >    ~~~
 > >    par(mfrow = c(1, 1))
-> >    plot(coef(fit_horvath), coef(ridge_fit, s = which_min_err),
+> >    plot(coef(fit_horvath) * c(1, methyl_mat_sc_scales) + c(0, methyl_mat_sc_centres),
+> >    coef(ridge_fit, s = which_min_err),
 > >        pch = 19
 > >    )
 > >    abline(coef = 0:1, lty = "dashed")

diff --git a/_episodes/04-regression-feature-selection.md b/_episodes/04-regression-feature-selection.md
@@ -60,6 +60,33 @@ possible combination of features to find the best one.
 ~~~
 library("here")
 library("minfi")
+~~~
+{: .language-r}
+
+
+
+~~~
+Warning: package 'S4Vectors' was built under R version 4.1.1
+~~~
+{: .warning}
+
+
+
+~~~
+Warning: package 'GenomeInfoDb' was built under R version 4.1.1
+~~~
+{: .warning}
+
+
+
+~~~
+Warning: package 'MatrixGenerics' was built under R version 4.1.1
+~~~
+{: .warning}
+
+
+
+~~~
 methylation <- readRDS(here("data/methylation.rds"))
 
 ## here, we transpose the matrix to have features as rows and samples as columns

diff --git a/_episodes/05-principal-component-analysis.md b/_episodes/05-principal-component-analysis.md
@@ -561,13 +561,47 @@ library("PCAtools")
 ~~~
 {: .language-r}
 
+
+
+~~~
+Warning: package 'ggplot2' was built under R version 4.1.2
+~~~
+{: .warning}
+
 We will first load the microarray breast cancer gene expression data and
 associated metadata, downloaded from the
 [Gene Expression Omnibus](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE2990).
 
 
 ~~~
 library("SummarizedExperiment")
+~~~
+{: .language-r}
+
+
+
+~~~
+Warning: package 'MatrixGenerics' was built under R version 4.1.1
+~~~
+{: .warning}
+
+
+
+~~~
+Warning: package 'S4Vectors' was built under R version 4.1.1
+~~~
+{: .warning}
+
+
+
+~~~
+Warning: package 'GenomeInfoDb' was built under R version 4.1.1
+~~~
+{: .warning}
+
+
+
+~~~
 cancer <- readRDS(here::here("data/cancer_expression.rds"))
 mat <- assay(cancer)
 metadata <- colData(cancer)
@@ -763,8 +797,8 @@ represents.
 > > 215281_x_at 0.003923775 0.003179556 -0.0004388192 9.664648e-05 0.003501335
 > >                    PC86        PC87          PC88         PC89         PC90
 > > 215281_x_at -0.00112973 0.006489667 -0.0005039785 -0.004296355 -0.002751513
-> >                    PC91
-> > 215281_x_at -0.01747638
+> >                   PC91
+> > 215281_x_at 0.01181236
 > > ~~~
 > > {: .output}
 > > 
@@ -826,8 +860,8 @@ represents.
 > > 211122_s_at 0.004995447 -0.008404118 0.00442875 -0.001027912 0.006104406
 > >                    PC82        PC83         PC84       PC85       PC86
 > > 211122_s_at -0.01988441 0.009667348 -0.008248781 0.01198369 0.01221713
-> >                     PC87        PC88        PC89        PC90        PC91
-> > 211122_s_at -0.003864842 -0.02876816 -0.01771452 -0.02164973 -0.02164521
+> >                     PC87        PC88        PC89        PC90       PC91
+> > 211122_s_at -0.003864842 -0.02876816 -0.01771452 -0.02164973 0.01215707
 > > ~~~
 > > {: .output}
 > > The function `pca` is used to perform PCA, and uses as inputs a matrix
@@ -946,7 +980,7 @@ biplot(pc, lab = rownames(pc$metadata), pointSize = 1, labSize = 1)
 
 
 ~~~
-Warning: ggrepel: 7 unlabeled data points (too many overlaps). Consider
+Warning: ggrepel: 6 unlabeled data points (too many overlaps). Consider
 increasing max.overlaps
 ~~~
 {: .warning}
@@ -967,7 +1001,7 @@ plotloadings(pc, labSize = 3)
 
 
 ~~~
-Warning: ggrepel: 38 unlabeled data points (too many overlaps). Consider
+Warning: ggrepel: 35 unlabeled data points (too many overlaps). Consider
 increasing max.overlaps
 ~~~
 {: .warning}
@@ -998,7 +1032,7 @@ detecting genes on each principal component.
 > > 
 > > 
 > > ~~~
-> > Warning: ggrepel: 35 unlabeled data points (too many overlaps). Consider
+> > Warning: ggrepel: 34 unlabeled data points (too many overlaps). Consider
 > > increasing max.overlaps
 > > ~~~
 > > {: .warning}
Original file line number	Diff line number	Diff line change
Expand Up		@@ -8,3 +8,5 @@ git_source(:github) {\|repo_name\| "https://github.com/#{repo_name}" }
		ruby '>=2.5.8'

		gem 'github-pages', group: :jekyll_plugins

		gem "webrick", "~> 1.7"