From 6c45420feb15650012927168b60add2b4a539ca2 Mon Sep 17 00:00:00 2001 From: Alan O'Callaghan Date: Fri, 1 Mar 2024 16:45:38 +0000 Subject: [PATCH] [automation] transform lesson to sandpaper --- .github/workflows/README.md | 198 +++ .github/workflows/pr-close-signal.yaml | 23 + .github/workflows/pr-comment.yaml | 185 +++ .github/workflows/pr-post-remove-branch.yaml | 32 + .github/workflows/pr-preflight.yaml | 39 + .github/workflows/pr-receive.yaml | 131 ++ .github/workflows/sandpaper-main.yaml | 61 + .github/workflows/sandpaper-version.txt | 1 + .github/workflows/update-cache.yaml | 125 ++ .github/workflows/update-workflows.yaml | 66 + .github/workflows/workbench-beta-phase.yml | 60 + .gitignore | 41 +- CODE_OF_CONDUCT.md | 8 +- CONTRIBUTING.md | 269 ++-- LICENSE.md | 100 +- README.md | 24 +- _extras/figures.md | 79 -- _extras/guide.md | 6 - _extras/slides.md | 13 - config.yaml | 88 ++ ...-introduction-to-high-dimensional-data.Rmd | 435 ++++++ episodes/02-high-dimensional-regression.Rmd | 1201 +++++++++++++++++ episodes/03-regression-regularisation.Rmd | 1157 ++++++++++++++++ episodes/04-principal-component-analysis.Rmd | 841 ++++++++++++ episodes/05-factor-analysis.Rmd | 318 +++++ episodes/06-k-means.Rmd | 601 +++++++++ episodes/07-hierarchical.Rmd | 741 ++++++++++ {_extras => episodes/data}/.gitkeep | 0 {data => episodes/data}/cancer_expression.R | 190 +-- {data => episodes/data}/cancer_expression.rds | Bin {data => episodes/data}/coefHorvath-raw.txt | 0 {data => episodes/data}/coefHorvath.R | 0 {data => episodes/data}/coefHorvath.rds | Bin {data => episodes/data}/methylation.R | 0 {data => episodes/data}/methylation.rds | Bin {data => episodes/data}/prostate.R | 0 {data => episodes/data}/prostate.csv | 0 {data => episodes/data}/prostate.rds | Bin {data => episodes/data}/scrnaseq.R | 0 {data => episodes/data}/scrnaseq.rds | Bin {data => episodes/fig}/.gitkeep | 0 .../fig}/bio_index_vs_percentage_fallow.png | Bin {fig => episodes/fig}/bs_fs.png | Bin {fig => episodes/fig}/bs_fs_lasso.png | Bin {fig => episodes/fig}/cross_validation.png | Bin {fig => episodes/fig}/cross_validation.svg | Bin .../fig}/hierarchical_clustering_1.png | Bin .../fig}/hierarchical_clustering_2.png | Bin .../fig}/hierarchical_clustering_3.png | Bin {fig => episodes/fig}/intro-scatterplot.png | Bin {fig => episodes/fig}/intro-table.png | Bin {fig => episodes/fig}/kmeans.R | 0 {fig => episodes/fig}/kmeans.gif | Bin .../fig}/microbiome_schematic.png | Bin .../fig}/one_dimension_species_vs_site.png | Bin {fig => episodes/fig}/pca-animation.R | 0 {fig => episodes/fig}/pendulum.gif | Bin .../fig}/rmd-01-pairs-prostate-1.png | Bin {fig => episodes/fig}/rmd-01-plot-lm-1.png | Bin {fig => episodes/fig}/rmd-01-plot-lm-2.png | Bin {fig => episodes/fig}/rmd-01-plot-lm-3.png | Bin {fig => episodes/fig}/rmd-01-plot-lm-4.png | Bin .../fig}/rmd-01-plot-random-1.png | Bin .../fig}/rmd-01-plot-random-2.png | Bin .../fig}/rmd-01-plot-random-3.png | Bin {fig => episodes/fig}/rmd-02-example1-1.png | Bin {fig => episodes/fig}/rmd-02-example2-1.png | Bin {fig => episodes/fig}/rmd-02-example3-1.png | Bin {fig => episodes/fig}/rmd-02-heatmap-1.png | Bin {fig => episodes/fig}/rmd-02-histx-1.png | Bin {fig => episodes/fig}/rmd-02-limmavolc1-1.png | Bin {fig => episodes/fig}/rmd-02-limmavolc2-1.png | Bin {fig => episodes/fig}/rmd-02-p-fdr-1.png | Bin {fig => episodes/fig}/rmd-02-p-fwer-1.png | Bin .../fig}/rmd-02-plot-fdr-fwer-1.png | Bin .../fig}/rmd-02-plot-limma-lm-effect-1.png | Bin .../fig}/rmd-02-plot-limma-lm-pval-1.png | Bin 
.../fig}/rmd-02-plot-lm-methyl1-1.png | Bin .../fig}/rmd-02-screening-cor-1.png | Bin .../fig}/rmd-02-screening-var-1.png | Bin {fig => episodes/fig}/rmd-02-tdist-1.png | Bin .../fig}/rmd-02-volcplotfake-1.png | Bin {fig => episodes/fig}/rmd-03-binomial-1.png | Bin .../fig}/rmd-03-chooselambda-1.png | Bin .../fig}/rmd-03-coef-ridge-lm-1.png | Bin .../fig}/rmd-03-corr-mat-meth-1.png | Bin .../fig}/rmd-03-corr-mat-prostate-1.png | Bin {fig => episodes/fig}/rmd-03-elastic-1.png | Bin .../fig}/rmd-03-elastic-contour-1.png | Bin {fig => episodes/fig}/rmd-03-elastic-cv-1.png | Bin .../fig}/rmd-03-elastic-plot-1.png | Bin .../fig}/rmd-03-heatmap-lasso-1.png | Bin {fig => episodes/fig}/rmd-03-lasso-cv-1.png | Bin {fig => episodes/fig}/rmd-03-plot-ridge-1.png | Bin .../fig}/rmd-03-plot-ridge-prediction-1.png | Bin {fig => episodes/fig}/rmd-03-plotlas-1.png | Bin {fig => episodes/fig}/rmd-03-regplot-1.png | Bin {fig => episodes/fig}/rmd-03-ridgeplot-1.png | Bin .../fig}/rmd-03-shrink-lasso-1.png | Bin .../fig}/rmd-03-test-plot-lm-1.png | Bin {fig => episodes/fig}/rmd-04-fit-scale-1.png | Bin {fig => episodes/fig}/rmd-04-likelihood-1.png | Bin {fig => episodes/fig}/rmd-04-residuals-1.png | Bin {fig => episodes/fig}/rmd-05-biplot-ex-1.png | Bin {fig => episodes/fig}/rmd-05-pairsplot-1.png | Bin {fig => episodes/fig}/rmd-05-pca-biplot-1.png | Bin .../fig}/rmd-05-pca-biplot-ex2-1.png | Bin .../fig}/rmd-05-pca-loadings-1.png | Bin {fig => episodes/fig}/rmd-05-scree-ex-1.png | Bin .../fig}/rmd-05-stats-biplot-1.png | Bin {fig => episodes/fig}/rmd-05-var-hist-1.png | Bin {fig => episodes/fig}/rmd-05-vardf-plot-1.png | Bin {fig => episodes/fig}/rmd-06-biplot-1.png | Bin {fig => episodes/fig}/rmd-07-k-ex-1.png | Bin {fig => episodes/fig}/rmd-07-ordplot-ex-1.png | Bin {fig => episodes/fig}/rmd-07-ordplot-ex-2.png | Bin {fig => episodes/fig}/rmd-07-ordplot-ex-3.png | Bin {fig => episodes/fig}/rmd-07-ordplot-ex-4.png | Bin {fig => episodes/fig}/rmd-07-ordplot1-1.png | Bin {fig => episodes/fig}/rmd-07-ordplot2-1.png | Bin {fig => episodes/fig}/rmd-07-ordplot3-1.png | Bin .../fig}/rmd-07-ordplots-123-1.png | Bin .../fig}/rmd-07-ordplots-123-2.png | Bin .../fig}/rmd-07-ordplots-123-3.png | Bin {fig => episodes/fig}/rmd-07-stressplot-1.png | Bin {fig => episodes/fig}/rmd-07-vegan-3d-1.png | Bin {fig => episodes/fig}/rmd-08-boots-1.png | Bin {fig => episodes/fig}/rmd-08-bs-ex-1.png | Bin {fig => episodes/fig}/rmd-08-bs-heatmap-1.png | Bin .../fig}/rmd-08-fake-cluster-1.png | Bin {fig => episodes/fig}/rmd-08-kmeans-1.png | Bin {fig => episodes/fig}/rmd-08-kmeans-ex-1.png | Bin .../fig}/rmd-08-plot-silhouette-1.png | Bin {fig => episodes/fig}/rmd-08-silhouette-1.png | Bin .../fig}/rmd-08-silhouette-ex-1.png | Bin .../fig}/rmd-08-unnamed-chunk-1-1.png | Bin .../fig}/rmd-09-clust-cor-cor-example-1.png | Bin .../fig}/rmd-09-clust-euc-cor-example-1.png | Bin {fig => episodes/fig}/rmd-09-cutree-1.png | Bin {fig => episodes/fig}/rmd-09-dunn-ex-1.png | Bin .../fig}/rmd-09-h-k-ex-plot-1.png | Bin .../fig}/rmd-09-h-k-ex-plot-2.png | Bin .../fig}/rmd-09-hclust-fig3-1.png | Bin .../fig}/rmd-09-hclust-fig3-2.png | Bin .../fig}/rmd-09-heatmap-clust-1.png | Bin .../fig}/rmd-09-heatmap-cor-cor-example-1.png | Bin .../fig}/rmd-09-heatmap-cor-cor-example-2.png | Bin .../fig}/rmd-09-heatmap-cor-example-1.png | Bin .../fig}/rmd-09-heatmap-noclust-1.png | Bin .../fig}/rmd-09-lineplot-cor-example-1.png | Bin .../fig}/rmd-09-plot-clust-average-1.png | Bin .../fig}/rmd-09-plot-clust-centroid-1.png | Bin .../fig}/rmd-09-plot-clust-comp-1.png 
| Bin .../fig}/rmd-09-plot-clust-dunn-1.png | Bin .../fig}/rmd-09-plot-clust-mcq-1.png | Bin .../fig}/rmd-09-plot-clust-median-1.png | Bin .../fig}/rmd-09-plot-clust-method-1.png | Bin .../fig}/rmd-09-plot-clust-method-2.png | Bin .../fig}/rmd-09-plot-clust-single-1.png | Bin .../fig}/rmd-09-plot-clust-ward-1.png | Bin .../fig}/rmd-09-plotclustex-1.png | Bin .../fig}/rmd-09-plotexample-1.png | Bin {fig => episodes/fig}/rmd-10-fit-dlnorm-1.png | Bin {fig => episodes/fig}/rmd-10-fit-mixem-1.png | Bin {fig => episodes/fig}/rmd-10-fit-univar-1.png | Bin .../fig}/rmd-10-mix-converged-1.png | Bin {fig => episodes/fig}/rmd-10-mix-expt-1.png | Bin {fig => episodes/fig}/rmd-10-mix2-1.png | Bin {fig => episodes/fig}/rmd-10-mix3-1.png | Bin {fig => episodes/fig}/rmd-10-mix3_2-1.png | Bin {fig => episodes/fig}/rmd-10-mixture-1.png | Bin {fig => episodes/fig}/rmd-10-mixture-2.png | Bin .../fig}/rmd-10-mixture-animation-1.png | Bin .../fig}/rmd-10-mixture-data-1.png | Bin {fig => episodes/fig}/rmd-10-mvnorm-1.png | Bin {fig => episodes/fig}/rmd-10-mvnormcor-1.png | Bin {fig => episodes/fig}/rmd-10-norms-1.png | Bin {fig => episodes/fig}/rmd-10-pcs-1.png | Bin {fig => episodes/fig}/rmd-10-reddim-1.png | Bin {fig => episodes/fig}/rmd-10-tsne-1.png | Bin {fig => episodes/fig}/rmd-10-unimodal-1.png | Bin {fig => episodes/fig}/rotating.gif | Bin {fig => episodes/fig}/silhouette5.png | Bin {fig => episodes/fig}/table_for_fa.png | Bin {fig => episodes/fig}/training_test.png | Bin .../fig}/two_dimension_species_vs_site.png | Bin {fig => episodes/fig}/validation.png | Bin {fig => episodes/fig}/validation.tex | 0 {fig => episodes/files}/.gitkeep | 0 files/.gitkeep | 0 index.md | 30 +- {_extras => instructors}/about.md | 5 +- instructors/instructor-notes.md | 9 + instructors/slides.md | 16 + reference.md => learners/reference.md | 6 +- setup.md => learners/setup.md | 7 +- profiles/learner-profiles.md | 5 + renv/activate.R | 1180 ++++++++++++++++ renv/profile | 1 + renv/profiles/lesson-requirements/renv.lock | 407 ++++++ site/README.md | 2 + 201 files changed, 8270 insertions(+), 431 deletions(-) create mode 100755 .github/workflows/README.md create mode 100755 .github/workflows/pr-close-signal.yaml create mode 100755 .github/workflows/pr-comment.yaml create mode 100755 .github/workflows/pr-post-remove-branch.yaml create mode 100755 .github/workflows/pr-preflight.yaml create mode 100755 .github/workflows/pr-receive.yaml create mode 100755 .github/workflows/sandpaper-main.yaml create mode 100644 .github/workflows/sandpaper-version.txt create mode 100755 .github/workflows/update-cache.yaml create mode 100755 .github/workflows/update-workflows.yaml create mode 100644 .github/workflows/workbench-beta-phase.yml delete mode 100644 _extras/figures.md delete mode 100644 _extras/guide.md delete mode 100644 _extras/slides.md create mode 100644 config.yaml create mode 100644 episodes/01-introduction-to-high-dimensional-data.Rmd create mode 100644 episodes/02-high-dimensional-regression.Rmd create mode 100644 episodes/03-regression-regularisation.Rmd create mode 100644 episodes/04-principal-component-analysis.Rmd create mode 100644 episodes/05-factor-analysis.Rmd create mode 100644 episodes/06-k-means.Rmd create mode 100644 episodes/07-hierarchical.Rmd rename {_extras => episodes/data}/.gitkeep (100%) rename {data => episodes/data}/cancer_expression.R (96%) rename {data => episodes/data}/cancer_expression.rds (100%) rename {data => episodes/data}/coefHorvath-raw.txt (100%) rename {data => episodes/data}/coefHorvath.R (100%) 
rename {data => episodes/data}/coefHorvath.rds (100%) rename {data => episodes/data}/methylation.R (100%) rename {data => episodes/data}/methylation.rds (100%) rename {data => episodes/data}/prostate.R (100%) rename {data => episodes/data}/prostate.csv (100%) rename {data => episodes/data}/prostate.rds (100%) rename {data => episodes/data}/scrnaseq.R (100%) rename {data => episodes/data}/scrnaseq.rds (100%) rename {data => episodes/fig}/.gitkeep (100%) rename {fig => episodes/fig}/bio_index_vs_percentage_fallow.png (100%) rename {fig => episodes/fig}/bs_fs.png (100%) rename {fig => episodes/fig}/bs_fs_lasso.png (100%) rename {fig => episodes/fig}/cross_validation.png (100%) rename {fig => episodes/fig}/cross_validation.svg (100%) rename {fig => episodes/fig}/hierarchical_clustering_1.png (100%) rename {fig => episodes/fig}/hierarchical_clustering_2.png (100%) rename {fig => episodes/fig}/hierarchical_clustering_3.png (100%) rename {fig => episodes/fig}/intro-scatterplot.png (100%) rename {fig => episodes/fig}/intro-table.png (100%) rename {fig => episodes/fig}/kmeans.R (100%) rename {fig => episodes/fig}/kmeans.gif (100%) rename {fig => episodes/fig}/microbiome_schematic.png (100%) rename {fig => episodes/fig}/one_dimension_species_vs_site.png (100%) rename {fig => episodes/fig}/pca-animation.R (100%) rename {fig => episodes/fig}/pendulum.gif (100%) rename {fig => episodes/fig}/rmd-01-pairs-prostate-1.png (100%) rename {fig => episodes/fig}/rmd-01-plot-lm-1.png (100%) rename {fig => episodes/fig}/rmd-01-plot-lm-2.png (100%) rename {fig => episodes/fig}/rmd-01-plot-lm-3.png (100%) rename {fig => episodes/fig}/rmd-01-plot-lm-4.png (100%) rename {fig => episodes/fig}/rmd-01-plot-random-1.png (100%) rename {fig => episodes/fig}/rmd-01-plot-random-2.png (100%) rename {fig => episodes/fig}/rmd-01-plot-random-3.png (100%) rename {fig => episodes/fig}/rmd-02-example1-1.png (100%) rename {fig => episodes/fig}/rmd-02-example2-1.png (100%) rename {fig => episodes/fig}/rmd-02-example3-1.png (100%) rename {fig => episodes/fig}/rmd-02-heatmap-1.png (100%) rename {fig => episodes/fig}/rmd-02-histx-1.png (100%) rename {fig => episodes/fig}/rmd-02-limmavolc1-1.png (100%) rename {fig => episodes/fig}/rmd-02-limmavolc2-1.png (100%) rename {fig => episodes/fig}/rmd-02-p-fdr-1.png (100%) rename {fig => episodes/fig}/rmd-02-p-fwer-1.png (100%) rename {fig => episodes/fig}/rmd-02-plot-fdr-fwer-1.png (100%) rename {fig => episodes/fig}/rmd-02-plot-limma-lm-effect-1.png (100%) rename {fig => episodes/fig}/rmd-02-plot-limma-lm-pval-1.png (100%) rename {fig => episodes/fig}/rmd-02-plot-lm-methyl1-1.png (100%) rename {fig => episodes/fig}/rmd-02-screening-cor-1.png (100%) rename {fig => episodes/fig}/rmd-02-screening-var-1.png (100%) rename {fig => episodes/fig}/rmd-02-tdist-1.png (100%) rename {fig => episodes/fig}/rmd-02-volcplotfake-1.png (100%) rename {fig => episodes/fig}/rmd-03-binomial-1.png (100%) rename {fig => episodes/fig}/rmd-03-chooselambda-1.png (100%) rename {fig => episodes/fig}/rmd-03-coef-ridge-lm-1.png (100%) rename {fig => episodes/fig}/rmd-03-corr-mat-meth-1.png (100%) rename {fig => episodes/fig}/rmd-03-corr-mat-prostate-1.png (100%) rename {fig => episodes/fig}/rmd-03-elastic-1.png (100%) rename {fig => episodes/fig}/rmd-03-elastic-contour-1.png (100%) rename {fig => episodes/fig}/rmd-03-elastic-cv-1.png (100%) rename {fig => episodes/fig}/rmd-03-elastic-plot-1.png (100%) rename {fig => episodes/fig}/rmd-03-heatmap-lasso-1.png (100%) rename {fig => episodes/fig}/rmd-03-lasso-cv-1.png (100%) 
rename {fig => episodes/fig}/rmd-03-plot-ridge-1.png (100%) rename {fig => episodes/fig}/rmd-03-plot-ridge-prediction-1.png (100%) rename {fig => episodes/fig}/rmd-03-plotlas-1.png (100%) rename {fig => episodes/fig}/rmd-03-regplot-1.png (100%) rename {fig => episodes/fig}/rmd-03-ridgeplot-1.png (100%) rename {fig => episodes/fig}/rmd-03-shrink-lasso-1.png (100%) rename {fig => episodes/fig}/rmd-03-test-plot-lm-1.png (100%) rename {fig => episodes/fig}/rmd-04-fit-scale-1.png (100%) rename {fig => episodes/fig}/rmd-04-likelihood-1.png (100%) rename {fig => episodes/fig}/rmd-04-residuals-1.png (100%) rename {fig => episodes/fig}/rmd-05-biplot-ex-1.png (100%) rename {fig => episodes/fig}/rmd-05-pairsplot-1.png (100%) rename {fig => episodes/fig}/rmd-05-pca-biplot-1.png (100%) rename {fig => episodes/fig}/rmd-05-pca-biplot-ex2-1.png (100%) rename {fig => episodes/fig}/rmd-05-pca-loadings-1.png (100%) rename {fig => episodes/fig}/rmd-05-scree-ex-1.png (100%) rename {fig => episodes/fig}/rmd-05-stats-biplot-1.png (100%) rename {fig => episodes/fig}/rmd-05-var-hist-1.png (100%) rename {fig => episodes/fig}/rmd-05-vardf-plot-1.png (100%) rename {fig => episodes/fig}/rmd-06-biplot-1.png (100%) rename {fig => episodes/fig}/rmd-07-k-ex-1.png (100%) rename {fig => episodes/fig}/rmd-07-ordplot-ex-1.png (100%) rename {fig => episodes/fig}/rmd-07-ordplot-ex-2.png (100%) rename {fig => episodes/fig}/rmd-07-ordplot-ex-3.png (100%) rename {fig => episodes/fig}/rmd-07-ordplot-ex-4.png (100%) rename {fig => episodes/fig}/rmd-07-ordplot1-1.png (100%) rename {fig => episodes/fig}/rmd-07-ordplot2-1.png (100%) rename {fig => episodes/fig}/rmd-07-ordplot3-1.png (100%) rename {fig => episodes/fig}/rmd-07-ordplots-123-1.png (100%) rename {fig => episodes/fig}/rmd-07-ordplots-123-2.png (100%) rename {fig => episodes/fig}/rmd-07-ordplots-123-3.png (100%) rename {fig => episodes/fig}/rmd-07-stressplot-1.png (100%) rename {fig => episodes/fig}/rmd-07-vegan-3d-1.png (100%) rename {fig => episodes/fig}/rmd-08-boots-1.png (100%) rename {fig => episodes/fig}/rmd-08-bs-ex-1.png (100%) rename {fig => episodes/fig}/rmd-08-bs-heatmap-1.png (100%) rename {fig => episodes/fig}/rmd-08-fake-cluster-1.png (100%) rename {fig => episodes/fig}/rmd-08-kmeans-1.png (100%) rename {fig => episodes/fig}/rmd-08-kmeans-ex-1.png (100%) rename {fig => episodes/fig}/rmd-08-plot-silhouette-1.png (100%) rename {fig => episodes/fig}/rmd-08-silhouette-1.png (100%) rename {fig => episodes/fig}/rmd-08-silhouette-ex-1.png (100%) rename {fig => episodes/fig}/rmd-08-unnamed-chunk-1-1.png (100%) rename {fig => episodes/fig}/rmd-09-clust-cor-cor-example-1.png (100%) rename {fig => episodes/fig}/rmd-09-clust-euc-cor-example-1.png (100%) rename {fig => episodes/fig}/rmd-09-cutree-1.png (100%) rename {fig => episodes/fig}/rmd-09-dunn-ex-1.png (100%) rename {fig => episodes/fig}/rmd-09-h-k-ex-plot-1.png (100%) rename {fig => episodes/fig}/rmd-09-h-k-ex-plot-2.png (100%) rename {fig => episodes/fig}/rmd-09-hclust-fig3-1.png (100%) rename {fig => episodes/fig}/rmd-09-hclust-fig3-2.png (100%) rename {fig => episodes/fig}/rmd-09-heatmap-clust-1.png (100%) rename {fig => episodes/fig}/rmd-09-heatmap-cor-cor-example-1.png (100%) rename {fig => episodes/fig}/rmd-09-heatmap-cor-cor-example-2.png (100%) rename {fig => episodes/fig}/rmd-09-heatmap-cor-example-1.png (100%) rename {fig => episodes/fig}/rmd-09-heatmap-noclust-1.png (100%) rename {fig => episodes/fig}/rmd-09-lineplot-cor-example-1.png (100%) rename {fig => episodes/fig}/rmd-09-plot-clust-average-1.png 
(100%) rename {fig => episodes/fig}/rmd-09-plot-clust-centroid-1.png (100%) rename {fig => episodes/fig}/rmd-09-plot-clust-comp-1.png (100%) rename {fig => episodes/fig}/rmd-09-plot-clust-dunn-1.png (100%) rename {fig => episodes/fig}/rmd-09-plot-clust-mcq-1.png (100%) rename {fig => episodes/fig}/rmd-09-plot-clust-median-1.png (100%) rename {fig => episodes/fig}/rmd-09-plot-clust-method-1.png (100%) rename {fig => episodes/fig}/rmd-09-plot-clust-method-2.png (100%) rename {fig => episodes/fig}/rmd-09-plot-clust-single-1.png (100%) rename {fig => episodes/fig}/rmd-09-plot-clust-ward-1.png (100%) rename {fig => episodes/fig}/rmd-09-plotclustex-1.png (100%) rename {fig => episodes/fig}/rmd-09-plotexample-1.png (100%) rename {fig => episodes/fig}/rmd-10-fit-dlnorm-1.png (100%) rename {fig => episodes/fig}/rmd-10-fit-mixem-1.png (100%) rename {fig => episodes/fig}/rmd-10-fit-univar-1.png (100%) rename {fig => episodes/fig}/rmd-10-mix-converged-1.png (100%) rename {fig => episodes/fig}/rmd-10-mix-expt-1.png (100%) rename {fig => episodes/fig}/rmd-10-mix2-1.png (100%) rename {fig => episodes/fig}/rmd-10-mix3-1.png (100%) rename {fig => episodes/fig}/rmd-10-mix3_2-1.png (100%) rename {fig => episodes/fig}/rmd-10-mixture-1.png (100%) rename {fig => episodes/fig}/rmd-10-mixture-2.png (100%) rename {fig => episodes/fig}/rmd-10-mixture-animation-1.png (100%) rename {fig => episodes/fig}/rmd-10-mixture-data-1.png (100%) rename {fig => episodes/fig}/rmd-10-mvnorm-1.png (100%) rename {fig => episodes/fig}/rmd-10-mvnormcor-1.png (100%) rename {fig => episodes/fig}/rmd-10-norms-1.png (100%) rename {fig => episodes/fig}/rmd-10-pcs-1.png (100%) rename {fig => episodes/fig}/rmd-10-reddim-1.png (100%) rename {fig => episodes/fig}/rmd-10-tsne-1.png (100%) rename {fig => episodes/fig}/rmd-10-unimodal-1.png (100%) rename {fig => episodes/fig}/rotating.gif (100%) rename {fig => episodes/fig}/silhouette5.png (100%) rename {fig => episodes/fig}/table_for_fa.png (100%) rename {fig => episodes/fig}/training_test.png (100%) rename {fig => episodes/fig}/two_dimension_species_vs_site.png (100%) rename {fig => episodes/fig}/validation.png (100%) rename {fig => episodes/fig}/validation.tex (100%) rename {fig => episodes/files}/.gitkeep (100%) delete mode 100644 files/.gitkeep rename {_extras => instructors}/about.md (69%) create mode 100644 instructors/instructor-notes.md create mode 100644 instructors/slides.md rename reference.md => learners/reference.md (67%) rename setup.md => learners/setup.md (95%) create mode 100644 profiles/learner-profiles.md create mode 100644 renv/activate.R create mode 100644 renv/profile create mode 100644 renv/profiles/lesson-requirements/renv.lock create mode 100644 site/README.md diff --git a/.github/workflows/README.md b/.github/workflows/README.md new file mode 100755 index 00000000..d6edf88d --- /dev/null +++ b/.github/workflows/README.md @@ -0,0 +1,198 @@ +# Carpentries Workflows + +This directory contains workflows to be used for lessons using the {sandpaper} +lesson infrastructure. Two of these workflows require R (`sandpaper-main.yaml` +and `pr-receive.yaml`) and the rest are bots to handle pull request management. + +These workflows will likely change as {sandpaper} evolves, so it is important to +keep them up-to-date.
To do this in your lesson, you can run the following in your +R console: + +```r +# Install/Update sandpaper +options(repos = c(carpentries = "https://carpentries.r-universe.dev/", + CRAN = "https://cloud.r-project.org")) +install.packages("sandpaper") + +# update the workflows in your lesson +library("sandpaper") +update_github_workflows() +``` + +Inside this folder, you will find a file called `sandpaper-version.txt`, which +will contain a version number for sandpaper. This will be used in the future to +alert you if a workflow update is needed. + +What follows are the descriptions of the workflow files: + +## Deployment + +### 01 Build and Deploy (sandpaper-main.yaml) + +This is the main driver that will only act on the main branch of the repository. +This workflow does the following: + + 1. checks out the lesson + 2. provisions the following resources + - R + - pandoc + - lesson infrastructure (stored in a cache) + - lesson dependencies if needed (stored in a cache) + 3. builds the lesson via `sandpaper:::ci_deploy()` + +#### Caching + +This workflow has two caches; one cache is for the lesson infrastructure and +the other is for the lesson dependencies if the lesson contains rendered +content. These caches are invalidated by new versions of the infrastructure and +the `renv.lock` file, respectively. If there is a problem with the cache, +manual invalidation is necessary. You will need maintain access to the repository, +and you can either go to the Actions tab and [click on the caches button to find +and invalidate the failing cache](https://github.blog/changelog/2022-10-20-manage-caches-in-your-actions-workflows-from-web-interface/) +or set the `CACHE_VERSION` secret to the current date (which will +invalidate all of the caches). + +## Updates + +### Setup Information + +These workflows run on a schedule and at the maintainer's request. Because they +create pull requests that update workflows/require the downstream actions to run, +they need a special repository/organization secret token called +`SANDPAPER_WORKFLOW` and it must have the `public_repo` and `workflow` scopes. + +This can be an individual user token, OR it can be a trusted bot account. If you +have a repository in one of the official Carpentries accounts, then you do not +need to worry about this token being present because the Carpentries Core Team +will take care of supplying this token. + +If you want to use your personal account: you can go to +https://github.com/settings/tokens/new?scopes=repo,workflow +to create a token. Once you have created your token, you should copy it to your +clipboard and then go to your repository's settings > secrets > actions and +create or edit the `SANDPAPER_WORKFLOW` secret, pasting in the generated token. + +If you do not specify your token correctly, the runs will not fail and they will +give you instructions to provide the token for your repository. + +### 02 Maintain: Update Workflow Files (update-workflows.yaml) + +The {sandpaper} repository was designed to do as much as possible to separate +the tools from the content. For local builds, this is absolutely true, but +there is a minor issue when it comes to workflow files: they must live inside +the repository. + +This workflow ensures that the workflow files are up-to-date. The way it works is +to download the update-workflows.sh script from GitHub and run it. The script +will do the following: + +1. check the recorded version of sandpaper against the current version on GitHub +2.
update the files if there is a difference in versions + +After the files are updated, if there are any changes, they are pushed to a +branch called `update/workflows` and a pull request is created. Maintainers are +encouraged to review the changes and accept the pull request if the outputs +are okay. + +This update is run ~~weekly or~~ on demand. + +### 03 Maintain: Update Package Cache (update-cache.yaml) + +For lessons that have generated content, we use {renv} to ensure that the output +is stable. This is controlled by a single lockfile which documents the packages +needed for the lesson and the version numbers. This workflow is skipped in +lessons that do not have generated content. + +Because the lessons need to remain current with the package ecosystem, it's a +good idea to make sure these packages can be updated periodically. The +update cache workflow will do this by checking for updates, applying them in a +branch called `update/packages` and creating a pull request with _only the +lockfile changed_. + +From here, the markdown documents will be rebuilt and you can inspect what has +changed based on how the packages have updated. + +## Pull Request and Review Management + +Because our lessons execute code, pull requests are a security risk for any +lesson and thus have security measures associated with them. **Do not merge any +pull requests that do not pass checks and do not have a bot comment on them.** + +These workflows all go together and are described in the following +diagram and the sections below: + +![Graph representation of a pull request](https://carpentries.github.io/sandpaper/articles/img/pr-flow.dot.svg) + +### Pre Flight Pull Request Validation (pr-preflight.yaml) + +This workflow runs every time a pull request is created and its purpose is to +validate that the pull request is okay to run. This means the following things: + +1. The pull request does not contain modified workflow files +2. If the pull request contains modified workflow files, it does not contain + modified content files (such as a situation where @carpentries-bot will + make an automated pull request) +3. The pull request does not contain an invalid commit hash (e.g. from a fork + that was made before a lesson was transitioned from styles to use the + workbench). + +Once the checks are finished, a comment is issued to the pull request, which +will allow maintainers to determine if it is safe to run the +"Receive Pull Request" workflow from new contributors. + +### Receive Pull Request (pr-receive.yaml) + +**Note of caution:** This workflow runs arbitrary code submitted by anyone who creates a +pull request. GitHub has safeguarded the token used in this workflow to have no +privileges in the repository, but we have taken precautions to protect against +spoofing. + +This workflow is triggered with every push to a pull request. If this workflow +is already running and a new push is sent to the pull request, the workflow +running from the previous push will be cancelled and a new workflow run will be +started. + +The first step of this workflow is to check if it is valid (e.g. that no +workflow files have been modified). If there are workflow files that have been +modified, a comment is made that indicates that the workflow will not be run. If +both a workflow file and lesson content are modified, an error will occur. + +The second step (if valid) is to build the generated content from the pull +request. This builds the content and uploads three artifacts: + +1. The pull request number (pr) +2.
A summary of changes after the rendering process (diff) +3. The rendered files (build) + +Because this workflow builds generated content, it follows the same general +process as the `sandpaper-main` workflow with the same caching mechanisms. + +The artifacts produced are used by the next workflow. + +### Comment on Pull Request (pr-comment.yaml) + +This workflow is triggered if the `pr-receive.yaml` workflow is successful. +The steps in this workflow are: + +1. Test if the workflow is valid and post the result as a comment on the + pull request. +2. If it is valid: create an orphan branch with two commits: the current state + of the repository and the proposed changes. +3. If it is valid: update the pull request comment with the summary of changes. + +Importantly: if the pull request is invalid, the branch is not created so any +malicious code is not published. + +From here, the maintainer can request changes from the author and eventually +either merge or reject the PR. When this happens, if the PR was valid, the +preview branch needs to be deleted. + +### Send Close PR Signal (pr-close-signal.yaml) + +Triggered any time a pull request is closed. This emits an artifact that is the +pull request number for the next action. + +### Remove Pull Request Branch (pr-post-remove-branch.yaml) + +Triggered by `pr-close-signal.yaml`. This removes the temporary branch associated with +the pull request (if it was created). diff --git a/.github/workflows/pr-close-signal.yaml b/.github/workflows/pr-close-signal.yaml new file mode 100755 index 00000000..9b129d5d --- /dev/null +++ b/.github/workflows/pr-close-signal.yaml @@ -0,0 +1,23 @@ +name: "Bot: Send Close Pull Request Signal" + +on: + pull_request: + types: + [closed] + +jobs: + send-close-signal: + name: "Send closing signal" + runs-on: ubuntu-latest + if: ${{ github.event.action == 'closed' }} + steps: + - name: "Create PRtifact" + run: | + mkdir -p ./pr + printf ${{ github.event.number }} > ./pr/NUM + - name: Upload Diff + uses: actions/upload-artifact@v3 + with: + name: pr + path: ./pr + diff --git a/.github/workflows/pr-comment.yaml b/.github/workflows/pr-comment.yaml new file mode 100755 index 00000000..bb2eb03c --- /dev/null +++ b/.github/workflows/pr-comment.yaml @@ -0,0 +1,185 @@ +name: "Bot: Comment on the Pull Request" + +# read-write repo token +# access to secrets +on: + workflow_run: + workflows: ["Receive Pull Request"] + types: + - completed + +concurrency: + group: pr-${{ github.event.workflow_run.pull_requests[0].number }} + cancel-in-progress: true + + +jobs: + # Pull requests are valid if: + # - they match the sha of the workflow run head commit + # - they are open + # - no .github files were committed + test-pr: + name: "Test if pull request is valid" + runs-on: ubuntu-latest + if: > + github.event.workflow_run.event == 'pull_request' && + github.event.workflow_run.conclusion == 'success' + outputs: + is_valid: ${{ steps.check-pr.outputs.VALID }} + payload: ${{ steps.check-pr.outputs.payload }} + number: ${{ steps.get-pr.outputs.NUM }} + msg: ${{ steps.check-pr.outputs.MSG }} + steps: + - name: 'Download PR artifact' + id: dl + uses: carpentries/actions/download-workflow-artifact@main + with: + run: ${{ github.event.workflow_run.id }} + name: 'pr' + + - name: "Get PR Number" + if: ${{ steps.dl.outputs.success == 'true' }} + id: get-pr + run: | + unzip pr.zip + echo "NUM=$(<./NR)" >> $GITHUB_OUTPUT + + - name: "Fail if PR number was not present" + id: bad-pr + if: ${{ steps.dl.outputs.success != 'true' }} + run: | + echo
'::error::A pull request number was not recorded. The pull request that triggered this workflow is likely malicious.' + exit 1 + - name: "Get Invalid Hashes File" + id: hash + run: | + echo "json<<EOF + $(curl -sL https://files.carpentries.org/invalid-hashes.json) + EOF" >> $GITHUB_OUTPUT + - name: "Check PR" + id: check-pr + if: ${{ steps.dl.outputs.success == 'true' }} + uses: carpentries/actions/check-valid-pr@main + with: + pr: ${{ steps.get-pr.outputs.NUM }} + sha: ${{ github.event.workflow_run.head_sha }} + headroom: 3 # if it's within the last three commits, we can keep going, because it's likely rapid-fire + invalid: ${{ fromJSON(steps.hash.outputs.json)[github.repository] }} + fail_on_error: true + + # Create an orphan branch on this repository with two commits + # - the current HEAD of the md-outputs branch + # - the output from running the current HEAD of the pull request through + # the md generator + create-branch: + name: "Create Git Branch" + needs: test-pr + runs-on: ubuntu-latest + if: ${{ needs.test-pr.outputs.is_valid == 'true' }} + env: + NR: ${{ needs.test-pr.outputs.number }} + permissions: + contents: write + steps: + - name: 'Checkout md outputs' + uses: actions/checkout@v3 + with: + ref: md-outputs + path: built + fetch-depth: 1 + + - name: 'Download built markdown' + id: dl + uses: carpentries/actions/download-workflow-artifact@main + with: + run: ${{ github.event.workflow_run.id }} + name: 'built' + + - if: ${{ steps.dl.outputs.success == 'true' }} + run: unzip built.zip + + - name: "Create orphan and push" + if: ${{ steps.dl.outputs.success == 'true' }} + run: | + cd built/ + git config --local user.email "actions@github.com" + git config --local user.name "GitHub Actions" + CURR_HEAD=$(git rev-parse HEAD) + git checkout --orphan md-outputs-PR-${NR} + git add -A + git commit -m "source commit: ${CURR_HEAD}" + ls -A | grep -v '^.git$' | xargs -I _ rm -r '_' + cd ..
+ unzip -o -d built built.zip + cd built + git add -A + git commit --allow-empty -m "differences for PR #${NR}" + git push -u --force --set-upstream origin md-outputs-PR-${NR} + + # Comment on the Pull Request with a link to the branch and the diff + comment-pr: + name: "Comment on Pull Request" + needs: [test-pr, create-branch] + runs-on: ubuntu-latest + if: ${{ needs.test-pr.outputs.is_valid == 'true' }} + env: + NR: ${{ needs.test-pr.outputs.number }} + permissions: + pull-requests: write + steps: + - name: 'Download comment artifact' + id: dl + uses: carpentries/actions/download-workflow-artifact@main + with: + run: ${{ github.event.workflow_run.id }} + name: 'diff' + + - if: ${{ steps.dl.outputs.success == 'true' }} + run: unzip ${{ github.workspace }}/diff.zip + + - name: "Comment on PR" + id: comment-diff + if: ${{ steps.dl.outputs.success == 'true' }} + uses: carpentries/actions/comment-diff@main + with: + pr: ${{ env.NR }} + path: ${{ github.workspace }}/diff.md + + # Comment if the PR is open and matches the SHA, but the workflow files have + # changed + comment-changed-workflow: + name: "Comment if workflow files have changed" + needs: test-pr + runs-on: ubuntu-latest + if: ${{ always() && needs.test-pr.outputs.is_valid == 'false' }} + env: + NR: ${{ github.event.workflow_run.pull_requests[0].number }} + body: ${{ needs.test-pr.outputs.msg }} + permissions: + pull-requests: write + steps: + - name: 'Check for spoofing' + id: dl + uses: carpentries/actions/download-workflow-artifact@main + with: + run: ${{ github.event.workflow_run.id }} + name: 'built' + + - name: 'Alert if spoofed' + id: spoof + if: ${{ steps.dl.outputs.success == 'true' }} + run: | + echo 'body<<EOF' >> $GITHUB_ENV + echo '' >> $GITHUB_ENV + echo '## :x: DANGER :x:' >> $GITHUB_ENV + echo 'This pull request has modified workflows that created output. Close this now.'
>> $GITHUB_ENV + echo '' >> $GITHUB_ENV + echo 'EOF' >> $GITHUB_ENV + + - name: "Comment on PR" + id: comment-diff + uses: carpentries/actions/comment-diff@main + with: + pr: ${{ env.NR }} + body: ${{ env.body }} + diff --git a/.github/workflows/pr-post-remove-branch.yaml b/.github/workflows/pr-post-remove-branch.yaml new file mode 100755 index 00000000..62c2e98d --- /dev/null +++ b/.github/workflows/pr-post-remove-branch.yaml @@ -0,0 +1,32 @@ +name: "Bot: Remove Temporary PR Branch" + +on: + workflow_run: + workflows: ["Bot: Send Close Pull Request Signal"] + types: + - completed + +jobs: + delete: + name: "Delete branch from Pull Request" + runs-on: ubuntu-latest + if: > + github.event.workflow_run.event == 'pull_request' && + github.event.workflow_run.conclusion == 'success' + permissions: + contents: write + steps: + - name: 'Download artifact' + uses: carpentries/actions/download-workflow-artifact@main + with: + run: ${{ github.event.workflow_run.id }} + name: pr + - name: "Get PR Number" + id: get-pr + run: | + unzip pr.zip + echo "NUM=$(<./NUM)" >> $GITHUB_OUTPUT + - name: 'Remove branch' + uses: carpentries/actions/remove-branch@main + with: + pr: ${{ steps.get-pr.outputs.NUM }} diff --git a/.github/workflows/pr-preflight.yaml b/.github/workflows/pr-preflight.yaml new file mode 100755 index 00000000..d0d7420d --- /dev/null +++ b/.github/workflows/pr-preflight.yaml @@ -0,0 +1,39 @@ +name: "Pull Request Preflight Check" + +on: + pull_request_target: + branches: + ["main"] + types: + ["opened", "synchronize", "reopened"] + +jobs: + test-pr: + name: "Test if pull request is valid" + if: ${{ github.event.action != 'closed' }} + runs-on: ubuntu-latest + outputs: + is_valid: ${{ steps.check-pr.outputs.VALID }} + permissions: + pull-requests: write + steps: + - name: "Get Invalid Hashes File" + id: hash + run: | + echo "json<<EOF + $(curl -sL https://files.carpentries.org/invalid-hashes.json) + EOF" >> $GITHUB_OUTPUT + - name: "Check PR" + id: check-pr + uses: carpentries/actions/check-valid-pr@main + with: + pr: ${{ github.event.number }} + invalid: ${{ fromJSON(steps.hash.outputs.json)[github.repository] }} + fail_on_error: true + - name: "Comment result of validation" + id: comment-diff + if: ${{ always() }} + uses: carpentries/actions/comment-diff@main + with: + pr: ${{ github.event.number }} + body: ${{ steps.check-pr.outputs.MSG }} diff --git a/.github/workflows/pr-receive.yaml b/.github/workflows/pr-receive.yaml new file mode 100755 index 00000000..371ef542 --- /dev/null +++ b/.github/workflows/pr-receive.yaml @@ -0,0 +1,131 @@ +name: "Receive Pull Request" + +on: + pull_request: + types: + [opened, synchronize, reopened] + +concurrency: + group: ${{ github.ref }} + cancel-in-progress: true + +jobs: + test-pr: + name: "Record PR number" + if: ${{ github.event.action != 'closed' }} + runs-on: ubuntu-latest + outputs: + is_valid: ${{ steps.check-pr.outputs.VALID }} + steps: + - name: "Record PR number" + id: record + if: ${{ always() }} + run: | + echo ${{ github.event.number }} > ${{ github.workspace }}/NR # 2022-03-02: artifact name fixed to be NR + - name: "Upload PR number" + id: upload + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: pr + path: ${{ github.workspace }}/NR + - name: "Get Invalid Hashes File" + id: hash + run: | + echo "json<<EOF + $(curl -sL https://files.carpentries.org/invalid-hashes.json) + EOF" >> $GITHUB_OUTPUT + - name: "echo output" + run: | + echo "${{ steps.hash.outputs.json }}" + - name: "Check PR" + id: check-pr + uses: carpentries/actions/check-valid-pr@main + with: + pr: ${{ github.event.number }} + invalid: ${{ fromJSON(steps.hash.outputs.json)[github.repository] }} + 
build-md-source: + name: "Build markdown source files if valid" + needs: test-pr + runs-on: ubuntu-latest + if: ${{ needs.test-pr.outputs.is_valid == 'true' }} + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + RENV_PATHS_ROOT: ~/.local/share/renv/ + CHIVE: ${{ github.workspace }}/site/chive + PR: ${{ github.workspace }}/site/pr + MD: ${{ github.workspace }}/site/built + steps: + - name: "Check Out Main Branch" + uses: actions/checkout@v3 + + - name: "Check Out Staging Branch" + uses: actions/checkout@v3 + with: + ref: md-outputs + path: ${{ env.MD }} + + - name: "Set up R" + uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + install-r: false + + - name: "Set up Pandoc" + uses: r-lib/actions/setup-pandoc@v2 + + - name: "Setup Lesson Engine" + uses: carpentries/actions/setup-sandpaper@main + with: + cache-version: ${{ secrets.CACHE_VERSION }} + + - name: "Setup Package Cache" + uses: carpentries/actions/setup-lesson-deps@main + with: + cache-version: ${{ secrets.CACHE_VERSION }} + + - name: "Validate and Build Markdown" + id: build-site + run: | + sandpaper::package_cache_trigger(TRUE) + sandpaper::validate_lesson(path = '${{ github.workspace }}') + sandpaper:::build_markdown(path = '${{ github.workspace }}', quiet = FALSE) + shell: Rscript {0} + + - name: "Generate Artifacts" + id: generate-artifacts + run: | + sandpaper:::ci_bundle_pr_artifacts( + repo = '${{ github.repository }}', + pr_number = '${{ github.event.number }}', + path_md = '${{ env.MD }}', + path_pr = '${{ env.PR }}', + path_archive = '${{ env.CHIVE }}', + branch = 'md-outputs' + ) + shell: Rscript {0} + + - name: "Upload PR" + uses: actions/upload-artifact@v3 + with: + name: pr + path: ${{ env.PR }} + + - name: "Upload Diff" + uses: actions/upload-artifact@v3 + with: + name: diff + path: ${{ env.CHIVE }} + retention-days: 1 + + - name: "Upload Build" + uses: actions/upload-artifact@v3 + with: + name: built + path: ${{ env.MD }} + retention-days: 1 + + - name: "Teardown" + run: sandpaper::reset_site() + shell: Rscript {0} diff --git a/.github/workflows/sandpaper-main.yaml b/.github/workflows/sandpaper-main.yaml new file mode 100755 index 00000000..e17707ac --- /dev/null +++ b/.github/workflows/sandpaper-main.yaml @@ -0,0 +1,61 @@ +name: "01 Build and Deploy Site" + +on: + push: + branches: + - main + - master + schedule: + - cron: '0 0 * * 2' + workflow_dispatch: + inputs: + name: + description: 'Who triggered this build?' 
+ required: true + default: 'Maintainer (via GitHub)' + reset: + description: 'Reset cached markdown files' + required: false + default: false + type: boolean +jobs: + full-build: + name: "Build Full Site" + runs-on: ubuntu-latest + permissions: + checks: write + contents: write + pages: write + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + RENV_PATHS_ROOT: ~/.local/share/renv/ + steps: + + - name: "Checkout Lesson" + uses: actions/checkout@v3 + + - name: "Set up R" + uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + install-r: false + + - name: "Set up Pandoc" + uses: r-lib/actions/setup-pandoc@v2 + + - name: "Setup Lesson Engine" + uses: carpentries/actions/setup-sandpaper@main + with: + cache-version: ${{ secrets.CACHE_VERSION }} + + - name: "Setup Package Cache" + uses: carpentries/actions/setup-lesson-deps@main + with: + cache-version: ${{ secrets.CACHE_VERSION }} + + - name: "Deploy Site" + run: | + reset <- "${{ github.event.inputs.reset }}" == "true" + sandpaper::package_cache_trigger(TRUE) + sandpaper:::ci_deploy(reset = reset) + shell: Rscript {0} diff --git a/.github/workflows/sandpaper-version.txt b/.github/workflows/sandpaper-version.txt new file mode 100644 index 00000000..201a22c8 --- /dev/null +++ b/.github/workflows/sandpaper-version.txt @@ -0,0 +1 @@ +0.16.2 diff --git a/.github/workflows/update-cache.yaml b/.github/workflows/update-cache.yaml new file mode 100755 index 00000000..676d7424 --- /dev/null +++ b/.github/workflows/update-cache.yaml @@ -0,0 +1,125 @@ +name: "03 Maintain: Update Package Cache" + +on: + workflow_dispatch: + inputs: + name: + description: 'Who triggered this build (enter github username to tag yourself)?' + required: true + default: 'monthly run' + schedule: + # Run every Tuesday + - cron: '0 0 * * 2' + +jobs: + preflight: + name: "Preflight Check" + runs-on: ubuntu-latest + outputs: + ok: ${{ steps.check.outputs.ok }} + steps: + - id: check + run: | + if [[ ${{ github.event_name }} == 'workflow_dispatch' ]]; then + echo "ok=true" >> $GITHUB_OUTPUT + echo "Running on request" + # using single brackets here to avoid 08 being interpreted as octal + # https://github.com/carpentries/sandpaper/issues/250 + elif [ `date +%d` -le 7 ]; then + # If the Tuesday lands in the first week of the month, run it + echo "ok=true" >> $GITHUB_OUTPUT + echo "Running on schedule" + else + echo "ok=false" >> $GITHUB_OUTPUT + echo "Not Running Today" + fi + + check_renv: + name: "Check if We Need {renv}" + runs-on: ubuntu-latest + needs: preflight + if: ${{ needs.preflight.outputs.ok == 'true' }} + outputs: + needed: ${{ steps.renv.outputs.exists }} + steps: + - name: "Checkout Lesson" + uses: actions/checkout@v3 + - id: renv + run: | + if [[ -d renv ]]; then + echo "exists=true" >> $GITHUB_OUTPUT + fi + + check_token: + name: "Check SANDPAPER_WORKFLOW token" + runs-on: ubuntu-latest + needs: check_renv + if: ${{ needs.check_renv.outputs.needed == 'true' }} + outputs: + workflow: ${{ steps.validate.outputs.wf }} + repo: ${{ steps.validate.outputs.repo }} + steps: + - name: "validate token" + id: validate + uses: carpentries/actions/check-valid-credentials@main + with: + token: ${{ secrets.SANDPAPER_WORKFLOW }} + + update_cache: + name: "Update Package Cache" + needs: check_token + if: ${{ needs.check_token.outputs.repo == 'true' }} + runs-on: ubuntu-latest + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + RENV_PATHS_ROOT: ~/.local/share/renv/ + steps: + + - name: "Checkout Lesson" + uses: actions/checkout@v3 + + - name: "Set up R" + uses:
r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + install-r: false + + - name: "Update {renv} deps and determine if a PR is needed" + id: update + uses: carpentries/actions/update-lockfile@main + with: + cache-version: ${{ secrets.CACHE_VERSION }} + + - name: Create Pull Request + id: cpr + if: ${{ steps.update.outputs.n > 0 }} + uses: carpentries/create-pull-request@main + with: + token: ${{ secrets.SANDPAPER_WORKFLOW }} + delete-branch: true + branch: "update/packages" + commit-message: "[actions] update ${{ steps.update.outputs.n }} packages" + title: "Update ${{ steps.update.outputs.n }} packages" + body: | + :robot: This is an automated build + + This will update ${{ steps.update.outputs.n }} packages in your lesson with the following versions: + + ``` + ${{ steps.update.outputs.report }} + ``` + + :stopwatch: In a few minutes, a comment will appear that will show you how the output has changed based on these updates. + + If you want to inspect these changes locally, you can use the following code to check out a new branch: + + ```bash + git fetch origin update/packages + git checkout update/packages + ``` + + - Auto-generated by [create-pull-request][1] on ${{ steps.update.outputs.date }} + + [1]: https://github.com/carpentries/create-pull-request/tree/main + labels: "type: package cache" + draft: false diff --git a/.github/workflows/update-workflows.yaml b/.github/workflows/update-workflows.yaml new file mode 100755 index 00000000..288bcd13 --- /dev/null +++ b/.github/workflows/update-workflows.yaml @@ -0,0 +1,66 @@ +name: "02 Maintain: Update Workflow Files" + +on: + workflow_dispatch: + inputs: + name: + description: 'Who triggered this build (enter github username to tag yourself)?' + required: true + default: 'weekly run' + clean: + description: 'Workflow files/file extensions to clean (no wildcards, enter "" for none)' + required: false + default: '.yaml' + schedule: + # Run every Tuesday + - cron: '0 0 * * 2' + +jobs: + check_token: + name: "Check SANDPAPER_WORKFLOW token" + runs-on: ubuntu-latest + outputs: + workflow: ${{ steps.validate.outputs.wf }} + repo: ${{ steps.validate.outputs.repo }} + steps: + - name: "validate token" + id: validate + uses: carpentries/actions/check-valid-credentials@main + with: + token: ${{ secrets.SANDPAPER_WORKFLOW }} + + update_workflow: + name: "Update Workflow" + runs-on: ubuntu-latest + needs: check_token + if: ${{ needs.check_token.outputs.workflow == 'true' }} + steps: + - name: "Checkout Repository" + uses: actions/checkout@v3 + + - name: Update Workflows + id: update + uses: carpentries/actions/update-workflows@main + with: + clean: ${{ github.event.inputs.clean }} + + - name: Create Pull Request + id: cpr + if: "${{ steps.update.outputs.new }}" + uses: carpentries/create-pull-request@main + with: + token: ${{ secrets.SANDPAPER_WORKFLOW }} + delete-branch: true + branch: "update/workflows" + commit-message: "[actions] update sandpaper workflow to version ${{ steps.update.outputs.new }}" + title: "Update Workflows to Version ${{ steps.update.outputs.new }}" + body: | + :robot: This is an automated build + + Update Workflows from sandpaper version ${{ steps.update.outputs.old }} -> ${{ steps.update.outputs.new }} + + - Auto-generated by [create-pull-request][1] on ${{ steps.update.outputs.date }} + + [1]: https://github.com/carpentries/create-pull-request/tree/main + labels: "type: template and tools" + draft: false diff --git a/.github/workflows/workbench-beta-phase.yml
b/.github/workflows/workbench-beta-phase.yml new file mode 100644 index 00000000..2faa25d9 --- /dev/null +++ b/.github/workflows/workbench-beta-phase.yml @@ -0,0 +1,60 @@ +name: "Deploy to AWS" + +on: + workflow_run: + workflows: ["01 Build and Deploy Site"] + types: + - completed + workflow_dispatch: + +jobs: + preflight: + name: "Preflight Check" + runs-on: ubuntu-latest + outputs: + ok: ${{ steps.check.outputs.ok }} + folder: ${{ steps.check.outputs.folder }} + steps: + - id: check + run: | + if [[ -z "${{ secrets.DISTRIBUTION }}" || -z "${{ secrets.AWS_ACCESS_KEY_ID }}" || -z "${{ secrets.AWS_SECRET_ACCESS_KEY }}" ]]; then + echo ":information_source: No site configured" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo 'To deploy the preview on AWS, you need the `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY` and `DISTRIBUTION` secrets set up' >> $GITHUB_STEP_SUMMARY + else + echo "::set-output name=folder::"$(sed -E 's^.+/(.+)^\1^' <<< ${{ github.repository }}) + echo "::set-output name=ok::true" + fi + + full-build: + name: "Deploy to AWS" + needs: [preflight] + if: ${{ needs.preflight.outputs.ok }} + runs-on: ubuntu-latest + steps: + + - name: "Checkout site folder" + uses: actions/checkout@v3 + with: + ref: 'gh-pages' + path: 'source' + + - name: "Deploy to Bucket" + uses: jakejarvis/s3-sync-action@v0.5.1 + with: + args: --acl public-read --follow-symlinks --delete --exclude '.git/*' + env: + AWS_S3_BUCKET: preview.carpentries.org + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + SOURCE_DIR: 'source' + DEST_DIR: ${{ needs.preflight.outputs.folder }} + + - name: "Invalidate CloudFront" + uses: chetan/invalidate-cloudfront-action@master + env: + PATHS: /* + AWS_REGION: 'us-east-1' + DISTRIBUTION: ${{ secrets.DISTRIBUTION }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} diff --git a/.gitignore b/.gitignore index be69559c..b08b983f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,42 @@ +# sandpaper files +episodes/*html +site/* +!site/README.md + +# History files +.Rhistory +.Rapp.history +# Session Data files +.RData +# User-specific files +.Ruserdata +# Example code in package build process +*-Ex.R +# Output files from R CMD build +/*.tar.gz +# Output files from R CMD check +/*.Rcheck/ +# RStudio files +.Rproj.user/ +# produced vignettes +vignettes/*.html +vignettes/*.pdf +# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 +.httr-oauth +# knitr and R markdown default cache directories +*_cache/ +/cache/ +# Temporary files created by R markdown +*.utf8.md +*.knit.md +# R Environment Variables +.Renviron +# pkgdown site +docs/ +# translation temp files +po/*~ +# renv detritus +renv/sandbox/ *.pyc *~ .DS_Store @@ -9,8 +48,6 @@ _site .Rproj.user *.Rproj .Rbuildignore -.Rhistory -.RData .bundle/ .vendor/ vendor/ diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index c3b96690..f19b8049 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -1,11 +1,13 @@ --- -layout: page title: "Contributor Code of Conduct" --- + As contributors and maintainers of this project, -we pledge to follow the [Carpentry Code of Conduct][coc]. +we pledge to follow [The Carpentries Code of Conduct][coc]. Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by following our [reporting guidelines][coc-reporting].
-{% include links.md %} + +[coc-reporting]: https://docs.carpentries.org/topic_folders/policies/incident-reporting.html +[coc]: https://docs.carpentries.org/topic_folders/policies/code-of-conduct.html diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 1bc75fad..6c2b81c8 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,168 +1,123 @@ -# Contributing - -[The Carpentries][c-site] ([Software Carpentry][swc-site], -[Data Carpentry][dc-site], and [Library Carpentry][lc-site]) are -open source projects, -and we welcome contributions of all kinds: -new lessons, -fixes to existing material, -bug reports, -and reviews of proposed changes are all welcome. - -## Contributor Agreement - -By contributing, -you agree that we may redistribute your work under [our license](LICENSE.md). -In exchange, -we will address your issues and/or assess your change proposal as -promptly as we can, -and help you become a member of our community. -Everyone involved in [The Carpentries][c-site] -agrees to abide by our [code of conduct](CODE_OF_CONDUCT.md). - -## How to Contribute - -The easiest way to get started is to file an issue -to tell us about a spelling mistake, -some awkward wording, -or a factual error. -This is a good way to introduce yourself -and to meet some of our community members. - -1. If you do not have a [GitHub][github] account, - you can [send us comments by email][email]. - However, - we will be able to respond more quickly if you use one of the other - methods described below. - -2. If you have a [GitHub][github] account, - or are willing to [create one][github-join], - but do not know how to use Git, - you can report problems or suggest improvements by - [creating an issue][issues]. - This allows us to assign the item to someone - and to respond to it in a threaded discussion. - -3. If you are comfortable with Git, - and would like to add or change material, - you can submit a pull request (PR). - Instructions for doing this are [included below](#using-github). - -## Where to Contribute - -1. If you wish to change this lesson, - please work in - , - which can be viewed at - . - -2. If you wish to change the example lesson, - please work in - , - which documents the format of our lessons - and can be viewed at - . - -3. If you wish to change the template used for workshop websites, - please work in - . - The home page of that repository explains how to set up workshop websites, - while the extra pages in - - provide more background on our design choices. - -4. If you wish to change CSS style files, tools, - or HTML boilerplate for lessons or workshops stored in - `_includes` or `_layouts`, - please work in . - -## What to Contribute - -There are many ways to contribute, -from writing new exercises and improving existing ones -to updating or filling in the documentation -and submitting [bug reports][issues] -about things that don't work, aren't clear, or are missing. -If you are looking for ideas, please see the 'Issues' tab for -a list of issues associated with this repository, -or you may also look at the issues for [Data Carpentry][dc-issues], -[Software Carpentry][swc-issues], and [Library Carpentry][lc-issues] projects. - -Comments on issues and reviews of pull requests are just as welcome: -we are smarter together than we are on our own. -Reviews from novices and newcomers are particularly valuable: -it's easy for people who have been using these lessons for a while -to forget how impenetrable some of this material can be, -so fresh eyes are always welcome. 
- -## What *Not* to Contribute +## Contributing + +[The Carpentries][cp-site] ([Software Carpentry][swc-site], [Data +Carpentry][dc-site], and [Library Carpentry][lc-site]) are open source +projects, and we welcome contributions of all kinds: new lessons, fixes to +existing material, bug reports, and reviews of proposed changes are all +welcome. + +### Contributor Agreement + +By contributing, you agree that we may redistribute your work under [our +license](LICENSE.md). In exchange, we will address your issues and/or assess +your change proposal as promptly as we can, and help you become a member of our +community. Everyone involved in [The Carpentries][cp-site] agrees to abide by +our [code of conduct](CODE_OF_CONDUCT.md). + +### How to Contribute + +The easiest way to get started is to file an issue to tell us about a spelling +mistake, some awkward wording, or a factual error. This is a good way to +introduce yourself and to meet some of our community members. + +1. If you do not have a [GitHub][github] account, you can [send us comments by + email][contact]. However, we will be able to respond more quickly if you use + one of the other methods described below. + +2. If you have a [GitHub][github] account, or are willing to [create + one][github-join], but do not know how to use Git, you can report problems + or suggest improvements by [creating an issue][repo-issues]. This allows us + to assign the item to someone and to respond to it in a threaded discussion. + +3. If you are comfortable with Git, and would like to add or change material, + you can submit a pull request (PR). Instructions for doing this are + [included below](#using-github). For inspiration about changes that need to + be made, check out the [list of open issues][issues] across the Carpentries. + +Note: if you want to build the website locally, please refer to [The Workbench +documentation][template-doc]. + +### Where to Contribute + +1. If you wish to change this lesson, add issues and pull requests here. +2. If you wish to change the template used for workshop websites, please refer + to [The Workbench documentation][template-doc]. + + +### What to Contribute + +There are many ways to contribute, from writing new exercises and improving +existing ones to updating or filling in the documentation and submitting [bug +reports][issues] about things that do not work, are not clear, or are missing. +If you are looking for ideas, please see [the list of issues for this +repository][repo-issues], or the issues for [Data Carpentry][dc-issues], +[Library Carpentry][lc-issues], and [Software Carpentry][swc-issues] projects. + +Comments on issues and reviews of pull requests are just as welcome: we are +smarter together than we are on our own. **Reviews from novices and newcomers +are particularly valuable**: it's easy for people who have been using these +lessons for a while to forget how impenetrable some of this material can be, so +fresh eyes are always welcome. + +### What *Not* to Contribute Our lessons already contain more material than we can cover in a typical -workshop, -so we are usually *not* looking for more concepts or tools to add to them. -As a rule, -if you want to introduce a new idea, -you must (a) estimate how long it will take to teach -and (b) explain what you would take out to make room for it. -The first encourages contributors to be honest about requirements; -the second, to think hard about priorities. +workshop, so we are usually *not* looking for more concepts or tools to add to +them. 
As a rule, if you want to introduce a new idea, you must (a) estimate how +long it will take to teach and (b) explain what you would take out to make room +for it. The first encourages contributors to be honest about requirements; the +second, to think hard about priorities. We are also not looking for exercises or other material that only run on one -platform. -Our workshops typically contain a mixture of Windows, macOS, and Linux users; -in order to be usable, -our lessons must run equally well on all three. - -## Using GitHub - -If you choose to contribute via GitHub, you may want to look at -[How to Contribute to an Open Source Project on GitHub][how-contribute]. -To manage changes, we follow [GitHub flow][github-flow]. -Each lesson has two maintainers who review issues and pull requests or encourage others to do so. -The maintainers are community volunteers and have final say over what gets merged into the lesson. -To use the web interface for contributing to a lesson: - -1. Fork the originating repository to your GitHub profile. -2. Within your version of the forked repository, move to the -3. `gh-pages` branch and -create a new branch for each significant change being made. -4. Navigate to the file(s) you wish to change within the new -5. branches and make revisions as required. -6. Commit all changed files within the appropriate branches. -7. Create individual pull requests from each of your changed branches -to the `gh-pages` branch within the originating repository. -8. If you receive feedback, make changes using your issue-specific -9. branches of the forked -repository and the pull requests will update automatically. -10. Repeat as needed until all feedback has been addressed. - -When starting work, please make sure your clone of the originating -`gh-pages` branch is up-to-date -before creating your own revision-specific branch(es) from there. -Additionally, please only work from your newly-created branch(es) and *not* -your clone of the originating `gh-pages` branch. -Lastly, published copies of all the lessons are available in the -`gh-pages` branch of the originating -repository for reference while revising. - -## Other Resources - -General discussion of [Software Carpentry][swc-site] and [Data Carpentry][dc-site] -happens on the [discussion mailing list][discuss-list], -which everyone is welcome to join. -You can also [reach us by email][email]. - -[email]: mailto:admin@software-carpentry.org +platform. Our workshops typically contain a mixture of Windows, macOS, and +Linux users; in order to be usable, our lessons must run equally well on all +three. + +### Using GitHub + +If you choose to contribute via GitHub, you may want to look at [How to +Contribute to an Open Source Project on GitHub][how-contribute]. In brief, we +use [GitHub flow][github-flow] to manage changes: + +1. Create a new branch in your desktop copy of this repository for each + significant change. +2. Commit the change in that branch. +3. Push that branch to your fork of this repository on GitHub. +4. Submit a pull request from that branch to the [upstream repository][repo]. +5. If you receive feedback, make changes on your desktop and push to your + branch on GitHub: the pull request will update automatically. + +NB: The published copy of the lesson is usually in the `main` branch. + +Each lesson has a team of maintainers who review issues and pull requests or +encourage others to do so. The maintainers are community volunteers, and have +final say over what gets merged into the lesson. 
+ +### Other Resources + +The Carpentries is a global organisation with volunteers and learners all over +the world. We share values of inclusivity and a passion for sharing knowledge, +teaching and learning. There are several ways to connect with The Carpentries +community listed at including via social +media, slack, newsletters, and email lists. You can also [reach us by +email][contact]. + +[repo]: https://example.com/FIXME +[repo-issues]: https://example.com/FIXME/issues +[contact]: mailto:team@carpentries.org +[cp-site]: https://carpentries.org/ [dc-issues]: https://github.com/issues?q=user%3Adatacarpentry -[dc-site]: http://datacarpentry.org/ -[discuss-list]: http://lists.software-carpentry.org/listinfo/discuss +[dc-lessons]: https://datacarpentry.org/lessons/ +[dc-site]: https://datacarpentry.org/ +[discuss-list]: https://carpentries.topicbox.com/groups/discuss [github]: https://github.com [github-flow]: https://guides.github.com/introduction/flow/ [github-join]: https://github.com/join -[how-contribute]: https://egghead.io/series/how-to-contribute-to-an-open-source-project-on-github -[issues]: https://guides.github.com/features/issues/ +[how-contribute]: https://egghead.io/courses/how-to-contribute-to-an-open-source-project-on-github +[issues]: https://carpentries.org/help-wanted-issues/ +[lc-issues]: https://github.com/issues?q=user%3ALibraryCarpentry [swc-issues]: https://github.com/issues?q=user%3Aswcarpentry +[swc-lessons]: https://software-carpentry.org/lessons/ [swc-site]: https://software-carpentry.org/ -[c-site]: https://carpentries.org/ [lc-site]: https://librarycarpentry.org/ -[lc-issues]: https://github.com/issues?q=user%3Alibrarycarpentry +[template-doc]: https://carpentries.github.io/workbench/ diff --git a/LICENSE.md b/LICENSE.md index e6a3398d..7632871f 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -1,83 +1,79 @@ --- -layout: page title: "Licenses" -root: . --- + ## Instructional Material -All Software Carpentry, Data Carpentry, and Library Carpentry instructional material is -made available under the [Creative Commons Attribution -license][cc-by-human]. The following is a human-readable summary of +All Carpentries (Software Carpentry, Data Carpentry, and Library Carpentry) +instructional material is made available under the [Creative Commons +Attribution license][cc-by-human]. The following is a human-readable summary of (and not a substitute for) the [full legal text of the CC BY 4.0 license][cc-by-legal]. You are free: -* to **Share**---copy and redistribute the material in any medium or format -* to **Adapt**---remix, transform, and build upon the material +- to **Share**---copy and redistribute the material in any medium or format +- to **Adapt**---remix, transform, and build upon the material for any purpose, even commercially. -The licensor cannot revoke these freedoms as long as you follow the -license terms. +The licensor cannot revoke these freedoms as long as you follow the license +terms. Under the following terms: -* **Attribution**---You must give appropriate credit (mentioning that - your work is derived from work that is Copyright © Software - Carpentry and, where practical, linking to - http://software-carpentry.org/), provide a [link to the - license][cc-by-human], and indicate if changes were made. You may do - so in any reasonable manner, but not in any way that suggests the - licensor endorses you or your use. 
+- **Attribution**---You must give appropriate credit (mentioning that your work + is derived from work that is Copyright (c) The Carpentries and, where + practical, linking to ), provide a [link to the + license][cc-by-human], and indicate if changes were made. You may do so in + any reasonable manner, but not in any way that suggests the licensor endorses + you or your use. -**No additional restrictions**---You may not apply legal terms or -technological measures that legally restrict others from doing -anything the license permits. With the understanding that: +- **No additional restrictions**---You may not apply legal terms or + technological measures that legally restrict others from doing anything the + license permits. With the understanding that: Notices: -* You do not have to comply with the license for elements of the - material in the public domain or where your use is permitted by an - applicable exception or limitation. -* No warranties are given. The license may not give you all of the - permissions necessary for your intended use. For example, other - rights such as publicity, privacy, or moral rights may limit how you - use the material. +* You do not have to comply with the license for elements of the material in + the public domain or where your use is permitted by an applicable exception + or limitation. +* No warranties are given. The license may not give you all of the permissions + necessary for your intended use. For example, other rights such as publicity, + privacy, or moral rights may limit how you use the material. ## Software -Except where otherwise noted, the example programs and other software -provided by Software Carpentry and Data Carpentry are made available under the -[OSI][osi]-approved -[MIT license][mit-license]. - -Permission is hereby granted, free of charge, to any person obtaining -a copy of this software and associated documentation files (the -"Software"), to deal in the Software without restriction, including -without limitation the rights to use, copy, modify, merge, publish, -distribute, sublicense, and/or sell copies of the Software, and to -permit persons to whom the Software is furnished to do so, subject to -the following conditions: - -The above copyright notice and this permission notice shall be -included in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION -OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION -WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +Except where otherwise noted, the example programs and other software provided +by The Carpentries are made available under the [OSI][osi]-approved [MIT +license][mit-license]. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. ## Trademark -"Software Carpentry" and "Data Carpentry" and their respective logos -are registered trademarks of [Community Initiatives][CI]. +"The Carpentries", "Software Carpentry", "Data Carpentry", and "Library +Carpentry" and their respective logos are registered trademarks of [Community +Initiatives][ci]. [cc-by-human]: https://creativecommons.org/licenses/by/4.0/ [cc-by-legal]: https://creativecommons.org/licenses/by/4.0/legalcode [mit-license]: https://opensource.org/licenses/mit-license.html -[ci]: http://communityin.org/ +[ci]: https://communityin.org/ [osi]: https://opensource.org diff --git a/README.md b/README.md index 3dcf224b..fe452095 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,8 @@ +> **ATTENTION** This is an experimental test of [The Carpentries Workbench](https://carpentries.github.io/workbench) lesson infrastructure. +> It was automatically converted from the source lesson via [the lesson transition script](https://github.com/carpentries/lesson-transition/). +> +> If anything seems off, please contact Zhian Kamvar [zkamvar@carpentries.org](mailto:zkamvar@carpentries.org) + # High dimensional stats with R [![Create a Slack Account with us](https://img.shields.io/badge/Create_Slack_Account-The_Carpentries-071159.svg)](https://swc-slack-invite.herokuapp.com/) @@ -16,7 +21,7 @@ and make sure you have everything you need to begin developing your new lesson. Before you begin developing your new lesson, here are a few things we recommend you do: -* [ ] [Add relevant topic tags to your lesson repository][cdh-topic-tags]. +- [ ] [Add relevant topic tags to your lesson repository][cdh-topic-tags]. ## Contributing @@ -35,10 +40,10 @@ Please see the current list of for ideas for contributing to this repository. For making your contribution, we use the GitHub flow, which is nicely explained in the chapter -[Contributing to a Project](http://git-scm.com/book/en/v2/GitHub-Contributing-to-a-Project) +[Contributing to a Project](https://git-scm.com/book/en/v2/GitHub-Contributing-to-a-Project) in Pro Git by Scott Chacon. Look for the tag -![good_first_issue](https://img.shields.io/badge/-good%20first%20issue-gold.svg). +![good\_first\_issue](https://img.shields.io/badge/-good%20first%20issue-gold.svg). This indicates that the maintainers will welcome a pull request fixing this issue. @@ -46,10 +51,10 @@ issue. 
Current maintainers of this lesson are -* Alan O'Callaghan -* Ailith Ewing -* Catalina Vallejos -* Hannes Becher +- Alan O'Callaghan +- Ailith Ewing +- Catalina Vallejos +- Hannes Becher ## Authors @@ -59,6 +64,9 @@ A list of contributors to the lesson can be found in [AUTHORS](AUTHORS) To cite this lesson, please consult with [CITATION](CITATION) -[cdh-topic-tags]: https://cdh.carpentries.org/the-carpentries-incubator.html#topic-tags [community-lessons]: https://carpentries.org/community-lessons +[cdh-topic-tags]: https://cdh.carpentries.org/the-carpentries-incubator.html#topic-tags [lesson-example]: https://carpentries.github.io/lesson-example + + + diff --git a/_extras/figures.md b/_extras/figures.md deleted file mode 100644 index 0012c88e..00000000 --- a/_extras/figures.md +++ /dev/null @@ -1,79 +0,0 @@ ---- -title: Figures ---- - -{% include base_path.html %} -{% include manual_episode_order.html %} - - - -{% comment %} Create anchor for each one of the episodes. {% endcomment %} - -{% for lesson_episode in lesson_episodes %} - {% if site.episode_order %} - {% assign episode = site.episodes | where: "slug", lesson_episode | first %} - {% else %} - {% assign episode = lesson_episode %} - {% endif %} -
-{% endfor %} - -{% include links.md %} diff --git a/_extras/guide.md b/_extras/guide.md deleted file mode 100644 index 393791ee..00000000 --- a/_extras/guide.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -title: "Instructor Notes" ---- -Coming soon. - -{% include links.md %} diff --git a/_extras/slides.md b/_extras/slides.md deleted file mode 100644 index cc2e534f..00000000 --- a/_extras/slides.md +++ /dev/null @@ -1,13 +0,0 @@ ---- -title: Lecture slides ---- - -{% include base_path.html %} - - -{% for p in site.slides %} -- [{{p.title}}]({{ relative_root_path }}/{{p.url | replace: "Rmd", "html"}}) -{% endfor %} - - -{% include links.md %} diff --git a/config.yaml b/config.yaml new file mode 100644 index 00000000..9275857a --- /dev/null +++ b/config.yaml @@ -0,0 +1,88 @@ +#------------------------------------------------------------ +# Values for this lesson. +#------------------------------------------------------------ + +# Which carpentry is this (swc, dc, lc, or cp)? +# swc: Software Carpentry +# dc: Data Carpentry +# lc: Library Carpentry +# cp: Carpentries (to use for instructor training for instance) +# incubator: The Carpentries Incubator +carpentry: 'incubator' + +# Overall title for pages. +title: 'High dimensional statistics with R' + +# Date the lesson was created (YYYY-MM-DD, this is empty by default) +created: + +# Comma-separated list of keywords for the lesson +keywords: 'software, data, lesson, The Carpentries' + +# Life cycle stage of the lesson +# possible values: pre-alpha, alpha, beta, stable +life_cycle: 'alpha' + +# License of the lesson materials (recommended CC-BY 4.0) +license: 'CC-BY 4.0' + +# Link to the source repository for this lesson +source: 'https://github.com/fishtree-attempt/high-dimensional-stats-r/' + +# Default branch of your lesson +branch: 'main' + +# Who to contact if there are any issues +contact: 'alan.ocallaghan@outlook.com' + +# Navigation ------------------------------------------------ +# +# Use the following menu items to specify the order of +# individual pages in each dropdown section. Leave blank to +# include all pages in the folder. +# +# Example ------------- +# +# episodes: +# - introduction.md +# - first-steps.md +# +# learners: +# - setup.md +# +# instructors: +# - instructor-notes.md +# +# profiles: +# - one-learner.md +# - another-learner.md + +# Order of episodes in your lesson +episodes: +- 01-introduction-to-high-dimensional-data.Rmd +- 02-high-dimensional-regression.Rmd +- 03-regression-regularisation.Rmd +- 04-principal-component-analysis.Rmd +- 05-factor-analysis.Rmd +- 06-k-means.Rmd +- 07-hierarchical.Rmd + +# Information for Learners +learners: + +# Information for Instructors +instructors: + +# Learner Profiles +profiles: + +# Customisation --------------------------------------------- +# +# This space below is where custom yaml items (e.g. pinning +# sandpaper and varnish versions) should live + + +url: https://preview.carpentries.org/high-dimensional-stats-r +analytics: carpentries +lang: en +workbench-beta: yes diff --git a/episodes/01-introduction-to-high-dimensional-data.Rmd b/episodes/01-introduction-to-high-dimensional-data.Rmd new file mode 100644 index 00000000..6decb16e --- /dev/null +++ b/episodes/01-introduction-to-high-dimensional-data.Rmd @@ -0,0 +1,435 @@ +--- +title: Introduction to high-dimensional data +author: GS Robertson +source: Rmd +teaching: 20 +exercises: 20 +math: yes +--- + +::::::::::::::::::::::::::::::::::::::: objectives + +- Explore examples of high-dimensional data in the biosciences. 
+- Appreciate challenges involved in analysing high-dimensional data. +- Explore different statistical methods used for analysing high-dimensional data. +- Work with example data created from biological studies. + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::: questions + +- What are high-dimensional data and what do these data look like in the biosciences? +- What are the challenges when analysing high-dimensional data? +- What statistical methods are suitable for analysing these data? +- How can Bioconductor be used to access high-dimensional data in the biosciences? + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +```{r setup, include=FALSE} +library("here") +source(here("bin/chunk-options.R")) +``` + +## What are high-dimensional data? + +*High-dimensional data* are defined as data in which the number of features (variables observed), +$p$, are close to or larger than the number of observations (or data points), $n$. +The opposite is *low-dimensional data* in which the number of observations, +$n$, far outnumbers the number of features, $p$. A related concept is *wide data*, which +refers to data with numerous features irrespective of the number of observations (similarly, +*tall data* is often used to denote data with a large number of observations). +Analyses of high-dimensional data require consideration of potential problems that +come from having more features than observations. + +High-dimensional data have become more common in many scientific fields as new +automated data collection techniques have been developed. More and more datasets +have a large number of features and some have as many features as there are rows +in the dataset. Datasets in which $p \\geq n$ are becoming more common. Such datasets +pose a challenge for data analysis as standard methods of analysis, such as linear +regression, are no longer appropriate. + +High-dimensional datasets are common in the biological sciences. Data sets in subjects like +genomics and medical sciences are often tall (with large $n$) and wide +(with large $p$), and can be difficult to analyse or visualise using +standard statistical tools. An example of high-dimensional data in biological +sciences may include data collected from hospital patients recording symptoms, +blood test results, behaviours, and general health, resulting in datasets with +large numbers of features. Researchers often want to relate these features to +specific patient outcomes (e.g. survival, length of time spent in hospital). +An example of what high-dimensional data might look like in a biomedical study +is shown in the figure below. + +```{r table-intro, echo=FALSE} +knitr::include_graphics("fig/intro-table.png") +``` + +::::::::::::::::::::::::::::::::::::::: challenge + +### Challenge 1 + +Descriptions of four research questions and their datasets are given below. +Which of these scenarios use high-dimensional data? + +1. Predicting patient blood pressure using: cholesterol level in blood, age, + and BMI measurements, collected from 100 patients. +2. Predicting patient blood pressure using: cholesterol level in blood, age, + and BMI, as well as information on 200,000 single nucleotide polymorphisms + from 100 patients. +3. Predicting the length of time patients spend in hospital with pneumonia infection + using: measurements on age, BMI, length of time with symptoms, + number of symptoms, and percentage of neutrophils in blood, using data + from 200 patients. +4. 
Predicting probability of a patient's cancer progressing using gene
+   expression data from 20,000 genes, as well as data associated with general patient health
+   (age, weight, BMI, blood pressure) and cancer growth (tumour size,
+   localised spread, blood test results).
+
+::::::::::::::: solution
+
+### Solution
+
+1. No. The number of observations (100 patients) is far greater than the number of features (3).
+2. Yes, this is an example of high-dimensional data. There are only 100 observations but 200,003 features (200,000 SNPs plus cholesterol, age and BMI).
+3. No. There are many more observations (200 patients) than features (5).
+4. Yes. Over 20,000 features are measured for each patient, so the features will almost certainly far outnumber the observations.
+
+
+
+:::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+Now that we have an idea of what high-dimensional data look like we can think
+about the challenges we face in analysing them.
+
+## Challenges in dealing with high-dimensional data
+
+Most classical statistical methods are set up for use on low-dimensional data
+(i.e. data where the number of observations $n$ is much larger than the number
+of features $p$). This is because low-dimensional data were much more common in
+the past when data collection was more difficult and time consuming. In recent
+years advances in information technology have allowed large amounts of data to
+be collected and stored with relative ease. This has allowed large numbers of
+features to be collected, meaning that datasets in which $p$ matches or exceeds
+$n$ are common (collecting observations is often more difficult or expensive
+than collecting many features from a single observation).
+
+Datasets with large numbers of features are difficult to visualise. When
+exploring low-dimensional datasets, it is possible to plot the response variable
+against each of the limited number of explanatory variables to get an idea which
+of these are important predictors of the response. With high-dimensional data
+the large number of explanatory variables makes doing this difficult. In some
+high-dimensional datasets it can also be difficult to identify a single response
+variable, making standard data exploration and analysis techniques less useful.
+
+Let's have a look at a simple dataset with lots of features to understand some
+of the challenges we are facing when working with high-dimensional data.
+
+::::::::::::::::::::::::::::::::::::::: challenge
+
+### Challenge 2
+
+Load the `Prostate` dataset as follows:
+
+```{r prostate}
+library("here")
+Prostate <- readRDS(here("data/prostate.rds"))
+```
+
+Although technically not a high-dimensional dataset, the `Prostate` data
+will allow us to explore the problems encountered when working with many features.
+
+Examine the dataset (in which each row represents a single patient) to:
+
+1. Determine how many observations ($n$) and features ($p$) are available (hint: see the `dim()` function).
+2. Examine what variables were measured (hint: see the `names()` and `head()` functions).
+3. Plot the relationship between the variables (hint: see the `pairs()` function).
+
+::::::::::::::: solution
+
+### Solution
+
+```{r dim-prostate, eval=FALSE}
+dim(Prostate) # print the number of rows and columns
+```
+
+```{r head-prostate, eval=FALSE}
+names(Prostate) # examine the variable names
+head(Prostate) # print the first 6 rows
+```
+
+```{r pairs-prostate}
+pairs(Prostate) # plot each pair of variables against each other
+```
+
+The `pairs()` function plots relationships between each of the variables in
+the `Prostate` dataset. This is possible for datasets with smaller numbers
+of variables, but for datasets in which $p$ is larger it becomes difficult
+(and time consuming) to visualise relationships between all variables in the
+dataset. Even where visualisation is possible, fitting models to datasets
+with many variables is difficult due to the potential for
+overfitting and difficulties in identifying a response variable.
+
+:::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::::::: callout
+
+### Locating data with R - the **`here`** package
+
+It is often desirable to access external datasets from inside R and to write
+code that does this reliably on different computers. While R has an inbuilt
+function `setwd()` that can be used to denote where external datasets are
+stored, this usually requires the user to adjust the code to their specific
+system and folder structure. The **`here`** package is meant to be used in R
+projects. It allows users to specify the data location relative to the R
+project directory. This makes R code more portable and can help to improve
+the reproducibility of an analysis.
+
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+Imagine we are carrying out least squares regression on a dataset with 25
+observations. Fitting a best fit line through these data produces a plot shown
+in the left-hand panel of the figure below.
+
+However, imagine a situation in which the number of observations and features in a
+dataset are almost equal. In that situation the effective number of observations
+per feature is low. The result of fitting a best fit line through
+few observations can be seen in the right-hand panel below.
+
+```{r intro-figure, echo=FALSE}
+knitr::include_graphics("fig/intro-scatterplot.png")
+```
+
+In the first situation, the least squares regression line does not fit the data
+perfectly and there is some error around the regression line. But, when there are
+only two observations the regression line will fit through the points exactly,
+resulting in overfitting of the data. This suggests that carrying out least
+squares regression on a dataset with few data points per feature would result
+in difficulties in applying the resulting model to further datasets. This is a
+common problem when using regression on high-dimensional datasets.
+
+Another problem in carrying out regression on high-dimensional data is dealing
+with correlations between explanatory variables. The large numbers of features
+in these datasets makes high correlations between variables more likely.
+
+::::::::::::::::::::::::::::::::::::::: challenge
+
+### Challenge 3
+
+Use the `cor()` function to examine correlations between all variables in the
+`Prostate` dataset.
Are some pairs of variables highly correlated (i.e.
+correlation coefficients > 0.6)?
+
+Use the `lm()` function to fit univariate regression models to predict patient
+age using two variables that are highly correlated as predictors. Which of
+these variables are statistically significant predictors of age? Hint: the
+`summary()` function can help here.
+
+Fit a multiple linear regression model predicting patient age using both
+variables. What happened?
+
+::::::::::::::: solution
+
+### Solution
+
+Create a correlation matrix of all variables in the `Prostate` dataset:
+
+```{r cor-prostate}
+cor(Prostate)
+round(cor(Prostate), 2) # rounding helps to visualise the correlations
+```
+
+As seen above, some variables are highly correlated. In particular, the
+correlation between `gleason` and `pgg45` is equal to 0.75.
+
+Fit univariate regression models to predict age using `gleason` and `pgg45`
+as predictors:
+
+```{r univariate-prostate}
+model1 <- lm(age ~ gleason, data = Prostate)
+model2 <- lm(age ~ pgg45, data = Prostate)
+```
+
+Check which covariates have a significant effect:
+
+```{r summary-prostate}
+summary(model1)
+summary(model2)
+```
+
+Based on these results we conclude that both `gleason` and `pgg45` have a
+statistically significant univariate effect (also referred to as a marginal
+effect) as predictors of age (5% significance level).
+
+Fit a multivariate regression model using both `gleason` and `pgg45`
+as predictors:
+
+```{r multivariate-prostate}
+model3 <- lm(age ~ gleason + pgg45, data = Prostate)
+summary(model3)
+```
+
+Although `gleason` and `pgg45` have statistically significant univariate effects,
+this is no longer the case when both variables are simultaneously included
+as covariates in a multivariate regression model.
+
+
+
+:::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+Including highly correlated variables such as `gleason` and `pgg45`
+simultaneously in the same regression model can lead to problems
+in fitting a regression model and interpreting its output. To allow variables to
+be included in the same model despite high levels of correlation, we can use
+dimensionality reduction methods to collapse multiple variables into a single
+new variable (we will explore this dataset further in the dimensionality
+reduction lesson). We can also use modifications to linear regression like
+regularisation, which we will discuss in the lesson on high-dimensional
+regression.
+
+## What statistical methods are used to analyse high-dimensional data?
+
+As we found out in the above challenges, carrying out linear regression on
+datasets with large numbers of features can be difficult due to: high levels of correlation
+between variables; difficulty in identifying a clear response variable; and risk
+of overfitting. These problems are common to the analysis of many high-dimensional datasets,
+for example, those using genomics data with multiple genes, or species
+composition data in an environment where the relative abundance of different species
+within a community is of interest. For such datasets, other statistical methods
+may be used to examine whether groups of observations show similar characteristics
+and whether these groups may relate to other features in the data (e.g.
+phenotype in genetics data).
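+
+To see why correlated predictors cause trouble, here is a minimal simulated
+sketch (the variables `x1`, `x2` and `y` below are invented for illustration
+and are not part of the `Prostate` data):
+
+```{r collinearity-sketch}
+set.seed(1)
+n <- 50
+x1 <- rnorm(n)
+x2 <- x1 + rnorm(n, sd = 0.1) # x2 is almost a copy of x1
+y <- x1 + rnorm(n)
+
+## alone, the effect of x1 is estimated precisely...
+summary(lm(y ~ x1))$coefficients
+## ...but alongside x2, both standard errors are greatly inflated
+summary(lm(y ~ x1 + x2))$coefficients
+```
+
+As in the challenge above, each predictor is strongly associated with the
+outcome on its own, but the model cannot tell their effects apart when both
+are included.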
+
+In this course, we will cover four methods that help in dealing with high-dimensional data:
+(1) regression with numerous outcome variables, (2) regularised regression,
+(3) dimensionality reduction, and (4) clustering. Here are some examples of when each of
+these approaches may be used:
+
+(1) Regression with numerous outcomes refers to situations in which there are
+many variables of a similar kind (expression values for many genes, methylation
+levels for many sites in the genome) and when one is interested in assessing
+whether these variables are associated with a specific covariate of interest,
+such as experimental condition or age. In this case, multiple univariate
+regression models (one for each outcome, using the covariate of interest as
+predictor) could be fitted independently. In the context of high-dimensional
+molecular data, a typical example is a *differential gene expression* analysis.
+We will explore this type of analysis in the *Regression with many outcomes* episode.
+
+(2) Regularisation (also known as *regularised regression* or *penalised regression*)
+is typically used to fit regression models when there is a single outcome
+variable of interest but the number of potential predictors is large, e.g.
+there are more predictors than observations. Regularisation can help to prevent
+overfitting and may be used to identify a small subset of predictors that are
+associated with the outcome of interest. For example, regularised regression has
+often been used when building *epigenetic clocks*, where methylation values
+across several thousand genomic sites are used to predict chronological age.
+We will explore this in more detail in the *Regularised regression* episode.
+
+(3) Dimensionality reduction is commonly used on high-dimensional datasets for
+data exploration or as a preprocessing step prior to other downstream analyses.
+For instance, a low-dimensional visualisation of a gene expression dataset may
+be used to inform *quality control* steps (e.g. are there any anomalous samples?).
+This course contains two episodes that explore dimensionality reduction
+techniques: *Principal component analysis* and *Factor analysis*.
+
+(4) Clustering methods can be used to identify potential grouping patterns
+within a dataset. A popular example is the *identification of distinct cell types*
+through clustering cells with similar gene expression patterns. The *K-means*
+episode will explore a specific method to perform clustering analysis.
+
+::::::::::::::::::::::::::::::::::::::::: callout
+
+### Using Bioconductor to access high-dimensional data in the biosciences
+
+In this workshop, we will look at statistical methods that can be used to
+visualise and analyse high-dimensional biological data using packages available
+from Bioconductor, open source software for analysing high throughput genomic
+data. Bioconductor contains useful packages and example datasets as shown on the
+website [https://www.bioconductor.org/](https://www.bioconductor.org/).
+
+Bioconductor packages can be installed and used in `R` using the **`BiocManager`**
+package. Let's load the **`minfi`** package from Bioconductor (a package for
+analysing Illumina Infinium DNA methylation arrays).
+
+```{r libminfi}
+library("minfi")
+```
+
+```{r vigminfi, eval=FALSE}
+browseVignettes("minfi")
+```
+
+We can explore these packages by browsing the vignettes provided in
+Bioconductor.
Bioconductor has various packages that can be used to load and
+examine datasets in `R` that have been made available in Bioconductor, usually
+along with an associated paper or package.
+
+Next, we load the `methylation` dataset which represents data collected using
+Illumina Infinium methylation arrays which are used to examine methylation
+across the human genome. These data include information collected from the
+assay as well as associated metadata from individuals from whom samples were
+taken.
+
+```{r libsload}
+library("here")
+library("ComplexHeatmap")
+
+methylation <- readRDS(here("data/methylation.rds"))
+head(colData(methylation))
+
+methyl_mat <- t(assay(methylation))
+## calculate correlations between features (columns of methyl_mat)
+cor_mat <- cor(methyl_mat)
+```
+
+```{r view-cor, eval=FALSE}
+cor_mat[1:10, 1:10] # print the top-left corner of the correlation matrix
+```
+
+The `assay()` function creates a matrix-like object where rows represent probes
+(here, methylation sites) and columns represent samples. We calculate correlations
+between features in the `methylation` dataset and examine the first 100 cells of this
+matrix. The size of the dataset makes it difficult to examine in full, a
+common challenge in analysing high-dimensional genomics data.
+
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+## Further reading
+
+- Bühlmann, P. \\& van de Geer, S. (2011) Statistics for High-Dimensional Data. Springer, London.
+- [Bühlmann, P., Kalisch, M. \\& Meier, L. (2014) High-dimensional statistics with a view toward applications in biology. Annual Review of Statistics and Its Application](https://doi.org/10.1146/annurev-statistics-022513-115545).
+- Johnstone, I.M. \\& Titterington, D.M. (2009) Statistical challenges of high-dimensional data. Philosophical Transactions of the Royal Society A 367:4237-4253.
+- [Bioconductor methylation array analysis vignette](https://www.bioconductor.org/packages/release/workflows/vignettes/methylationArrayAnalysis/inst/doc/methylationArrayAnalysis.html).
+- The *Introduction to Machine Learning with Python* course covers additional
+  methods that could be used to analyse high-dimensional data. See
+  [Introduction to machine learning](https://carpentries-incubator.github.io/machine-learning-novice-python/),
+  [Tree models](https://carpentries-incubator.github.io/machine-learning-trees-python/) and
+  [Neural networks](https://carpentries-incubator.github.io/machine-learning-neural-python/).
+  Some related (and important!) content is also available in
+  [Responsible machine learning](https://carpentries-incubator.github.io/machine-learning-responsible-python/).
+
+## Other resources suggested by former students
+
+- [Josh Starmer's](https://www.youtube.com/c/joshstarmer) YouTube channel.
+
+
+
+:::::::::::::::::::::::::::::::::::::::: keypoints
+
+- High-dimensional data are data in which the number of features, $p$, is close to or larger than the number of observations, $n$.
+- These data are becoming more common in the biological sciences due to increases in data storage capabilities and computing power.
+- Standard statistical methods, such as linear regression, run into difficulties when analysing high-dimensional data.
+- In this workshop, we will explore statistical methods used for analysing high-dimensional data using datasets available on Bioconductor.
+ +:::::::::::::::::::::::::::::::::::::::::::::::::: + + diff --git a/episodes/02-high-dimensional-regression.Rmd b/episodes/02-high-dimensional-regression.Rmd new file mode 100644 index 00000000..dd20e9ba --- /dev/null +++ b/episodes/02-high-dimensional-regression.Rmd @@ -0,0 +1,1201 @@ +--- +title: Regression with many outcomes +source: Rmd +teaching: 60 +exercises: 30 +math: yes +editor_options: + markdown: + wrap: 72 +--- + +::::::::::::::::::::::::::::::::::::::: objectives + +- Perform and critically analyse high dimensional regression. +- Understand methods for shrinkage of noise parameters in high-dimensional regression. +- Perform multiple testing adjustment. + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::: questions + +- How can we apply linear regression in a high-dimensional setting? +- How can we benefit from the fact that we have many outcomes? +- How can we control for the fact that we do many tests? + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +```{r settings, include=FALSE} +library("here") +source(here("bin/chunk-options.R")) +``` + +## DNA methylation data + +For the following few episodes, we will be working with human DNA +methylation data from flow-sorted blood samples. DNA methylation assays +measure, for each of many sites in the genome, the proportion of DNA +that carries a methyl mark (a chemical modification that does not alter the +DNA sequence). In this case, the methylation data come in +the form of a matrix of normalised methylation levels (M-values), where negative +values correspond to unmethylated DNA and positive values correspond to +methylated DNA. Along with this, we have a number of sample phenotypes +(eg, age in years, BMI). + +Let's read in the data for this episode: + +```{r loadmethy} +library("here") +library("minfi") +methylation <- readRDS(here("data/methylation.rds")) +``` + +Note: the code that we used to download these data from its source is available +[here](https://github.com/carpentries-incubator/high-dimensional-stats-r/blob/main/data/methylation.R) + +This `methylation` object is a `GenomicRatioSet`, a Bioconductor data +object derived from the `SummarizedExperiment` class. These +`SummarizedExperiment` objects contain `assay`s, in this case +normalised methylation levels, and optional sample-level `colData` and +feature-level `metadata`. These objects are very convenient to contain +all of the information about a dataset in a high-throughput context. If +you would like more detail on these objects it may be useful to consult +the [vignettes on +Bioconductor](https://www.bioconductor.org/packages/release/bioc/vignettes/SummarizedExperiment/inst/doc/SummarizedExperiment.html). + +```{r showmethy} +methylation +``` + +You can see in this output that this object has a `dim()` of +$`r nrow(methylation)` \\times `r ncol(methylation)`$, meaning it has +`r nrow(methylation)` features and `r ncol(methylation)` columns. To +extract the matrix of methylation M-values, we can use the +`assay()` function. One thing to bear in mind with these objects (and +data structures for computational biology in R generally) is that in the +matrix of methylation data, samples or observations are stored as +columns, while features (in this case, sites in the genome) are stored as rows. +This is in contrast to usual tabular data, where features or variables +are stored as columns and observations are stored as rows. 
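+
+As a quick check of this orientation (a small sketch using the `methylation`
+object loaded above), we can confirm that the assay columns line up with the
+rows of the sample-level metadata:
+
+```{r orientation-check, eval=FALSE}
+## features (methylation sites) are rows; samples are columns
+dim(assay(methylation))
+## each assay column corresponds to one row of colData();
+## this should be TRUE for a well-formed SummarizedExperiment
+identical(colnames(assay(methylation)), rownames(colData(methylation)))
+```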
+
+```{r grabx}
+methyl_mat <- assay(methylation)
+```
+
+The distribution of these M-values looks like this:
+
+```{r histx, fig.cap="Methylation levels are generally bimodally distributed.", fig.alt="Histogram of M-values for all features. The distribution appears to be bimodal, with a large number of unmethylated features as well as many methylated features, and many intermediate features."}
+hist(methyl_mat, breaks = "FD", xlab = "M-value")
+```
+
+You can see that there are two peaks in this distribution, corresponding
+to features which are largely unmethylated and methylated, respectively.
+
+Similarly, we can examine the `colData()`, which represents the
+sample-level metadata we have relating to these data. In this case, the
+metadata, phenotypes, and groupings in the `colData` look like this for
+the first 6 samples:
+
+```{r datatable}
+knitr::kable(head(colData(methylation)), row.names = FALSE)
+```
+
+In this episode, we will focus on the association between age and
+methylation. The following heatmap summarises the age and methylation levels
+available in the `methylation` dataset:
+
+```{r heatmap, fig.cap="Visualising the data as a heatmap, it's clear that there are too many models to fit 'by hand'.", fig.alt="Heatmap of methylation values across all features. Samples are ordered according to age."}
+age <- methylation$Age
+
+library("ComplexHeatmap")
+order <- order(age)
+age_ord <- age[order]
+methyl_mat_ord <- methyl_mat[, order]
+
+Heatmap(methyl_mat_ord,
+        name = "M-value",
+        cluster_columns = FALSE,
+        show_row_names = FALSE,
+        show_column_names = FALSE,
+        row_title = "Feature",
+        column_title = "Sample",
+        top_annotation = columnAnnotation(age = age_ord))
+```
+
+Depending on the scientific question of interest, two types of high-dimensional
+problems could be explored in this context:
+
+1. To predict age using methylation levels as predictors. In this case, we would
+   have a single outcome (age) which will be predicted using 5000 covariates
+   (methylation levels across the genome).
+
+2. To predict methylation levels using age as a predictor. In this case, we
+   would have 5000 outcomes (methylation levels across the genome) and a single
+   covariate (age).
+
+The examples in this episode will focus on the second type of problem, whilst
+the next episode will focus on the first.
+
+::::::::::::::::::::::::::::::::::::::: challenge
+
+### Challenge 1
+
+Why can we not just fit many linear regression models, one for each of the columns
+in the `colData` above against each of the features in the matrix of
+assays, and choose all of the significant results at a p-value of
+0\.05?
+
+::::::::::::::: solution
+
+### Solution
+
+There are a number of problems that this kind of approach presents.
+For example:
+
+1. Without a research question in mind when creating a model, it's not clear
+   how we can interpret each model, and rationalising the results after the
+   fact can be dangerous; it's easy to make up a "story" that isn't grounded
+   in anything but the fact that we have significant findings.
+2. We may not have a representative sample for each of these covariates.
+   For example, we may have very small sample sizes for some ethnicities,
+   leading to spurious findings.
+3. If we perform `r nrow(methylation)` tests for each of
+   `r ncol(colData(methylation))` variables, even if there were no true
+   associations in the data, we'd be likely to observe some strong
+   spurious associations that arise just from random noise.
+ +::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +::::::::::::::::::::::::::::::::::::::::: callout + +### Measuring DNA Methylation + +DNA methylation is an epigenetic modification of DNA. Generally, we +are interested in the proportion of methylation at many sites or +regions in the genome. DNA methylation microarrays, as we are using +here, measure DNA methylation using two-channel microarrays, where one +channel captures signal from methylated DNA and the other captures +unmethylated signal. These data can be summarised as "Beta values" +($\\beta$ values), which is the ratio of the methylated signal to the +total signal (methylated plus unmethylated). The $\\beta$ value for +site $i$ is calculated as + +$$ +\beta_i = \frac{ +m_i +} { +u_{i} + m_{i} +} +$$ + +where $m\_i$ is the methylated signal for site $i$ and $u\_i$ is the +unmethylated signal for site $i$. $\\beta$ values take on a value in +the range $[0, 1]$, with 0 representing a completely unmethylated site +and 1 representing a completely methylated site. + +The M-values we use here are the $\\log\_2$ ratio of methylated versus +unmethylated signal: + +$$ +M_i = \log_2\left(\frac{m_i}{u_i}\right) +$$ + +M-values are not bounded to an interval as Beta values are, and +therefore can be easier to work with in statistical models. + + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +## Regression with many outcomes + +In high-throughput studies, it is common to have one or more phenotypes +or groupings that we want to relate to features of interest (eg, gene +expression, DNA methylation levels). In general, we want to identify +differences in the features of interest that are related to a phenotype +or grouping of our samples. Identifying features of interest that vary +along with phenotypes or groupings can allow us to understand how +phenotypes arise or manifest. Analysis of this type is sometimes referred +to using the term *differential analysis*. + +For example, we might want to identify genes that are expressed at a +higher level in mutant mice relative to wild-type mice to understand the +effect of a mutation on cellular phenotypes. Alternatively, we might +have samples from a set of patients, and wish to identify epigenetic +features that are different in young patients relative to old patients, +to help us understand how ageing manifests. + +Using linear regression, it is possible to identify differences like +these. However, high-dimensional data like the ones we're working with +require some special considerations. A primary consideration, as we saw +above, is that there are far too many features to fit each one-by-one as +we might do when analysing low-dimensional datasets (for example using +`lm` on each feature and checking the linear model assumptions). A +secondary consideration is that statistical approaches may behave +slightly differently in very high-dimensional data, compared to +low-dimensional data. A third consideration is the speed at which we can +actually compute statistics for data this large -- methods optimised for +low-dimensional data may be very slow when applied to high-dimensional +data. 
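+
+To get a rough sense of this last point, we could time a naive loop that fits
+one model per feature (a sketch only; timings will vary by machine, and we
+will see better approaches later in this episode):
+
+```{r naive-timing, eval=FALSE}
+## fit one lm() per row of the methylation matrix and time it
+system.time(
+  fits <- apply(methyl_mat, 1, function(feature) lm(feature ~ age))
+)
+```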
+ +Ideally when performing regression, we want to identify cases like this, +where there is a clear association, and we probably "don't need" +statistics: + +```{r example1, echo=FALSE, fig.cap="A scatter plot of age and a feature of interest.", fig.alt="An example of a strong linear association between a continuous phenotype (age) on the x-axis and a feature of interest (DNA methylation at a given locus) on the y-axis. A strong linear relationship with a positive slope exists between the two.", fig.width=6, fig.height=6} +library("ggplot2") +theme_set(theme_bw()) +set.seed(42) +n <- 10 +x <- c(rnorm(n, 0), rnorm(n, 3)) +group <- (2 * x) + rnorm(n) +ggplot() + + aes(x = group, y = x) + + geom_point() + + labs(x = "Age", y = "DNA methylation") +``` + +or equivalently for a discrete covariate: + +```{r example2, echo=FALSE, fig.cap="A scatter plot of a grouping and a feature of interest.", fig.alt="An example of a strong linear association between a discrete phenotype (group) on the x-axis and a feature of interest (DNA methylation at a given locus) on the y-axis. The two groups clearly differ with respect to DNA methylation.", fig.width=6, fig.height=6} +library("ggplot2") +set.seed(42) +n <- 10 +x <- c(rnorm(n, 0), rnorm(n, 5)) +group <- c(rep("A", n), rep("B", n)) +ggplot() + + aes(x = group, y = x, colour = group) + + # geom_violin() + + # geom_boxplot(width = 0.25) + + geom_jitter(height = 0, width = 0.2) + + labs(y = "DNA methylation") +``` + +However, often due to small differences and small sample sizes, the +problem is more difficult: + +```{r example3, echo=FALSE, fig.cap="A scatter plot of a grouping and a feature of interest.", fig.alt="An example of a strong linear association between a discrete phenotype (group) on the x-axis and a feature of interest (DNA methylation at a given locus) on the y-axis. The two groups seem to differ with respect to DNA methylation, but the relationship is weak.", fig.width=6, fig.height=6} +library("ggplot2") +set.seed(66) +n <- 5 +x <- c(rnorm(n, 0), rnorm(n, 1)) +group <- c(rep("A", n), rep("B", n)) +ggplot() + + aes(x = group, y = x, colour = group) + + # geom_violin() + + # geom_boxplot(width = 0.25) + + geom_jitter(height = 0, width = 0.2) + + labs(y = "DNA methylation") +``` + +And, of course, we often have an awful lot of features and need to +prioritise a subset of them! We need a rigorous way to prioritise genes +for further analysis. + +## Fitting a linear model + +So, in the data we have read in, we have a matrix of methylation values +$X$ and a vector of ages, $y$. One way to model this is to see if we can +use age to predict the expected (average) methylation value for sample +$j$ at a given locus $i$, which we can write as $X\_{ij}$. We can write +that model as: + +$$ +\mathbf{E}(X_{ij}) = \beta_0 + \beta_1 \text{Age}_j +$$ + +where $\\text{Age}\_j$ is the age of sample $j$. In this model, $\\beta\_1$ +represents the unit change in mean methylation level for each unit +(year) change in age. For a specific CpG, we can fit this model and get more +information from the model object. For illustration purposes, here we +arbitrarily select the first CpG in the `methyl_mat` matrix (the one on its first row). 
+
+```{r fit1}
+age <- methylation$Age
+# methyl_mat[1, ] indicates that the 1st CpG will be used as outcome variable
+lm_age_methyl1 <- lm(methyl_mat[1, ] ~ age)
+lm_age_methyl1
+```
+
+We now have estimates for the expected methylation level when age equals
+0 (the intercept) and the change in methylation level for a unit change
+in age (the slope). We could plot this linear model:
+
+```{r plot-lm-methyl1, fig.cap="A scatter plot of age versus the methylation level for an arbitrarily selected CpG site (the one stored in the first row of methyl_mat). Each dot represents an individual. The black line represents the estimated linear model.", fig.alt="An example of the relationship between age (x-axis) and methylation levels (y-axis) for an arbitrarily selected CpG. In this case, the y-axis shows methylation levels for the first CpG in our data. The black line shows the fitted regression line (based on the intercept and slope estimates shown above). For this feature, we can see that there is no strong relationship between methylation and age."}
+plot(age, methyl_mat[1, ], xlab = "Age", ylab = "Methylation level", pch = 16)
+abline(lm_age_methyl1)
+```
+
+For this feature, we can see that there is no strong relationship
+between methylation and age. We could try to repeat this for every
+feature in our dataset; however, we have a lot of features! We need an
+approach that allows us to assess associations between all of these
+features and our outcome while addressing the three considerations we
+outlined previously. Before we introduce this approach, let's go into
+detail about how we generally check whether the results of a linear
+model are statistically significant.
+
+## Hypothesis testing in linear regression
+
+Using the linear model we defined above, we can ask questions based on the
+estimated value for the regression coefficients. For example, do individuals
+with different ages have different methylation values for a given CpG? We usually
+do this via *hypothesis testing*. This framework compares the results that we
+observed (here, estimated linear model coefficients) to the results we would
+expect under a *null hypothesis* associated to our question. In the example above,
+a suitable null hypothesis would test whether the regression coefficient associated
+to age ($\\beta\_1$) is equal to zero or not. If $\\beta\_1$ is equal to zero,
+the linear model indicates that there is no linear relationship between age
+and the methylation level for the CpG (remember: as its name suggests, linear
+regression can only be used to model linear relationships between predictors and
+outcomes!). In other words, the answer to our question would be: no!
+
+The output of a linear model typically returns the results associated
+with the null hypothesis described above (this may not always be the most realistic
+or useful null hypothesis, but it is the one we have by default!). To be
+more specific, the test compares our observed results with a set of
+hypothetical counter-examples of what we would expect to observe if we repeated
+the same experiment and analysis over and over again under the null hypothesis.
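+
+One way to build intuition for this idea is a small simulation (a sketch, not
+part of the analysis): generate many artificial outcomes that truly have no
+association with age, refit the model each time, and look at the spread of
+slope estimates that arises by chance alone:
+
+```{r null-sim-sketch, eval=FALSE}
+set.seed(42)
+## slope estimates from 1000 datasets simulated under the null hypothesis
+null_slopes <- replicate(
+  1000,
+  coef(lm(rnorm(length(age)) ~ age))[[2]]
+)
+hist(null_slopes, xlab = "Slope estimate under the null hypothesis")
+```
+
+The observed slope is then compared against this kind of distribution of
+"nothing going on" results.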
+ +For this linear model, we can use `tidy()` from the **`broom`** package to +extract detailed information about the coefficients and the associated +hypothesis tests in this model: + +```{r tidyfit} +library("broom") +tidy(lm_age_methyl1) +``` + +The standard errors (`std.error`) represent the statistical uncertainty in our +regression coefficient estimates (often referred to as *effect size*). The test +statistics and p-values represent measures of how (un)likely it would be to observe +results like this under the "null hypothesis". + +::::::::::::::::::::::::::::::::::::::: challenge + +### Challenge 2 + +In the model we fitted, the estimate for the intercept is 0.902 and its associated +p-value is 0.0129. What does this mean? + +::::::::::::::: solution + +### Solution + +The first coefficient in a linear model like this is the intercept, which measures +the mean of the outcome (in this case, the methylation value for the first CpG) +when age is zero. In this case, the intercept estimate is 0.902. However, this is +not a particularly noteworthy finding as we do not have any observations with age +zero (nor even any with age \< 20!). + +The reported p-value is associated to the following null hypothesis: +the intercept ($\\beta\_0$ above) is equal to zero. Using the usual +significance threshold of 0.05, we reject the null hypothesis as +the p-value is smaller than 0.05. However, it is not really interesting +if this intercept is zero or not, since we probably do not care what the +methylation level is when age is zero. In fact, this question does not +even make much sense! In this example, we are more interested +in the regression coefficient associated to age, as that can tell us +whether there is a linear relationship between age and methylation for the CpG. + +::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +## Fitting a lot of linear models + +In the linear model above, we are generally interested in the second regression +coefficient (often referred to as *slope*) which measures the linear relationship +between age and methylation levels. For the first CpG, here is its estimate: + +```{r tidyfit2} +coef_age_methyl1 <- tidy(lm_age_methyl1)[2, ] +coef_age_methyl1 +``` + +In this case, the p-value is equal to 0.381 and therefore we cannot reject the null +hypothesis: there is no statistical evidence to suggest that the regression +coefficient associated to age is not equal to zero. + +Now, we could do this for every feature (CpG) in the dataset and rank the +results based on their test statistic or associated p-value. However, fitting +models in this way to `r nrow(methylation)` features is not very computationally +efficient, and it would also be laborious to do programmatically. There are ways +to get around this, but first let us talk about what exactly we are doing when +we look at significance tests in this context. 
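+
+(For concreteness, the brute-force version of this "one model per feature"
+approach would look something like the sketch below; we will return to much
+more efficient alternatives shortly.)
+
+```{r per-feature-sketch, eval=FALSE}
+## naive approach: fit one model per feature and keep the age coefficient
+fit_feature <- function(i) {
+  tidy(lm(methyl_mat[i, ] ~ age))[2, ]
+}
+coef_df <- do.call(rbind, lapply(seq_len(nrow(methyl_mat)), fit_feature))
+coef_df$feature <- rownames(methyl_mat)
+```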
+
+```{r rbindfits, echo=FALSE}
+# Instead of repeating this model fitting and coefficient extraction by hand, we
+# could write a function to fit this kind of model for any given row of the
+# matrix:
+
+lm_feature <- function(i, methyl_mat, age) {
+  tidy(lm(methyl_mat[i, ] ~ age))[2, ]
+}
+
+coefs <- lapply(
+  seq_len(nrow(methyl_mat)),
+  lm_feature,
+  age = age,
+  methyl_mat = methyl_mat
+)
+## bind together all of our small tables to make one big table
+coef_df <- do.call(rbind, coefs)
+## add a "feature name" column
+coef_df$feature <- rownames(methyl_mat)
+# plot(coef_df$estimate, -log10(coef_df$p.value),
+#   xlab = "Effect size", ylab = bquote(-log[10](p)),
+#   pch = 19
+# )
+```
+
+## How does hypothesis testing for a linear model work?
+
+In order to decide whether a result would be unlikely under the null
+hypothesis, we must calculate a test statistic. For coefficient $k$ in a
+linear model (in our case, it would be the slope), the test statistic is
+a t-statistic given by:
+
+$$
+t_{k} = \frac{\hat{\beta}_{k}}{SE\left(\hat{\beta}_{k}\right)}
+$$
+
+$SE\left(\hat{\beta}_{k}\right)$ measures the uncertainty we have in our
+effect size estimate. Knowing what distribution these t-statistics
+follow under the null hypothesis allows us to determine how unlikely it
+would be for us to observe what we have under those circumstances, if we
+repeated the experiment and analysis over and over again. To
+demonstrate, we can compute the t-statistics "by hand" (advanced content).
+
+```{r simfit}
+table_age_methyl1 <- tidy(lm_age_methyl1)
+```
+
+We can see that the t-statistic is just the ratio between the coefficient estimate
+and the standard error:
+
+```{r simtval}
+tvals <- table_age_methyl1$estimate / table_age_methyl1$std.error
+all.equal(tvals, table_age_methyl1$statistic)
+```
+
+Calculating the p-values is a bit more tricky. Specifically, a p-value is the
+proportion of the distribution of the test statistic under the null
+hypothesis that is *as extreme or more extreme* than the observed value
+of the test statistic. This is easy to observe visually, by plotting the
+theoretical distribution of the test statistic under the null hypothesis
+(see the next callout box for more details):
+
+```{r tdist, echo=FALSE, fig.cap="The p-value for a regression coefficient represents how often it'd be observed under the null.", fig.alt="Density plot of a t-distribution showing the observed test statistics (here, t-statistics). The p-values, visualised here with shaded regions, represent the portion of the null distribution that is as extreme or more extreme as the observed test statistics, which are shown as dashed lines."}
+ggplot() +
+  geom_function(fun = function(x) dt(x, df = lm_age_methyl1$df)) +
+  xlim(-4, 4) +
+  geom_vline(
+    aes(xintercept = abs(tvals), color = c("Intercept", "Slope")),
+    lty = "dashed"
+  ) +
+  stat_function(fun = function(x) dt(x, df = lm_age_methyl1$df),
+    aes(fill = "Intercept"),
+    xlim = c(abs(tvals)[[1]], 4),
+    alpha = 0.25,
+    geom = "area"
+  ) +
+  stat_function(fun = function(x) dt(x, df = lm_age_methyl1$df),
+    aes(fill = "Slope"),
+    xlim = c(abs(tvals)[[2]], 4),
+    alpha = 0.25,
+    geom = "area"
+  ) +
+  scale_color_discrete("Parameter", aesthetics = c("fill", "colour")) +
+  labs(x = "t-statistic", y = "Density")
+```
+
+The red-ish shaded region represents the portion of the distribution of
+the test statistic under the null hypothesis that is equal to or greater than
+the value we observe for the intercept term. 
As our null hypothesis
+relates to a two-tailed test (the null hypothesis states that the regression
+coefficient is equal to zero, so we would reject it if the regression
+coefficient is substantially larger **or** smaller than zero), the p-value for
+the test is twice the value of the shaded region. In this case, the shaded region
+is small relative to the total area of the null distribution; therefore, the
+p-value is small ($p=`r round(table_age_methyl1$p.value[[1]], digits = 3)`$). The blue-ish shaded region represents the same measure for the slope term;
+this is larger, relative to the total area of the distribution, therefore the
+p-value is larger than the one for the intercept term
+($p=`r round(table_age_methyl1$p.value[[2]], digits = 3)`$). The
+p-value is a function of the test statistic: the ratio between the effect size
+we're estimating and the uncertainty we have in that effect. A large effect with large
+uncertainty may not lead to a small p-value, and a small effect with
+small uncertainty may lead to a small p-value.
+
+::::::::::::::::::::::::::::::::::::::::: callout
+
+### Calculating p-values from a linear model
+
+Manually calculating the p-value for a linear model is a little bit
+more complex than calculating the t-statistic. The intuition presented
+above is definitely sufficient for most cases, but for completeness,
+here is how we do it:
+
+Since the statistic in a linear model is a t-statistic, it follows a
+Student's t-distribution under the null hypothesis, with degrees of
+freedom (a parameter of the Student's t-distribution) given by the
+number of observations minus the number of coefficients fitted, in
+this case
+$`r ncol(methylation)` - `r length(coef(lm_age_methyl1))` = `r lm_age_methyl1$df`$.
+We want to know what portion of the distribution function of the test
+statistic is as extreme as, or more extreme than, the value we observed.
+The function `pt()` (similar to `pnorm()`, etc.) can give us this information.
+
+Since we're not sure if the coefficient will be larger or smaller than
+zero, we want to do a two-tailed test. Therefore, we take the absolute
+value of the t-statistic, and look at the upper rather than the lower
+tail. In the figure above, the shaded areas only cover "half" of the
+t-distribution (which is symmetric around zero), so we multiply the
+shaded area by 2 in order to calculate the p-value.
+
+Combining all of this gives us:
+
+```{r simpval}
+pvals <- 2 * pt(abs(tvals), df = lm_age_methyl1$df, lower.tail = FALSE)
+all.equal(table_age_methyl1$p.value, pvals)
+```
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+## Sharing information across outcome variables
+
+Now that we understand how hypothesis tests work in the
+linear model framework, we are going to introduce an idea that allows us to
+take advantage of the fact that we carry out many tests at once on
+structured data. We can leverage this fact to *share information*
+between model parameters. The insight that we use to perform
+*information pooling* or sharing is derived from our knowledge about the
+structure of the data. For example, in a high-throughput experiment like
+a DNA methylation assay, we know that all of the features were measured
+simultaneously, using the same technique. This means that, generally, we
+expect the base-level variability for each feature to be broadly
+similar.
+
+This can enable us to get a better estimate of the uncertainty of model
+parameters than we could get if we considered each feature in isolation.
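+
+As a rough check of this assumption, we could look at how variable each feature
+is. A minimal sketch (assuming, as above, that `methyl_mat` stores features as
+rows and samples as columns):
+
+```{r feature-sds, fig.cap="Histogram of per-feature standard deviations of methylation levels.", fig.alt="A histogram of the standard deviation of methylation levels for each feature, showing that most features have broadly similar levels of variability."}
+feature_sds <- apply(methyl_mat, 1, sd) # standard deviation of each feature
+summary(feature_sds)
+hist(feature_sds, breaks = 50,
+  xlab = "Per-feature standard deviation", main = ""
+)
+```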
+
+Sharing information between features allows us to get more robust
+estimators. Remember that the t-statistic for coefficient $\beta_k$ in a
+linear model is the ratio between the coefficient estimate and its standard
+error:
+
+$$
+t_{k} = \frac{\hat{\beta}_{k}}{SE\left(\hat{\beta}_{k}\right)}
+$$
+
+It is clear that large effect sizes will likely lead to small p-values,
+as long as the standard error for the coefficient is not large. However,
+the standard error is affected by the amount of noise, as we saw
+earlier. If we have a small number of observations, it is common for the
+estimated noise level for some features to be extremely small simply by chance.
+This, in turn, causes small p-values for these features, which may give us unwarranted
+confidence in the level of certainty we have in the results (false positives).
+
+There are many statistical methods in genomics that use this type of
+approach to get better estimates by pooling information between features
+that were measured simultaneously using the same techniques. Here we
+will focus on the package **`limma`**, which is an established software
+package used to fit linear models, originally for the gene expression
+microarrays that were common in the 2000s, but which is still in use for
+RNAseq experiments, among others. The authors of **`limma`** made some
+assumptions about the distributions that these noise levels follow, and pool
+information across genes to get a better estimate of the uncertainty in
+effect size estimates. The package uses the idea that noise levels should be
+similar between features to *moderate* the estimates of the test
+statistic by shrinking the estimates of standard errors towards a common
+value. This results in a *moderated t-statistic*.
+
+The process of running a model in **`limma`** is somewhat different to what you
+may have seen when running linear models. Here, we define a *model matrix* or
+*design matrix*, which is a way of representing the
+coefficients that should be fitted in each linear model. These are used in
+similar ways in many different modelling libraries.
+
+```{r design-age}
+library("limma")
+design_age <- model.matrix(~age)
+dim(design_age)
+head(design_age)
+```
+
+::::::::::::::::::::::::::::::::::::::::: callout
+
+### What is a model matrix?
+
+When R fits a regression model, it chooses a vector of regression coefficients
+that minimises the differences between the outcome values and the values
+predicted using the covariates (or predictor variables). But how do we get
+from a set of predictors and regression coefficients to predicted values? This
+is done via matrix multiplication. The matrix of predictors is (matrix)
+multiplied by the vector of coefficients. That matrix is called the
+**model matrix** (or design matrix). It has one row for each observation and
+one column for each predictor, plus (by default) one additional column of ones
+(the intercept column). Many R libraries (but not **`limma`**) construct the
+model matrix behind the scenes. Usually, it can be extracted from a model fit
+using the function `model.matrix()`. Here is an example:
+
+```{r}
+data(cars)
+head(cars)
+mod1 <- lm(dist ~ speed, data = cars) # fit regression model using speed as a predictor
+head(model.matrix(mod1)) # the model matrix contains two columns: intercept and speed
+```
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+As you can see, the design matrix has the same number of rows as our
+methylation data has samples. 
It also has two columns - one for the
+intercept (similar to the linear model we fit above) and one for age.
+This happens "under the hood" when fitting a linear model with `lm()`, but
+here we have to specify it directly. The [limma user
+manual](https://www.bioconductor.org/packages/release/bioc/vignettes/limma/inst/doc/usersguide.pdf)
+has more detail on how to make design matrices for different types of
+experimental design, but here we are going to stick with this simple two-variable case.
+
+We then pass our matrix of methylation values into `lmFit()`, specifying
+the design matrix. Internally, this function runs `lm()` on each row of
+the data in an efficient way. The function `eBayes()`, when applied to the
+output of `lmFit()`, performs the pooled estimation of standard errors
+that results in the moderated t-statistics and the corresponding p-values.
+
+```{r lmfit-age}
+fit_age <- lmFit(methyl_mat, design = design_age)
+fit_age <- eBayes(fit_age)
+```
+
+To obtain the results of the linear models, we can use the `topTable()`
+function. By default, this returns results for the first coefficient in
+the model. As we saw above when using `lm()`, and when we defined
+`design_age`, the first coefficient relates to the intercept term,
+which we are not particularly interested in here; therefore we specify
+`coef = 2`. Further, `topTable()` by default only returns the top 10
+results. To see all of the results in the data, we specify
+`number = nrow(fit_age)` to ensure that it returns a row for every row
+of the input matrix.
+
+```{r ebayes-toptab}
+toptab_age <- topTable(fit_age, coef = 2, number = nrow(fit_age))
+orderEffSize <- rev(order(abs(toptab_age$logFC))) # order by effect size (absolute log-fold change)
+head(toptab_age[orderEffSize, ])
+```
+
+The output of `topTable()` includes the coefficient, here termed a log
+fold change (`logFC`), the average level (`AveExpr`), the t-statistic (`t`),
+the p-value (`P.Value`), and the *adjusted* p-value (`adj.P.Val`). We'll
+cover what an adjusted p-value is very shortly. The table also includes
+`B`, which represents the log-odds that a feature is significantly
+different; we won't cover this here, but it will generally be a 1-1
+transformation of the p-value. The coefficient estimates here are termed
+`logFC` for legacy reasons relating to how microarray experiments were
+traditionally performed. There are more details on this topic in many
+places, for example [this tutorial by Kasper D.
+Hansen](https://kasperdanielhansen.github.io/genbioconductor/html/limma.html).
+
+Now we have estimates of effect sizes and p-values for the association
+between methylation level at each locus and age for our 37 samples. It's
+useful to create a plot of effect size estimates (model coefficients)
+against p-values for each of these linear models, to visualise the
+magnitude of effects and the statistical significance of each. These
+plots are often called "volcano plots", because they resemble an
+eruption.
+
+```{r limmavolc1, fig.cap="Plotting p-values against effect sizes using limma; the results are similar to a standard linear model.", fig.alt="A plot of -log10(p) against effect size estimates for a regression of age against methylation using limma."}
+plot(toptab_age$logFC, -log10(toptab_age$P.Value),
+  xlab = "Effect size", ylab = bquote(-log[10](p-value)),
+  pch = 19
+)
+```
+
+In this figure, every point represents a feature of interest. 
The x-axis
+represents the effect size observed for that feature in a linear model,
+while the y-axis is the $-\log_{10}(\text{p-value})$, where larger
+values indicate increasing statistical evidence of a non-zero effect
+size. A positive effect size represents increasing methylation with
+increasing age, and a negative effect size represents decreasing
+methylation with increasing age. Points higher on the y-axis represent
+features for which we think the results we observed would be very
+unlikely under the null hypothesis.
+
+Since we want to identify features whose methylation levels are associated
+with age, in an ideal case there would be clear
+separation between "null" and "non-null" features. However, usually we
+observe results as we do here: there is a continuum of effect sizes and
+p-values, with no clear separation between these two classes of
+features. While statistical methods exist to derive insights from
+continuous measures like these, it is often convenient to obtain a list
+of features which we are confident have non-zero effect sizes. This is
+made more difficult by the number of tests we perform.
+
+::::::::::::::::::::::::::::::::::::::: challenge
+
+### Challenge 3
+
+The effect size estimates are very small, and yet many of the p-values
+are well below a usual significance level of p \< 0.05. Why is this?
+
+::::::::::::::: solution
+
+### Solution
+
+Because age has a much larger range than methylation levels, the
+change in methylation level per unit change in age is very small,
+even for a strong relationship!
+
+As we mentioned, the p-value is a function of both the effect size
+estimate and the uncertainty (standard error) of that estimate.
+Because the uncertainty in our estimates is much smaller than the
+estimates themselves, the p-values are also small.
+
+If we predicted age using methylation level, it is likely we would see
+much larger coefficients, though broadly similar p-values!
+
+:::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+It is worth considering what effect the *moderation*,
+or information sharing, that **`limma`** performs has on our results. To do
+this, let us compare the effect size estimates and p-values from the two
+approaches.
+
+```{r plot-limma-lm-effect, echo=FALSE}
+plot(
+  coef_df[["estimate"]],
+  toptab_age[coef_df[["feature"]], "logFC"],
+  pch = 16,
+  main = "Comparison of effect sizes from limma and lm",
+  xlab = "Effect size from lm",
+  ylab = "Effect size from limma"
+)
+abline(0:1, lty = "dashed")
+```
+
+These are exactly identical! This is because **`limma`** does not perform
+any sharing of information when estimating effect sizes. This is in
+contrast to similar packages that apply shrinkage to the effect size
+estimates, like **`DESeq2`**. These often use information sharing to shrink
+or moderate the effect size estimates, in the case of **`DESeq2`** by again
+sharing information between features about sample-to-sample variability.
+In contrast, let us look at the p-values from **`limma`** and R's built-in `lm()` function:
+
+```{r plot-limma-lm-pval, echo=FALSE}
+plot(
+  coef_df[["p.value"]],
+  toptab_age[coef_df[["feature"]], "P.Value"],
+  pch = 16,
+  main = "Comparison of p-values from limma and lm",
+  xlab = "p-value from lm",
+  ylab = "p-value from limma",
+  log = "xy"
+)
+abline(0:1, lty = "dashed")
+```
+
+We can see that for the vast majority of features, the results are
+broadly similar. 
There seems to be a minor general tendency for **`limma`**
+to produce smaller p-values, but for several features, the p-values from
+**`limma`** are considerably larger than the p-values from `lm()`. This is
+because the information sharing tends to shrink large standard error
+estimates downwards and small estimates upwards. When apparent
+statistical significance is driven by an abnormally small standard error
+rather than a large effect, this shrinkage results in a marked
+reduction in statistical significance; this behaviour has been shown to perform
+well in case studies. The degree of shrinkage generally depends on the
+amount of pooled information and the strength of the evidence
+independent of pooling. For example, with very few samples and many
+features, information sharing has a larger effect, because there are a
+lot of genes that can be used to provide pooled estimates, and the
+per-feature evidence that this is weighed against is relatively
+sparse. In contrast, when there are many samples and few features, there
+is not much opportunity to generate pooled estimates, and the evidence
+of the data can easily outweigh the pooling.
+
+Shrinkage methods like these can be complex to implement and
+understand, but it is useful to develop an intuition about why these approaches may be more
+precise and sensitive than the naive approach of fitting a model to each
+feature separately.
+
+::::::::::::::::::::::::::::::::::::::: challenge
+
+### Challenge 4
+
+1. Run the same kind of linear model with smoking status as
+   a covariate instead of age, and make a volcano plot. *Note:
+   smoking status is stored as* `methylation$smoker`.
+2. We saw in the example in the lesson that this information sharing
+   can lead to larger p-values. Why might this be preferable?
+
+::::::::::::::: solution
+
+### Solution
+
+1. The following code runs the same type of model with smoking
+   status:
+
+   ```{r limmavolc2, fig.cap="A plot of significance against effect size for a regression of smoking against methylation.", fig.alt="A plot of -log10(p) against effect size estimates for a regression of smoking status against methylation using limma."}
+   design_smoke <- model.matrix(~methylation$smoker)
+   fit_smoke <- lmFit(methyl_mat, design = design_smoke)
+   fit_smoke <- eBayes(fit_smoke)
+   toptab_smoke <- topTable(fit_smoke, coef = 2, number = nrow(fit_smoke))
+   plot(toptab_smoke$logFC, -log10(toptab_smoke$P.Value),
+     xlab = "Effect size", ylab = bquote(-log[10](p)),
+     pch = 19
+   )
+   ```
+
+2. Being a bit more conservative when identifying features can help
+   to avoid false discoveries. Furthermore, when rejection of the null
+   hypothesis is driven mainly by a small standard error resulting
+   from abnormally low levels of variability for a given feature,
+   we might want to be a bit more conservative in our expectations.
+
+
+
+:::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+```{r limma-app-ex, echo=FALSE, eval=FALSE}
+> ## Exercise
+>
+> Launch `shinystats::limmaApp` and adjust the parameters.
+>
+> Discuss the output in groups. Consider the following questions:
+>
+> 1. How does the number of features affect the relationship between these two
+> similar methods?
+> 2. What about the number of samples?
+> 3. When ranking genes, why would we want to downrank the most significant and
+> uprank some with more moderate changes?
+>
+> > ## Solution
+> >
+> > 1. With more features, the amount of shrinkage increases.
+> > 2. 
With more samples, the shrinkage is weaker and the difference between the
+> > methods is smaller.
+> > 3. Because the p-value relies on the effect size estimate *and* its standard
+> > error, a very small standard error by chance (with few replicates) can
+> > lead to a very small p-value. "Moderating" or shrinking the standard errors
+> > brings these more in line with features that have a similar effect size
+> > but larger standard error.
+> {: .solution}
+{: .challenge}
+```
+
+::::::::::::::::::::::::::::::::::::::::: callout
+
+### Shrinkage
+
+Shrinkage is an intuitive term for an effect of information sharing,
+and is something observed in a broad range of statistical models.
+Often, shrinkage is induced by a *multilevel* modelling approach or by
+*Bayesian* methods.
+
+The general idea is that these models take the structure of the
+data into account when fitting the parameters. We
+can share information between features because of our knowledge about
+the data structure; this generally requires careful consideration of
+how the data were generated and the relationships within them.
+
+An example people often use is estimating the effect of attendance on
+grades in several schools. We can assume that this effect is similar
+in different schools (but maybe not identical), so we can *share
+information* about the effect size between schools and shrink our
+estimates towards a common value.
+
+For example, in **`DESeq2`**, the authors used the observation that genes
+with similar expression counts in RNAseq data have similar
+*dispersion*, and a better estimate of these dispersion parameters
+makes estimates of fold changes much more stable. Similarly, in
+**`limma`** the authors made the assumption that in the absence of
+biological effects, we can often expect the technical variation in the
+measurement of the expression of each of the genes to be broadly
+similar. Again, better estimates of variability allow us to prioritise
+genes in a more reliable way.
+
+There are many good resources to learn about this type of approach,
+including:
+
+- [a blog post by TJ
+  Mahr](https://www.tjmahr.com/plotting-partial-pooling-in-mixed-effects-models/)
+- [a book by David Robinson](https://gumroad.com/l/empirical-bayes)
+- [a (relatively technical) book by Gelman and
+  Hill](https://www.stat.columbia.edu/~gelman/arm/)
+
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+```{r, eval=FALSE, echo=FALSE}
+# todo: callout box explaining DESeq2
+```
+
+## The problem of multiple tests
+
+With such a large number of features, it would be useful to decide which
+features are "interesting" or "significant" for further study. However,
+if we were to apply a normal significance threshold of 0.05, it is likely
+that we would end up with a lot of false positives. This is because a p-value
+threshold like this represents a $\frac{1}{20}$ chance that we observe
+results as extreme or more extreme under the null hypothesis (that there
+is no association between age and methylation level). If we carry out many more
+than 20 such tests, we can expect to see situations where, despite the null
+hypothesis being true, we observe significant p-values due to random chance.
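+
+As a rough numerical illustration (a minimal sketch, assuming 5,000
+independent tests, as in our methylation data), we can simulate p-values under
+the null hypothesis, where they are uniformly distributed, and count how many
+fall below 0.05 by chance alone:
+
+```{r null-pvals-sim}
+set.seed(66)
+p_null <- runif(5000) # simulated p-values for 5,000 tests with a true null
+sum(p_null < 0.05) # "significant" results expected by chance: roughly 250
+```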
To
+demonstrate this, it is useful to see what happens if we permute (scramble) the age values and
+run the same test again:
+
+```{r volcplotfake, fig.cap="Plotting p-values against effect sizes for a randomised outcome shows we still observe 'significant' results.", fig.alt="Plot of -log10(p) against effect size estimates for a regression of a made-up feature against methylation level for each feature in the data. A dashed line represents a 0.05 significance level."}
+
+age_perm <- age[sample(ncol(methyl_mat), ncol(methyl_mat))]
+design_age_perm <- model.matrix(~age_perm)
+
+fit_age_perm <- lmFit(methyl_mat, design = design_age_perm)
+fit_age_perm <- eBayes(fit_age_perm)
+toptab_age_perm <- topTable(fit_age_perm, coef = 2, number = nrow(fit_age_perm))
+
+plot(toptab_age_perm$logFC, -log10(toptab_age_perm$P.Value),
+  xlab = "Effect size", ylab = bquote(-log[10](p)),
+  pch = 19
+)
+abline(h = -log10(0.05), lty = "dashed", col = "red")
+```
+
+Since we have generated a random sequence of ages, we have no reason to
+suspect that there is a true association between methylation levels and
+this sequence of random numbers. However, you can see that the p-value
+for many features is still lower than a traditional significance level
+of $p=0.05$. In fact, here `r sum(toptab_age_perm$P.Value < 0.05)`
+features are significant at p \< 0.05. If we were to use this fixed
+threshold in a real experiment, it is likely that we would identify many
+features as associated with age, when the results we are observing are
+simply due to chance.
+
+::::::::::::::::::::::::::::::::::::::: challenge
+
+### Challenge 5
+
+1. If we run `r nrow(methylation)` tests under the null hypothesis,
+   how many of them (on average) will be statistically significant at
+   a threshold of $p < 0.05$?
+2. Why would we want to be conservative in labelling features as
+   significantly different? By conservative, we mean to err towards
+   labelling true differences as "not significant" rather than vice
+   versa.
+3. How could we account for a varying number of tests to ensure
+   "significant" changes are truly different?
+
+::::::::::::::: solution
+
+### Solution
+
+1. On average, we expect
+   $`r nrow(methylation)` \times 0.05 = `r nrow(methylation) * 0.05`$
+   features to be statistically significant under the null
+   hypothesis, because p-values are uniformly
+   distributed under the null hypothesis.
+2. Features that we label as "significantly different" will often
+   be reported in manuscripts. We may also spend time and money
+   investigating them further, computationally or in the lab.
+   Therefore, spurious results have a real cost for ourselves and
+   for others.
+3. One approach to controlling for the number of tests is to divide
+   our significance threshold by the number of tests performed.
+   This is termed "Bonferroni correction" and we'll discuss it
+   further now.
+
+
+
+:::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+## Adjusting for multiple tests
+
+When performing many statistical tests to categorise features, we are
+effectively classifying features as "non-significant" or "significant", the
+latter meaning those for which we reject the null hypothesis. We also
+generally hope that there is a subset of features for which the null
+hypothesis is truly false, as well as many for which the null truly does
+hold. We hope that for all features for which the null hypothesis is
+true, we accept it, and for all features for which the null hypothesis
+is not true, we reject it. 
As we showed in the example with permuted
+age, with a large number of tests it is inevitable that we will get some of
+these wrong.
+
+We can think of these features as being "truly different" or "not truly
+different"[^1]. Using this idea, we can see that each categorisation we
+make falls into four categories:
+
+[^1]: "True difference" is a hard category to rigidly define. As we've
+seen, with a lot of data, we can detect tiny differences, and with
+little data, we can't detect large differences. However, both can be
+argued to be "true".
+
+|                     | Label as different | Label as not different |
+| ------------------: | -----------------: | ---------------------: |
+| Truly different     |      True positive |         False negative |
+| Truly not different |     False positive |          True negative |
+
+If the null hypothesis was true for every feature, then as we perform
+more and more tests we'd tend to correctly categorise most results as
+negative. However, since p-values are uniformly distributed under the
+null, at a significance level of 5%, 5% of all results will be
+"significant" simply by chance, even when the null hypothesis is
+true. These would fall under
+the label "false positives" in the table above, and are also termed
+"false discoveries."
+
+There are two common ways of controlling these false discoveries. The
+first is to say that, when we're doing $n$ tests, we want to have the
+same certainty of making one false discovery with $n$ tests as we have
+if we're only doing one test. This is "Bonferroni" correction,[^2] which
+divides the significance level by the number of tests performed, $n$.
+Equivalently, we can keep the original significance threshold but
+multiply our p-values by the number of tests. This is often very
+conservative, especially with a lot of features!
+
+[^2]: Bonferroni correction is also termed "family-wise" error rate
+control.
+
+```{r p-fwer, fig.cap="Bonferroni correction often produces very large p-values, especially with low sample sizes.", fig.alt="Plot of Bonferroni-adjusted p-values (y) against unadjusted p-values (x). A dashed black line represents the identity (where x=y), while dashed red lines represent 0.05 significance thresholds."}
+p_raw <- toptab_age$P.Value
+p_fwer <- p.adjust(p_raw, method = "bonferroni")
+library("ggplot2")
+ggplot() +
+  aes(p_raw, p_fwer) +
+  geom_point() +
+  scale_x_log10() + scale_y_log10() +
+  geom_abline(slope = 1, linetype = "dashed") +
+  geom_hline(yintercept = 0.05, linetype = "dashed", col = "red") +
+  geom_vline(xintercept = 0.05, linetype = "dashed", col = "red") +
+  labs(x = "Raw p-value", y = "Bonferroni p-value")
+```
+
+You can see that the p-values are exactly one for the vast majority of
+tests we performed! This is not ideal: with the modest sample sizes that
+are common in health sciences, such a conservative correction can leave
+us with little power to detect real effects.
+
+The second main way of controlling for multiple tests is to control the
+*false discovery rate*.[^3] This is the proportion of false positives,
+or false discoveries, we'd expect to get each time if we repeated the
+experiment over and over. The procedure works as follows:
+
+[^3]: This is often called "Benjamini-Hochberg" adjustment.
+
+1. Sort the p-values from smallest to largest.
+2. Assign each a rank (1 is the smallest).
+3. Calculate the critical value $$
+   q = \left(\frac{i}{m}\right)Q
+   $$, where $i$ is rank, $m$ is the number of tests, and $Q$ is the
+   false discovery rate we want to target.[^4]
+4. 
Find the largest p-value less than the critical value. All smaller
+   than this are significant.
+
+[^4]: People often perform extra controls on FDR-adjusted p-values,
+ensuring that ranks don't change and the critical value is never
+smaller than the original p-value.
+
+| FWER                                                    | FDR                                               |
+| :------------------------------------------------------ | :------------------------------------------------ |
+| \+ Controls probability of identifying a false positive | \+ Controls rate of false discoveries             |
+| \+ Strict error rate control                            | \+ Allows error control with less stringency      |
+| \- Very conservative                                    | \- Does not control probability of making errors  |
+| \- Requires larger statistical power                    | \- May result in false discoveries                |
+
+::::::::::::::::::::::::::::::::::::::: challenge
+
+### Challenge 6
+
+1. At a significance level of 0.05, with 100 tests performed, what is
+   the Bonferroni significance threshold?
+2. In a gene expression experiment, after FDR correction we observe
+   500 significant genes. What proportion of these genes are truly
+   different?
+3. Try running FDR correction on the `p_raw` vector. *Hint: check
+   `help("p.adjust")` to see what the method is called*.
+   Compare these values to the raw p-values and the Bonferroni
+   p-values.
+
+::::::::::::::: solution
+
+### Solution
+
+1. The Bonferroni-corrected significance threshold is $$
+   \frac{0.05}{100} = 0.0005
+   $$
+
+2. Trick question! We can't say what proportion of these genes are
+   truly different. However, if we repeated this experiment and
+   statistical test over and over, on average 5% of the results
+   from each run would be false discoveries.
+
+3. The following code runs FDR correction and compares it to
+   non-corrected values and to Bonferroni:
+
+   ```{r p-fdr, fig.cap="Benjamini-Hochberg correction is less conservative than Bonferroni", fig.alt="Plot of Benjamini-Hochberg-adjusted p-values (y) against unadjusted p-values (x). A dashed black line represents the identity (where x=y), while dashed red lines represent 0.05 significance thresholds."}
+   p_fdr <- p.adjust(p_raw, method = "BH")
+   ggplot() +
+     aes(p_raw, p_fdr) +
+     geom_point() +
+     scale_x_log10() + scale_y_log10() +
+     geom_abline(slope = 1, linetype = "dashed") +
+     geom_hline(yintercept = 0.05, linetype = "dashed", color = "red") +
+     geom_vline(xintercept = 0.05, linetype = "dashed", color = "red") +
+     labs(x = "Raw p-value", y = "Benjamini-Hochberg p-value")
+   ```
+
+   ```{r plot-fdr-fwer, fig.alt="Plot of Benjamini-Hochberg-adjusted p-values (y) against Bonferroni-adjusted p-values (x). A dashed black line represents the identity (where x=y), while dashed red lines represent 0.05 significance thresholds."}
+   ggplot() +
+     aes(p_fdr, p_fwer) +
+     geom_point() +
+     scale_x_log10() + scale_y_log10() +
+     geom_abline(slope = 1, linetype = "dashed") +
+     geom_hline(yintercept = 0.05, linetype = "dashed", color = "red") +
+     geom_vline(xintercept = 0.05, linetype = "dashed", color = "red") +
+     labs(x = "Benjamini-Hochberg p-value", y = "Bonferroni p-value")
+   ```
+
+:::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::::::: callout
+
+### Feature selection
+
+In this episode, we have focussed on regression in a setting where there are more
+features than observations. This approach is relevant if we are interested in the
+association of each feature with some outcome or if we want to screen for features
+that have a strong association with an outcome. 
If, however, we are interested in
+predicting an outcome or if we want to know which features explain the variation
+in the outcome, we may want to restrict ourselves to a subset of relevant features.
+One way of doing this is called *regularisation*, and this is the topic of the next episode.
+An alternative is called *feature selection*. This is covered in the subsequent (optional) episode.
+
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+### Further reading
+
+- [**`limma`** tutorial by Kasper D.
+  Hansen](https://kasperdanielhansen.github.io/genbioconductor/html/limma.html)
+- [**`limma`** user
+  manual](https://www.bioconductor.org/packages/release/bioc/vignettes/limma/inst/doc/usersguide.pdf).
+- [The **`VariancePartition`** package](https://bioconductor.org/packages/release/bioc/vignettes/variancePartition/inst/doc/dream.html) has similar functionality to **`limma`** but allows the inclusion of random effects.
+
+### Footnotes
+
+
+
+:::::::::::::::::::::::::::::::::::::::: keypoints
+
+- Performing linear regression in a high-dimensional setting requires us to perform hypothesis testing in a way that low-dimensional regression may not.
+- Sharing information between features can increase power and reduce false positives.
+- When running a lot of null hypothesis tests for high-dimensional data, multiple testing correction allows us to retain power and avoid making costly false discoveries.
+- Multiple testing methods can be more conservative or more liberal, depending on our goals.
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+
diff --git a/episodes/03-regression-regularisation.Rmd b/episodes/03-regression-regularisation.Rmd
new file mode 100644
index 00000000..d925c368
--- /dev/null
+++ b/episodes/03-regression-regularisation.Rmd
@@ -0,0 +1,1157 @@
+---
+title: Regularised regression
+source: Rmd
+teaching: 60
+exercises: 20
+math: yes
+---
+
+::::::::::::::::::::::::::::::::::::::: objectives
+
+- Understand the benefits of regularised models.
+- Understand how different types of regularisation work.
+- Apply and critically analyse regularised regression models.
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+:::::::::::::::::::::::::::::::::::::::: questions
+
+- What is regularisation?
+- How does regularisation work?
+- How can we select the level of regularisation for a model?
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+```{r settings, include=FALSE}
+library("here")
+library("ComplexHeatmap")
+source(here("bin/chunk-options.R"))
+```
+
+## Introduction
+
+This episode is about **regularisation**, also called **regularised regression**
+or **penalised regression**. This approach can be used for prediction and for
+feature selection, and it is particularly useful when dealing with high-dimensional data.
+
+One reason that we need special statistical tools for high-dimensional data is
+that standard linear models cannot handle high-dimensional data sets -- one cannot fit
+a linear model where there are more features (predictor variables) than there are observations
+(data points). In the previous episode we dealt with this problem by fitting individual
+models for each feature and sharing information among these models. Now we will
+take a look at an alternative approach called regularisation. Regularisation can be used to
+stabilise coefficient estimates (and thus to fit models with more features than observations)
+and even to select a subset of relevant features.
+
+First, let us check out what happens if we try to fit a linear model to high-dimensional
+data! We start by reading in the data from the last lesson:
+
+```{r loadmethy}
+library("here")
+library("minfi")
+methylation <- readRDS(here("data/methylation.rds"))
+
+## here, we transpose the matrix to have samples as rows and features as columns
+methyl_mat <- t(assay(methylation))
+age <- methylation$Age
+```
+
+Then, we try to fit a model with outcome age and all 5,000 features in this
+dataset as predictors (average methylation levels, M-values, across different
+sites in the genome).
+
+```{r fitall, R.options=list(max.print = 20)}
+# by using methyl_mat in the formula below, R will run a multivariate regression
+# model in which each of the columns in methyl_mat is used as a predictor.
+fit <- lm(age ~ methyl_mat)
+summary(fit)
+```
+
+You can see that we're able to get some effect size estimates, but they seem very
+high! The summary also says that we were unable to estimate
+effect sizes for `r format(sum(is.na(coef(fit))), big.mark=",")` features
+because of "singularities". What this means is that R could not perform the
+necessary calculations because we have more features than observations.
+
+::::::::::::::::::::::::::::::::::::::::: callout
+
+### Singularities
+
+The message that `lm()` produced is not necessarily the most intuitive. What
+are "singularities", and why are they an issue? A singular matrix
+is one that cannot be
+[inverted](https://en.wikipedia.org/wiki/Invertible_matrix).
+The inverse of an $n \times n$ square matrix $A$ is the matrix $B$ for which
+$AB = BA = I_n$, where $I_n$ is the $n \times n$ identity matrix.
+
+Why is the inverse important? Well, to find the
+coefficients of a linear model of a matrix of predictor features $X$ and an
+outcome vector $y$, we may perform the calculation
+
+$$
+(X^TX)^{-1}X^Ty
+$$
+
+You can see that, if we're unable to find the inverse of the matrix $X^TX$,
+then we'll be unable to find the regression coefficients.
+
+Why might this be the case?
+Well, when the [determinant](https://en.wikipedia.org/wiki/Determinant)
+of the matrix is zero, we are unable to find its inverse.
+
+```{r determinant}
+xtx <- t(methyl_mat) %*% methyl_mat
+det(xtx)
+```
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::::::: callout
+
+### Correlated features -- common in high-dimensional data
+
+So, we can't fit a standard linear model to high-dimensional data. But there
+is another issue. In high-dimensional datasets, there
+are often multiple features that contain redundant information (correlated features).
+
+We have seen in the first episode that correlated features can make it hard
+(or impossible) to correctly infer parameters. If we visualise the level of
+correlation between sites in the methylation dataset, we can see that many
+of the features essentially represent the same information - there are many
+off-diagonal cells, which are deep red or blue. For example, the following
+heatmap visualises the correlations for the first 500 features in the
+`methylation` dataset (we selected 500 features only as it can be hard to
+visualise patterns when there are too many features!).
+
+```{r corr-mat-meth, fig.cap="Heatmap of pairwise correlations between the first 500 features in the methylation dataset.", fig.alt="A heatmap of feature-feature correlations for the first 500 methylation sites. Many off-diagonal cells are deep red or blue, indicating pairs of features that are strongly positively or negatively correlated."}
+library("ComplexHeatmap")
+
+small <- methyl_mat[, 1:500]
+cor_mat <- cor(small)
+Heatmap(cor_mat,
+  column_title = "Feature-feature correlation in methylation data",
+  name = "Pearson correlation",
+  show_row_dend = FALSE, show_column_dend = FALSE,
+  show_row_names = FALSE, show_column_names = FALSE
+)
+```
+
+Correlation between features can be problematic for technical reasons. If it is
+very severe, it may even make it impossible to fit a model! This is in addition to
+the fact that with more features than observations, we can't even estimate
+the model properly. Regularisation can help us to deal with correlated features.
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::::: challenge
+
+### Challenge 1
+
+Discuss in groups:
+
+1. Why would we observe correlated features in high-dimensional biological
+   data?
+2. Why might correlated features be a problem when fitting linear models?
+3. What issue might correlated features present when selecting features to include in a model one at a time?
+
+::::::::::::::: solution
+
+### Solution
+
+1. Many of the features in biological data represent very similar
+   information biologically. For example, sets of genes that form complexes
+   are often expressed in very similar quantities. Similarly, methylation
+   levels at nearby sites are often very highly correlated.
+2. Correlated features can make inference unstable or even impossible
+   mathematically.
+3. When we are selecting features one at a time we want to pick the most predictive feature each time.
+   When a lot of features are very similar but encode
+   slightly different information, which of the correlated features we
+   select to include can have a huge impact on the later stages of model selection!
+
+:::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+## Coefficient estimates of a linear model
+
+When we fit a linear model, we're finding the line through our data that
+minimises the sum of the squared residuals. We can think of that as finding
+the slope and intercept that minimise the square of the length of the dashed
+lines. In this case, the red line in the left panel is the line that
+accomplishes this objective, and the red dot in the right panel is the point
+that represents this line in terms of its slope and intercept among many
+different possible models, where the background colour represents how well
+different combinations of slope and intercept accomplish this objective.
+
+```{r regplot, echo=FALSE, fig.cap="Illustrative example demonstrating how regression coefficients are inferred under a linear model framework.", fig.alt="For each observation, the left panel shows the residuals with respect to the optimal line (the one that minimises the sum of square errors). These are calculated as the difference between the value predicted by the line and the observed outcome. 
Right panel shows the sum of squared residuals across all possible linear regression models (as defined by different values of the regression coefficients).", fig.width=10}
+library("viridis")
+set.seed(42)
+noise_sd <- 1
+nobs <- 50
+slope <- 2
+intercept <- 2
+
+maxlim <- max(abs(slope), abs(intercept)) * 2
+maxlim <- max(maxlim, 5)
+lims <- c(-maxlim, maxlim)
+
+l2 <- 2
+x <- rnorm(nobs, mean = 0, sd = 1)
+noise <- rnorm(nobs, mean = 0, sd = noise_sd)
+y <- (slope * x) + (intercept) + noise
+
+n <- 200
+s <- seq(-maxlim, maxlim, length.out = n)
+loglik <- function(slope, intercept, x, y, noise_sd) {
+  sum(dnorm(y, mean = (slope * x) + intercept, sd = noise_sd, log = TRUE))
+}
+
+ll <- matrix(ncol = n, nrow = n)
+coef <- mask <- norm_mat <- matrix(ncol = n, nrow = n)
+for (i in seq_along(s)) {
+  coef[, ] <- s
+}
+for (i in seq_along(s)) {
+  for (j in seq_along(s)) {
+    norm_mat[i, j] <- sqrt(s[[i]]^2 + s[[j]]^2)
+    ll[i, j] <- loglik(s[i], s[j], x, y, noise_sd)
+  }
+}
+mask <- norm_mat <= l2
+ind_mle <- arrayInd(which.max(ll), dim(ll))
+pll <- ll * as.numeric(mask)
+pll[pll == 0] <- NA
+ind_ple <- arrayInd(which.max(pll), dim(pll))
+
+par(mfrow = 1:2)
+plot(x, y, pch = 19)
+abline(
+  a = coef[ind_mle[[2]]],
+  b = coef[ind_mle[[1]]],
+  col = "firebrick"
+)
+yhat <- x
+for (i in seq_along(x)) {
+  yhat[[i]] <- (x[[i]] * coef[ind_mle[[1]]]) + coef[ind_mle[[2]]]
+  lines(
+    x = rep(x[[i]], each = 2), y = c(yhat[[i]], y[[i]]),
+    lty = "dashed"
+  )
+}
+
+image(s, s, ll,
+  xlab = "slope", ylab = "intercept",
+  col = viridis(40, option = "A", direction = 1),
+  xlim = lims, ylim = lims
+)
+abline(v = 0, lty = "dashed")
+abline(h = 0, lty = "dashed")
+points(
+  coef[ind_mle[[1]]], coef[ind_mle[[2]]],
+  pch = 19, cex = 2, col = "firebrick"
+)
+```
+
+Mathematically, we can write the sum of squared residuals as
+
+$$
+\sum_{i=1}^N ( y_i-x'_i\beta)^2
+$$
+
+where $\beta$ is a vector of (unknown) covariate effects which we want to learn
+by fitting a regression model: the $j$-th element of $\beta$, which we denote as
+$\beta_j$, quantifies the effect of the $j$-th covariate. For each individual
+$i$, $x_i$ is a vector of $p$ covariate values and $y_i$ is the true observed value for
+the outcome. The notation $x'_i\beta$ indicates matrix multiplication. In this case, the
+result is equivalent to multiplying each element of $x_i$ by its corresponding element in
+$\beta$ and then calculating the sum across all of those values. The result of this
+product (often denoted by $\hat{y}_i$) is the predicted value of the outcome generated
+by the model. As such, $y_i-x'_i\beta$ can be interpreted as the prediction error, also
+referred to as the model residual. To quantify the total error across all individuals, we sum
+the squared residuals $(y_i-x'_i\beta)^2$ across all the individuals in our data.
+
+Finding the value of $\beta$ that minimises
+the sum above gives us the line of best fit through our data when considering
+this goal of minimising the sum of squared error. However, it is not the only
+possible line we could use! For example, we might want to err on the side of
+caution when estimating effect sizes (coefficients). That is, we might want to
+avoid estimating very large effect sizes. This can help us to create *generalisable*
+models. This is important when models are fitted (trained) on one dataset
+and then used to predict outcomes from a new dataset. Restricting parameter
+estimates is particularly important when analysing high-dimensional data. 
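+
+Before we look at how to restrict coefficients, we can make the least squares
+objective above concrete by checking that the sum of squared residuals reported
+by `lm()` matches a direct computation from the model matrix and the estimated
+coefficients. A minimal sketch using R's built-in `cars` data (the object names
+here are illustrative):
+
+```{r rss-by-hand}
+data(cars)
+fit_cars <- lm(dist ~ speed, data = cars)
+X <- model.matrix(fit_cars) # one row per observation: intercept and speed
+beta_hat <- coef(fit_cars) # estimated coefficients
+rss_manual <- sum((cars$dist - X %*% beta_hat)^2) # sum of squared residuals
+all.equal(rss_manual, sum(residuals(fit_cars)^2))
+```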
+
+::::::::::::::::::::::::::::::::::::::: challenge
+
+### Challenge 2
+
+Discuss in groups:
+
+1. What are we minimising when we fit a linear model?
+2. Why are we minimising this objective? What assumptions are we making
+   about our data when we do so?
+
+::::::::::::::: solution
+
+### Solution
+
+1. When we fit a linear model we are minimising the squared error.
+   In fact, the standard linear model estimator is often known as
+   "ordinary least squares". The "ordinary" really means "original" here,
+   to distinguish between this method, which dates back to ~1800, and some
+   more "recent" (think 1940s...) methods.
+2. Least squares assumes that, when we account for the change in the mean of
+   the outcome based on changes in the covariates, the data are normally
+   distributed. That is, the *residuals* of the model, or the error
+   left over after we account for any linear relationships in the data,
+   are normally distributed, and have a fixed variance.
+
+:::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+## Model selection using training and test sets
+
+Sets of models are often compared using statistics such as adjusted $R^2$, AIC or BIC.
+These show us how well the model is learning the data used in fitting that same model [^1].
+However, these statistics do not really tell us how well the model will generalise to new data.
+This is an important thing to consider -- if our model doesn't generalise to new data,
+then there's a chance that it's just picking up on a technical or batch effect
+in our data, or simply some noise that happens to fit the outcome we're modelling.
+This is especially important when our goal is prediction -- it's not much good
+if we can only predict well for samples where the outcome is already known,
+after all!
+
+To get an idea of how well our model generalises, we can split the data into
+two - a "training" and a "test" set. We use the "training" data to fit the model,
+and then assess its performance on the "test" data.
+
+```{r validation, echo=FALSE, out.width="500px", fig.cap="Schematic representation of how a dataset can be divided into a training and a test set.", fig.alt="Schematic representation of how a dataset can be divided into a training (the portion of the data used to fit a model) and a test set (the portion of the data used to assess external generalisability)."}
+knitr::include_graphics("fig/validation.png")
+```
+
+One thing that often happens in this context is that large
+coefficient values minimise the training error, but they don't minimise the
+test error on unseen data. First, we'll go through an example of what exactly
+this means.
+
+For the next few challenges, we'll work with a set of features
+known to be associated with age from a paper by Horvath et al.[^2]. Horvath et al.
+use methylation markers alone to predict the biological age of an individual.
+This is useful in studying age-related disease, amongst many other things.
+
+```{r coefhorvath}
+coef_horvath <- readRDS(here::here("data/coefHorvath.rds"))
+methylation <- readRDS(here::here("data/methylation.rds"))
+
+library("SummarizedExperiment")
+age <- methylation$Age
+methyl_mat <- t(assay(methylation))
+
+coef_horvath <- coef_horvath[1:20, ]
+features <- coef_horvath$CpGmarker
+horvath_mat <- methyl_mat[, features]
+
+## Generate an index to split the data
+set.seed(42)
+train_ind <- sample(nrow(methyl_mat), 25)
+```
+
+::::::::::::::::::::::::::::::::::::::: challenge
+
+### Challenge 3
+
+1. 
Split the methylation data matrix and the age vector
+   into training and test sets.
+2. Fit a model on the training data matrix and training age
+   vector.
+3. Check the mean squared error on this model.
+
+::::::::::::::: solution
+
+### Solution
+
+1. Splitting the data involves using our index to split up the matrix and
+   the age vector into two each. We can use a negative subscript to create
+   the test data.
+
+   ```{r testsplit}
+   train_mat <- horvath_mat[train_ind, ]
+   train_age <- age[train_ind]
+   test_mat <- horvath_mat[-train_ind, ]
+   test_age <- age[-train_ind]
+   ```
+
+The solution to this exercise is important because the generated objects
+(`train_mat`, `train_age`, `test_mat` and `test_age`) will be used later in
+this episode. Please make sure that you use the same object names.
+
+2. To fit a model on the training data:
+
+   ```{r trainfit}
+   # as.data.frame() converts train_mat into a data.frame
+   fit_horvath <- lm(train_age ~ ., data = as.data.frame(train_mat))
+   ```
+
+Using the `.` syntax above together with a `data` argument will lead to
+the same result as using `train_age ~ train_mat`: R will fit a multivariate
+regression model in which each of the columns in `train_mat` is used as
+a predictor. We opted to use the `.` syntax because it will help us to
+obtain model predictions using the `predict()` function.
+
+3. The mean squared error of the model is the mean of the square of the
+   residuals. This seems very low here -- on average we're only off by
+   about a year!
+
+   ```{r testerror}
+   mean(residuals(fit_horvath)^2)
+   ```
+
+:::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+Having trained this model, now we can check how well it does in predicting age
+in a new dataset (the test data).
+Here we use the mean of the squared difference between our predictions and the
+true ages for the test data, or "mean squared error" (MSE). Unfortunately, it
+seems like this is a lot higher than the error on the training data!
+
+```{r}
+mse <- function(true, prediction) {
+  mean((true - prediction)^2)
+}
+pred_lm <- predict(fit_horvath, newdata = as.data.frame(test_mat))
+err_lm <- mse(test_age, pred_lm)
+err_lm
+```
+
+Further, if we plot true age against predicted age for the samples in the test
+set, we can see how well we're really doing - ideally these would line up
+exactly!
+
+```{r test-plot-lm, fig.cap="A scatter plot of observed age versus predicted age for individuals in the test set. Each dot represents one individual. Dashed line is used as a reference to indicate how perfect predictions would look (observed = predicted).", fig.alt="A scatter plot of observed age versus predicted age for individuals in the test set. Each dot represents one individual. Dashed line is used as a reference to indicate how perfect predictions would look (observed = predicted). In this case we observe high prediction error in the test set."}
+par(mfrow = c(1, 1))
+plot(test_age, pred_lm, pch = 19)
+abline(coef = 0:1, lty = "dashed")
+```
+
+This figure shows the predicted ages obtained from a linear model fit plotted
+against the true ages, which we kept in the test dataset. If the predictions were
+good, the dots would follow the dashed line. Regularisation can help us to make the
+model more generalisable, improving predictions for the test dataset (or any
+other dataset that is not used when fitting our model).
+
+## Using regularisation to improve generalisability
+
+As stated above, restricting model parameter estimates can improve a model's
+generalisability. This can be done with *regularisation*. 
The idea is to add another
+condition to the problem we're solving with linear regression. This condition
+controls the total size of the coefficients that come out.
+For example, we might say that the point representing the slope and intercept
+must fall within a certain distance of the origin, $(0, 0)$. Note that we are
+still trying to solve for the line that minimises the square of the residuals;
+we are just adding this extra constraint to our solution.
+
+For the 2-parameter model (slope and intercept), we could
+visualise this constraint as a circle with a given radius. We
+want to find the "best" solution (in terms of minimising the
+residuals) that also falls within a circle of a given radius
+(in this case, `r l2`).
+
+```{r ridgeplot, echo=FALSE, fig.cap="Illustrative example demonstrating how regression coefficients are inferred under a linear model framework, with (blue line) and without (red line) regularisation. A ridge penalty is used in this example.", fig.alt="For each observation, the left panel shows the residuals with respect to the optimal lines obtained with and without regularisation. Right panel shows the sum of squared residuals across all possible linear regression models. Regularisation moves the line away from the optimal (in terms of minimising the sum of squared residuals).", fig.width=10}
+set.seed(42)
+par(mfrow = 1:2)
+plot(x, y, pch = 19)
+abline(
+  a = coef[ind_ple[[2]]],
+  b = coef[ind_ple[[1]]], col = "dodgerblue"
+)
+abline(
+  a = coef[ind_mle[[2]]],
+  b = coef[ind_mle[[1]]], col = "firebrick"
+)
+
+image(s, s, ll,
+  xlab = "slope", ylab = "intercept",
+  col = viridis(40, option = "A", direction = 1),
+  xlim = lims, ylim = lims
+)
+abline(v = 0, lty = "dashed")
+abline(h = 0, lty = "dashed")
+points(
+  coef[ind_mle[[1]]], coef[ind_mle[[2]]],
+  pch = 19, cex = 2, col = "firebrick"
+)
+contour(
+  s, s, norm_mat,
+  add = TRUE, levels = l2, drawlabels = FALSE
+)
+points(
+  coef[ind_ple[[1]]], coef[ind_ple[[2]]],
+  pch = 19, cex = 2, col = "dodgerblue"
+)
+par(mfrow = c(1, 1))
+```
+
+There are multiple ways to define the distance that our solution must fall in,
+though. The one we've plotted above constrains the sum of the squared
+coefficients in $\beta$.
+This corresponds to the $L^2$ norm, which is defined as
+
+$$
+\left\lVert \beta\right\lVert_2 = \sqrt{\sum_{j=1}^p \beta_j^2}
+$$
+
+To control this, we specify that the solution for the equation above
+also has to have an $L^2$ norm smaller than a certain amount. Or, equivalently,
+we try to minimise a function that includes our $L^2$ norm scaled by a
+factor that is usually written $\lambda$.
+
+$$
+\sum_{i=1}^N \biggl( y_i - x'_i\beta\biggr)^2 + \lambda \left\lVert \beta \right\lVert_2 ^2
+$$
+
+Another way of thinking about this is that when finding the best model, we're
+weighing up a balance of the ordinary least squares objective and a "penalty"
+term that punishes models with large coefficients. The balance between the
+penalty and the ordinary least squares objective is controlled by $\lambda$ -
+when $\lambda$ is large, we care a lot about the size of the coefficients.
+When it's small, we don't care much. When it's zero, we're back to
+just using ordinary least squares. This type of regularisation is called *ridge regression*.
+
+## Why would we want to restrict our model?
+
+It may seem an odd thing to do: to restrict the possible values of our model
+parameters! Why would we want to do this? 
+
+## Why would we want to restrict our model?
+
+It may seem an odd thing to do: to restrict the possible values of our model
+parameters! Why would we want to do this? Firstly, as discussed earlier, our
+model estimates can be very unstable or even difficult to calculate when we have
+many correlated features. Secondly, this type of approach can make our model more
+generalisable to new data. To show this, we'll fit a model using the same set
+of 20 features (stored as `features`) selected earlier in this episode (these
+are a subset of the features identified by Horvath et al.), using both
+regularised and ordinary least squares.
+
+```{r plot-ridge, fig.cap="Ridge regression coefficient estimates against the penalty, lambda.", fig.alt="Plot of the coefficient paths of a ridge regression model: as lambda increases, the coefficient estimates shrink towards zero."}
+library("glmnet")
+
+## glmnet() performs scaling by default, supply un-scaled data:
+horvath_mat <- methyl_mat[, features] # select the first 20 sites as before
+train_mat <- horvath_mat[train_ind, ] # use the same individuals as selected before
+test_mat <- horvath_mat[-train_ind, ]
+
+ridge_fit <- glmnet(x = train_mat, y = train_age, alpha = 0)
+plot(ridge_fit, xvar = "lambda")
+abline(h = 0, lty = "dashed")
+```
+
+This plot shows how the estimated coefficients for each CpG site change
+as we increase the penalty, $\lambda$. That is,
+as we decrease the size of the region that solutions can fall into, the values
+of the coefficients that we get back tend to decrease. In this case,
+coefficients trend towards zero but generally don't reach it until the penalty
+gets very large. We can see that initially, some parameter estimates are really,
+really large, and these tend to shrink fairly rapidly.
+
+We can also notice that some parameters "flip signs"; that is, they start off
+positive and become negative as lambda grows. This is a sign of collinearity,
+or correlated predictors. As we reduce the importance of one feature, we can
+"make up for" the loss in accuracy from that one feature by adding a bit of
+weight to another feature that represents similar information.
+
+Since we split the data into test and training data, we can show that ridge
+regression gives us better predictions than OLS in this case:
+
+```{r pred-ridge-lm}
+pred_ridge <- predict(ridge_fit, newx = test_mat)
+err_ridge <- apply(pred_ridge, 2, function(col) mse(test_age, col))
+min(err_ridge)
+err_lm
+
+which_min_err <- which.min(err_ridge)
+min_err_ridge <- min(err_ridge)
+pred_min_ridge <- pred_ridge[, which_min_err]
+```
+
+We can see where on the continuum of lambdas we've picked a model by plotting
+the coefficient paths again. In this case, we've picked a model with fairly
+modest shrinkage.
+
+```{r chooselambda, fig.cap="Ridge coefficient paths, with a dashed line marking the lambda value that minimises the test error.", fig.alt="Plot of the coefficient paths of a ridge regression model, with a vertical dashed line indicating the chosen value of lambda."}
+chosen_lambda <- ridge_fit$lambda[which.min(err_ridge)]
+plot(ridge_fit, xvar = "lambda")
+abline(v = log(chosen_lambda), lty = "dashed")
+```
+
+::::::::::::::::::::::::::::::::::::::: challenge
+
+### Challenge 4
+
+1. Which performs better, ridge or OLS?
+2. Plot predicted ages for each method against the true ages.
+   How do the predictions look for both methods? Why might ridge be
+   performing better?
+
+::::::::::::::: solution
+
+### Solution
+
+1. Ridge regression performs significantly better on unseen data, despite
+   being "worse" on the training data.
+    ```{r err-lm-ridge}
+    min_err_ridge
+    err_lm
+    ```
+2. The ridge predictions are much less spread out, with far fewer extreme
+   values, as the plot below shows.
+
+    ```{r plot-ridge-prediction, fig.width=10, fig.cap="Predicted ages from OLS (left) and ridge regression (right) against observed ages in the test set.", fig.alt="Two scatter plots of predicted age against observed age, one for OLS and one for ridge regression. The ridge predictions are less spread out and lie closer to the dashed observed = predicted line."}
+    all <- c(pred_lm, test_age, pred_min_ridge)
+    lims <- range(all)
+    par(mfrow = 1:2)
+    plot(test_age, pred_lm,
+      xlim = lims, ylim = lims,
+      pch = 19
+    )
+    abline(coef = 0:1, lty = "dashed")
+    plot(test_age, pred_min_ridge,
+      xlim = lims, ylim = lims,
+      pch = 19
+    )
+    abline(coef = 0:1, lty = "dashed")
+    ```
+
+:::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+## LASSO regression
+
+*LASSO* is another type of regularisation. In this case we use the $L^1$ norm,
+or the sum of the absolute values of the coefficients.
+
+$$
+\left\lVert \beta \right\rVert_1 = \sum_{j=1}^p |\beta_j|
+$$
+
+This tends to produce sparse models; that is to say, it tends to remove
+features from the model that aren't necessary to produce accurate predictions.
+This is because the region we're restricting the coefficients to has sharp
+edges. So, when we increase the penalty (reduce the norm), it's more likely
+that the best solution that falls in this region will be at one of the corners
+of this region (i.e., one or more coefficients will be exactly zero).
+
+```{r shrink-lasso, echo=FALSE, fig.cap="Illustrative example demonstrating how regression coefficients are inferred under a linear model framework, with (blue line) and without (red line) regularisation. A LASSO penalty is used in this example.", fig.alt="For each observation, the left panel shows the residuals with respect to the optimal lines obtained with and without regularisation. The right panel shows the sum of squared residuals across all possible linear regression models. Regularisation moves the line away from the optimal (in terms of minimising the sum of squared residuals).", fig.width=10}
+loglik <- function(slope, intercept, x, y, noise_sd) {
+  sum(dnorm(y, mean = (slope * x) + intercept, sd = noise_sd, log = TRUE))
+}
+drawplot <- function() {
+  set.seed(42)
+  noise_sd <- 1
+  nobs <- 200
+  slope <- 2
+  intercept <- 2
+  maxlim <- max(abs(slope), abs(intercept)) * 2
+  maxlim <- max(maxlim, 5)
+  lims <- c(-maxlim, maxlim)
+  l1 <- 2
+  x <- rnorm(nobs, mean = 0, sd = 1)
+  noise <- rnorm(nobs, mean = 0, sd = noise_sd)
+  y <- (slope * x) + (intercept) + noise
+  n <- 200
+
+  s <- seq(-maxlim, maxlim, length.out = n)
+
+  ll <- matrix(ncol = n, nrow = n)
+  coef <- mask <- norm_mat <- matrix(ncol = n, nrow = n)
+  for (i in seq_along(s)) {
+    coef[, ] <- s
+  }
+  for (i in seq_along(s)) {
+    for (j in seq_along(s)) {
+      norm_mat[i, j] <- abs(s[[i]]) + abs(s[[j]])
+      ll[i, j] <- loglik(s[[i]], s[[j]], x, y, noise_sd)
+    }
+  }
+  mask <- norm_mat <= l1
+
+  ind_mle <- arrayInd(which.max(ll), dim(ll))
+  pll <- ll * as.numeric(mask)
+  pll[pll == 0] <- NA
+  ind_ple <- arrayInd(which.max(pll), dim(pll))
+
+  par(mfrow = 1:2)
+
+  plot(x, y, pch = 19)
+  abline(
+    a = coef[ind_ple[[2]]],
+    b = coef[ind_ple[[1]]], col = "dodgerblue"
+  )
+  abline(
+    a = coef[ind_mle[[2]]],
+    b = coef[ind_mle[[1]]], col = "firebrick"
+  )
+  image(s, s, ll,
+    xlab = "slope", ylab = "intercept",
+    col = viridis(40, option = "A", direction = 1),
+    xlim = lims, ylim = lims
+  )
+  abline(v = 0, lty = "dashed")
+  abline(h = 0, lty = "dashed")
+  points(
+    coef[ind_mle[[1]]], coef[ind_mle[[2]]],
+    pch = 19, cex = 2, col = "firebrick"
+  )
+  contour(
+    s, s, norm_mat,
+    add = TRUE, levels = l1, drawlabels = FALSE
+  )
+  points(
+    coef[ind_ple[[1]]], coef[ind_ple[[2]]],
+    pch = 19, cex = 2, col = "dodgerblue"
+  )
+}
+drawplot()
+```
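+
+As a quick numeric aside, the two norms we have now met can be computed
+directly in R. The coefficient vector below is made up purely for illustration:
+
+```{r norms-sketch}
+beta_demo <- c(2, -1, 0.5) # a hypothetical coefficient vector
+sum(abs(beta_demo))        # L1 norm, penalised by LASSO
+sqrt(sum(beta_demo^2))     # L2 norm, penalised by ridge regression
+```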
+
+::::::::::::::::::::::::::::::::::::::: challenge
+
+### Challenge 5
+
+1. Use `glmnet` to fit a LASSO model (hint: set `alpha = 1`).
+2. Plot the model object. Remember that for ridge regression,
+   we set `xvar = "lambda"`. What if you don't set this? What's the
+   relationship between the two plots?
+3. How do the coefficient paths differ from the ridge case?
+
+::::::::::::::: solution
+
+### Solution
+
+1. Fitting a LASSO model is very similar to fitting a ridge model: we just
+   need to change the `alpha` setting.
+    ```{r fitlas}
+    fit_lasso <- glmnet(x = methyl_mat, y = age, alpha = 1)
+    ```
+2. When `xvar = "lambda"`, the x-axis represents increasing model sparsity
+   from left to right, because increasing lambda increases the penalty added
+   to the coefficients. When we instead plot the L1 norm on the x-axis,
+   increasing L1 norm means that we are allowing our
+   coefficients to take on increasingly large values.
+    ```{r plotlas, fig.width=10, echo=FALSE}
+    par(mfrow = c(2, 1))
+    plot(fit_lasso, xvar = "lambda")
+    plot(fit_lasso)
+    ```
+3. Under a LASSO model, the coefficient paths reach exactly zero much more
+   often as the penalty increases. In the ridge case, the paths tend towards
+   zero but less commonly reach exactly zero.
+
+:::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+## Cross-validation to find the best value of $\lambda$
+
+There are various methods to select the "best"
+value for $\lambda$. One is to split
+the data into $K$ chunks. We then use $K-1$ of
+these as the training set, and the remaining $1$ chunk
+as the test set. We can repeat this until we've rotated through all $K$ chunks,
+giving us a good estimate of how well each of the lambda values works in our
+data. This is called cross-validation, and doing this repeated test/train split
+gives us a better estimate of how generalisable our model is. Cross-validation
+is a really deep topic that we're not going to cover in more detail today, though!
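+
+To make the procedure concrete, here is a minimal sketch of how observations
+could be assigned to folds by hand. This is purely illustrative: `cv.glmnet()`,
+which we use below, handles fold assignment internally (and accepts a `foldid`
+argument if we want to control it ourselves).
+
+```{r folds-sketch}
+set.seed(42)
+K <- 5
+## randomly assign each observation to one of K roughly equal-sized folds
+fold_id <- sample(rep(1:K, length.out = nrow(methyl_mat)))
+table(fold_id)
+```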
+
+```{r cvfig, echo=FALSE, fig.cap="Schematic representation of a $K$-fold cross-validation procedure.", fig.alt="The data is divided into $K$ chunks. For each cross-validation iteration, one data chunk is used as the test set. The remaining $K-1$ chunks are combined into a training set."}
+knitr::include_graphics("fig/cross_validation.png")
+```
+
+We can use this new idea to choose a lambda value, by finding the lambda
+that minimises the error across each of the test and training splits.
+
+```{r lasso-cv, fig.cap="Cross-validated mean squared error for different values of lambda under a LASSO penalty.", fig.alt="Cross-validated mean squared error plotted against log lambda, with error bars, for a LASSO model."}
+lasso <- cv.glmnet(methyl_mat[, -1], age, alpha = 1)
+plot(lasso)
+coefl <- coef(lasso, lasso$lambda.min)
+selected_coefs <- as.matrix(coefl)[which(coefl != 0), 1]
+
+## load the horvath signature to compare features
+coef_horvath <- readRDS(here::here("data/coefHorvath.rds"))
+## We select some of the same features! Hooray
+intersect(names(selected_coefs), coef_horvath$CpGmarker)
+```
+
+```{r heatmap-lasso, echo=FALSE, fig.cap="Heatmap showing methylation values for the selected CpG sites and how they vary with age.", fig.alt="Overall, we observe either increasing or decreasing methylation patterns as a function of age."}
+hmat <- t(scale(methyl_mat[, names(selected_coefs)[-1]]))
+ord <- order(age)
+hmat_ord <- hmat[, ord]
+age_ord <- age[ord]
+col <- circlize::colorRamp2(seq(min(age), max(age), length.out = 100), viridis(100))
+Heatmap(
+  hmat_ord,
+  name = "Scaled\nmethylation level",
+  top_annotation = columnAnnotation(
+    age = age_ord,
+    col = list(age = col)
+  ),
+  show_column_names = FALSE,
+  column_title = "Sample",
+  row_title = "Feature",
+  row_names_side = "left",
+  row_title_side = "left",
+  column_title_side = "bottom",
+  show_row_dend = FALSE,
+  cluster_columns = FALSE
+)
+```
+
+## Blending ridge regression and the LASSO - elastic nets
+
+So far, we've used ridge regression, where `alpha = 0`, and LASSO regression,
+where `alpha = 1`. What if `alpha` is set to a value between zero and one?
+Well, this actually lets us blend the properties of ridge and LASSO
+regression. This allows us to have the nice properties of the LASSO, where
+uninformative variables are dropped automatically, and the nice properties
+of ridge regression, where our coefficient estimates are a bit more
+conservative, and as a result our predictions are a bit better.
+
+Formally, elastic net regression minimises the following objective function:
+
+$$
+\sum_{i=1}^N \left( y_i - x'_i\beta \right)^2
++ \lambda \left(
+\alpha \left\lVert \beta \right\rVert_1 +
+(1-\alpha) \left\lVert \beta \right\rVert_2^2
+\right)
+$$
+
+You can see that if `alpha = 1`, then the $L^1$ norm term is multiplied by one,
+and the $L^2$ norm term is multiplied by zero. This means we have pure LASSO
+regression. Conversely, if `alpha = 0`, the $L^2$ norm term is multiplied by
+one, and the $L^1$ norm is multiplied by zero, meaning we have pure ridge
+regression. Anything in between gives us something in between.
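+
+Numerically, the elastic net penalty is just a weighted mix of the two
+penalties we have already seen. The sketch below uses a hypothetical
+coefficient vector to show how `alpha` blends them (note that `glmnet`'s
+internal definition also scales the squared $L^2$ term by a half, but the
+blending idea is the same):
+
+```{r elastic-penalty-sketch}
+## illustrative elastic net penalty for a made-up coefficient vector
+elastic_penalty <- function(beta, lambda, alpha) {
+  lambda * (alpha * sum(abs(beta)) + (1 - alpha) * sum(beta^2))
+}
+beta_eg <- c(2, -1, 0.5)
+elastic_penalty(beta_eg, lambda = 1, alpha = 1)   # pure LASSO penalty
+elastic_penalty(beta_eg, lambda = 1, alpha = 0)   # pure ridge penalty
+elastic_penalty(beta_eg, lambda = 1, alpha = 0.5) # a 50/50 blend
+```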
+
+We can reuse the contour plots we looked at previously to visualise what this
+penalty looks like for different values of `alpha`.
+
+```{r elastic-contour, fig.width=16, echo=FALSE, fig.cap="For an elastic net, the panels show the effect of the regularisation across different values of alpha.", fig.alt="For lower values of alpha, the penalty resembles ridge regression. For higher values of alpha, the penalty resembles LASSO regression."}
+plot_elastic <- function(alpha) {
+  set.seed(42)
+  noise_sd <- 1
+  nobs <- 100
+  slope <- 1
+  intercept <- 1
+  norm <- 1
+
+  maxlim <- max(abs(slope), abs(intercept)) * 2
+  maxlim <- max(maxlim, 5)
+  lims <- c(-maxlim, maxlim)
+
+  x <- rnorm(nobs, mean = 0, sd = 1)
+  noise <- rnorm(nobs, mean = 0, sd = noise_sd)
+  y <- (slope * x) + (intercept) + noise
+
+  n <- 200
+  s <- seq(-maxlim, maxlim, length.out = n)
+
+  ll <- matrix(ncol = n, nrow = n)
+  coef <- norm_mat <- mask <- matrix(ncol = n, nrow = n)
+  for (i in seq_along(s)) {
+    coef[, ] <- s
+  }
+  for (i in seq_along(s)) {
+    for (j in seq_along(s)) {
+      norm_mat[i, j] <- (
+        (sqrt(s[[i]]^2 + s[[j]]^2) * ((1 - alpha))) +
+          ((abs(s[[i]]) + abs(s[[j]])) * (alpha))
+      )
+      ll[i, j] <- loglik(s[i], s[j], x, y, noise_sd)
+    }
+  }
+  mask <- norm_mat <= norm
+  ind_mle <- arrayInd(which.max(ll), dim(ll))
+  pll <- ll * as.numeric(mask)
+  pll[pll == 0] <- NA
+  ind_ple <- arrayInd(which.max(pll), dim(pll))
+
+  image(s, s, ll,
+    main = paste0("Elastic net with alpha = ", alpha),
+    xlab = "slope", ylab = "intercept",
+    col = viridis(40, option = "A", direction = 1),
+    xlim = lims, ylim = lims
+  )
+  abline(v = 0, lty = "dashed")
+  abline(h = 0, lty = "dashed")
+  points(
+    coef[ind_mle[[1]]], coef[ind_mle[[2]]],
+    pch = 19, cex = 2, col = "firebrick"
+  )
+  contour(
+    s, s, norm_mat,
+    add = TRUE, levels = norm, drawlabels = FALSE
+  )
+  points(
+    coef[ind_ple[[1]]], coef[ind_ple[[2]]],
+    pch = 19, cex = 2, col = "dodgerblue"
+  )
+}
+par(mfrow = c(1, 3))
+plot_elastic(0.25)
+plot_elastic(0.5)
+plot_elastic(0.75)
+```
+
+::::::::::::::::::::::::::::::::::::::: challenge
+
+### Challenge 6
+
+1. Fit an elastic net model (hint: `alpha = 0.5`) without cross-validation and
+   plot the model object.
+2. Fit an elastic net model with cross-validation and plot the error. Compare
+   with LASSO.
+3. Select the lambda within one standard error of
+   the minimum cross-validation error (hint: `lambda.1se`). Compare the
+   coefficients with the LASSO model.
+4. Discuss: how could we pick an `alpha` in the range (0, 1)? Could we justify
+   choosing one *a priori*?
+
+::::::::::::::: solution
+
+### Solution
+
+1. Fitting an elastic net model is just like fitting a LASSO model.
+   You can see that coefficients tend to go exactly to zero,
+   but the paths are a bit less
+   extreme than with pure LASSO; similar to ridge.
+    ```{r elastic}
+    elastic <- glmnet(methyl_mat[, -1], age, alpha = 0.5)
+    plot(elastic)
+    ```
+2. The process of model selection is the same for elastic net models as for
+   LASSO models.
+    ```{r elastic-cv, fig.cap="Cross-validated mean squared error for different values of lambda under an elastic net penalty.", fig.alt="Cross-validated mean squared error plotted against log lambda, with error bars, for an elastic net model."}
+    elastic_cv <- cv.glmnet(methyl_mat[, -1], age, alpha = 0.5)
+    plot(elastic_cv)
+    ```
+3. You can see that the coefficients from these two methods are broadly
+   similar, but the elastic net coefficients are a bit more conservative.
+   Further, more coefficients are exactly zero in the LASSO model.
+    ```{r elastic-plot, fig.cap="Elastic net coefficients plotted against LASSO coefficients.", fig.alt="Scatter plot of elastic net coefficients against LASSO coefficients, with a dashed identity line."}
+    coefe <- coef(elastic_cv, elastic_cv$lambda.1se)
+    sum(coefe[, 1] == 0)
+    sum(coefl[, 1] == 0)
+    plot(
+      coefl[, 1], coefe[, 1],
+      pch = 16,
+      xlab = "LASSO coefficients",
+      ylab = "Elastic net coefficients"
+    )
+    abline(0:1, lty = "dashed")
+    ```
+4. You could pick an arbitrary value of `alpha`, because arguably pure ridge
+   regression or pure LASSO regression are also arbitrary model choices.
+   To be rigorous, and to get both the best-performing model and the most
+   reliable inference about predictors, it's usually best to choose the
+   combination of `alpha` and `lambda` using a grid search approach
+   in cross-validation. However, this can be very computationally demanding.
+
+:::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::::::: callout
+
+### The bias-variance tradeoff
+
+When we make predictions in statistics, there are two sources of error
+that primarily influence the (in)accuracy of our predictions. These are *bias*
+and *variance*.
+
+The total expected error in our predictions is given by the following
+equation:
+
+$$
+E\left[(y - \hat{y})^2\right] = \text{Bias}^2 + \text{Variance} + \sigma^2
+$$
+
+Here, $\sigma^2$ represents the irreducible error, which we can never overcome.
+Bias results from erroneous assumptions in the model used for predictions.
+Fundamentally, bias means that our model is mis-specified in some way,
+and fails to capture some components of the data-generating process
+(which is true of all models). If we have failed to account for a confounding
+factor that leads to very inaccurate predictions in a subgroup of our
+population, then our model has high bias.
+
+Variance results from sensitivity to particular properties of the input data.
+For example, if a tiny change to the input data would result in a huge change
+to our predictions, then our model has high variance.
+
+Linear regression is an unbiased model under certain conditions.
+In fact, the [Gauss-Markov theorem](https://en.wikipedia.org/wiki/Gauss%E2%80%93Markov_theorem)
+shows that under the right conditions, OLS is the best linear
+unbiased estimator.
+
+Introducing penalties means that our model is no longer unbiased, meaning
+that the coefficients estimated from our data will systematically deviate
+from the ground truth. Why would we do this? As we saw, the total error is
+a function of bias and variance. By accepting a small amount of bias, it's
+possible to achieve huge reductions in the total expected error.
+
+This bias-variance tradeoff is also why people often favour elastic net
+regression over pure LASSO regression.
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
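+
+We can get a feel for the variance part of this tradeoff with a small
+simulation, reusing the training objects from earlier in this episode. The
+resampling scheme and the lambda values below are arbitrary choices for
+illustration; the point is simply that ridge coefficient estimates vary less
+between resampled datasets as the penalty grows:
+
+```{r variance-sketch}
+set.seed(42)
+lambdas <- c(0.01, 1, 100)
+sapply(lambdas, function(l) {
+  ## refit ridge regression on 20 bootstrap resamples of the training data
+  coefs <- replicate(20, {
+    idx <- sample(nrow(train_mat), replace = TRUE)
+    fit <- glmnet(train_mat[idx, ], train_age[idx], alpha = 0, lambda = l)
+    as.matrix(coef(fit))[-1, 1]
+  })
+  ## average, across features, of each coefficient's variance over resamples
+  mean(apply(coefs, 1, var))
+})
+```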
+
+::::::::::::::::::::::::::::::::::::::::: callout
+
+### Other types of outcomes
+
+You may have noticed that the name `glmnet` includes "glm", not "lm".
+This means we can actually model a variety of different outcomes
+using this regularisation approach. For example, we can model binary
+variables using logistic regression, as shown below. The type of outcome
+can be specified using the `family` argument, which specifies the family
+of the outcome variable.
+
+In fact, `glmnet` is somewhat cheeky as it also allows you to model
+survival using Cox proportional hazards models, which aren't GLMs, strictly
+speaking.
+
+For example, in the current dataset we can model smoking status as a binary
+variable in logistic regression by setting `family = "binomial"`.
+
+The [package documentation](https://glmnet.stanford.edu/articles/glmnet.html)
+explains this in more detail.
+
+```{r binomial, fig.cap="Cross-validated binomial deviance against log lambda for a penalised logistic regression model of smoking status.", fig.alt="Plot of cross-validated binomial deviance for different values of lambda in a penalised logistic regression model of smoking status."}
+smoking <- as.numeric(factor(methylation$smoker)) - 1
+# binary outcome
+table(smoking)
+fit <- cv.glmnet(x = methyl_mat, nfolds = 5, y = smoking, family = "binomial")
+
+coef <- coef(fit, s = fit$lambda.min)
+coef <- as.matrix(coef)
+coef[which(coef != 0), 1]
+plot(fit)
+```
+
+In this case, the results aren't very interesting! We select an intercept-only
+model. However, as highlighted by the warnings above, we should not trust this
+result too much, as the dataset is too small to obtain reliable results! We only
+included it here to provide the code that *could* be used to perform penalised
+regression for binary outcomes (i.e. penalised logistic regression).
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::::::: callout
+
+### tidymodels
+
+A lot of the packages for fitting predictive models like regularised
+regression have different user interfaces. To do predictive modelling, it's
+important to consider things like choosing a good performance metric and
+how to normalise the data beforehand. It's also useful to compare different
+model "engines".
+
+To this end, the **`tidymodels`** R framework exists. We're not doing a course on
+advanced topics in predictive modelling, so we are not covering this framework
+in detail. However, the code below would be useful to perform repeated
+cross-validation. More information about **`tidymodels`**, including installation
+instructions, can be found [here](https://www.rdocumentation.org/packages/tidymodels/versions/1.1.1).
+
+```{r tidymodels, eval=FALSE}
+library("tidymodels")
+all_data <- as.data.frame(cbind(age = age, methyl_mat))
+split_data <- initial_split(all_data)
+
+norm_recipe <- recipe(training(split_data)) %>%
+  ## everything other than age is a predictor
+  update_role(everything(), new_role = "predictor") %>%
+  update_role(age, new_role = "outcome") %>%
+  ## center and scale all the predictors
+  step_center(all_predictors()) %>%
+  step_scale(all_predictors())
+
+## set the "engine" to be a linear model with tunable alpha and lambda
+glmnet_model <- linear_reg(penalty = tune(), mixture = tune()) %>%
+  set_engine("glmnet")
+
+## define a workflow, with normalisation recipe into glmnet engine
+workflow <- workflow() %>%
+  add_recipe(norm_recipe) %>%
+  add_model(glmnet_model)
+
+## 5-fold cross-validation repeated 5 times
+folds <- vfold_cv(training(split_data), v = 5, repeats = 5)
+
+## define a grid of lambda and alpha parameters to search
+glmn_set <- parameters(
+  penalty(range = c(-5, 1), trans = log10_trans()),
+  mixture()
+)
+glmn_grid <- grid_regular(glmn_set)
+ctrl <- control_grid(save_pred = TRUE, verbose = TRUE)
+
+## use the metric "rmse" (root mean squared error) to grid search for the
+## best model
+results <- workflow %>%
+  tune_grid(
+    resamples = folds,
+    metrics = metric_set(rmse),
+    control = ctrl
+  )
+## select the best model based on RMSE
+best_mod <- results %>% select_best("rmse")
+best_mod
+## finalise the workflow and fit it with all of the training data
+final_workflow <- finalize_workflow(workflow, best_mod)
+final_workflow
+final_model <- final_workflow %>%
+  fit(data = training(split_data))
+
+## plot predicted age against true age for test data
+plot(
+  testing(split_data)$age,
+  predict(final_model, new_data = testing(split_data))$.pred,
+  xlab = "True age",
+  ylab = "Predicted age",
+  pch = 16,
+  log = "xy"
+)
+abline(0:1, lty = "dashed")
+```
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+## Further reading
+
+- [An introduction to statistical learning](https://www.statlearning.com/).
+- [Elements of statistical learning](https://web.stanford.edu/~hastie/ElemStatLearn/).
+- [glmnet vignette](https://glmnet.stanford.edu/articles/glmnet.html).
+- [tidymodels](https://www.tidymodels.org/).
+
+## Footnotes
+
+[^1]: Model selection methods, including the use of $R^2$, AIC and BIC, are covered in the additional feature selection for regression episode of this course.
+[^2]: [Epigenetic Predictor of Age, Bocklandt et al. (2011)](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0014821)
+
+
+:::::::::::::::::::::::::::::::::::::::: keypoints
+
+- Regularisation is a way to fit a model, get better estimates of effect sizes, and perform variable selection simultaneously.
+- Test and training splits, or cross-validation, are a useful way to select models or hyperparameters.
+- Regularisation can give us a more predictive set of variables, and by restricting the magnitude of coefficients, can give us a better (and more stable) estimate of our outcome.
+- Regularisation is often *very* fast! Compared to other methods for variable selection, it is very efficient. This makes it easier to practise rigorous variable selection.
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+
diff --git a/episodes/04-principal-component-analysis.Rmd b/episodes/04-principal-component-analysis.Rmd
new file mode 100644
index 00000000..6c42fd30
--- /dev/null
+++ b/episodes/04-principal-component-analysis.Rmd
@@ -0,0 +1,841 @@
+---
+title: Principal component analysis
+author: GS Robertson
+source: Rmd
+teaching: 90
+exercises: 30
+math: yes
+---
+
+::::::::::::::::::::::::::::::::::::::: objectives
+
+- Identify situations where PCA can be used to answer research questions using high-dimensional data.
+- Perform a PCA on high-dimensional data.
+- Select the appropriate number of principal components.
+- Interpret the output of PCA.
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+:::::::::::::::::::::::::::::::::::::::: questions
+
+- What is principal component analysis (PCA) and when can it be used?
+- How can we perform a PCA in R?
+- How many principal components are needed to explain a significant amount of variation in the data?
+- How to interpret the output of PCA using loadings and principal components?
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+```{r settings, include=FALSE}
+library("here")
+source(here("bin/chunk-options.R"))
+```
+
+## Introduction
+
+Imagine a dataset which contains many variables ($p$), close to the total number
+of rows in the dataset ($n$). Some of these variables are highly correlated and
+several form groups which you might expect to represent the same overall effect.
+Such datasets are challenging to analyse for several reasons, with the main
+problem being how to reduce dimensionality in the dataset while retaining the
+important features.
+
+In this episode we will explore *principal component analysis* (PCA) as a
+popular method of analysing high-dimensional data. PCA is an unsupervised
+statistical method which allows large datasets of correlated variables to be
+summarised into smaller numbers of uncorrelated principal components that
+explain most of the variability in the original dataset. This is useful,
+for example, during initial data exploration as it allows correlations among
+data points to be observed and principal components to be calculated for
+inclusion in further analysis (e.g. linear regression). An example of PCA might
+be reducing several variables representing aspects of patient health
+(blood pressure, heart rate, respiratory rate) into a single feature.
+
+## Advantages and disadvantages of PCA
+
+Advantages:
+
+- It is a relatively easy-to-use and popular method.
+- Various software/packages are available to run a PCA.
+- The calculations used in a PCA are easy to understand for statisticians and
+  non-statisticians alike.
+
+Disadvantages:
+
+- It assumes that variables in a dataset are correlated.
+- It is sensitive to the scale at which input variables are measured.
+  If input variables are measured at different scales, the variables
+  with large variance relative to the scale of measurement will have
+  greater impact on the principal components relative to variables with smaller
+  variance. In many cases, this is not desirable.
+- It is not robust against outliers, meaning that very large or small data
+  points can have a large effect on the output of the PCA.
+- PCA assumes a linear relationship between variables, which is not always a
+  realistic assumption.
+- It can be difficult to interpret the meaning of the principal components,
+  especially when including them in further analyses (e.g. inclusion in a linear
+  regression).
+
+::::::::::::::::::::::::::::::::::::::::: callout
+
+### Supervised vs unsupervised learning
+
+Most statistical problems fall into one of two categories: supervised or
+unsupervised learning.
+Supervised learning problems, such as linear regression, are
+analyses in which each observation has at least one independent variable
+($x$) as well as a dependent variable ($y$). In supervised learning problems
+the aim is to predict the value of the response given future observations or
+to understand the relationship between the dependent variable and the
+predictors. In unsupervised learning, there is no
+dependent variable ($y$) for each observation, only
+a series of independent variables. In this situation there is no need for
+prediction, as there is no dependent variable to predict (hence the analysis
+can be thought of as being unsupervised by the dependent variable). Instead,
+statistical analysis can be used to understand relationships between the
+independent variables or between observations themselves. Unsupervised
+learning problems often occur when analysing high-dimensional datasets in
+which there is no obvious dependent variable to be
+predicted, but the analyst would like to understand more about patterns
+between groups of observations or reduce dimensionality so that a supervised
+learning process may be used.
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::::: challenge
+
+### Challenge 1
+
+Descriptions of three datasets and research questions are given below. For
+which of these might PCA be considered a useful tool for analysing data so
+that the research questions may be addressed?
+
+1. An epidemiologist has data collected from different patients admitted to
+   hospital with infectious respiratory disease. They would like to determine
+   whether length of stay in hospital differs in patients with different
+   respiratory diseases.
+2. An online retailer has collected data on user interactions with its online
+   app and has information on the number of times each user interacted with
+   the app, what products they viewed per interaction, and the type and cost
+   of these products. The retailer would like to use this information to
+   predict whether or not a user will be interested in a new product.
+3. A scientist has assayed gene expression levels in 1000 cancer patients and
+   has data from probes targeting different genes in tumour samples from
+   patients.
+   She would like to create new variables representing relative
+   abundance of different groups of genes to i) find out if genes form
+   subgroups based on biological function and ii) use these new variables
+   in a linear regression examining how gene expression varies with disease
+   severity.
+4. All of the above.
+
+::::::::::::::: solution
+
+### Solution
+
+In the first case, a regression model would be more suitable; perhaps a
+survival model.
+In the second, again a regression model, likely linear or logistic, would
+be more suitable.
+In the third example, PCA can help to identify modules of correlated
+features that explain a large amount of variation within the data.
+
+Therefore the answer here is 3.
+
+:::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+## What is a principal component?
+
+```{r, eval=FALSE, echo=FALSE}
+# A PCA is carried out by calculating a matrix of Pearson's correlations from
+# the original dataset which shows how each of the variables in the dataset
+# relate to each other.
+```
+
+The first principal component is the direction of the data along which the
+observations vary the most. The second principal component is the direction of
+the data along which the observations show the next highest amount of variation.
+For example, Figure 1 shows biodiversity index versus percentage area left
+fallow for 50 farms in southern England. The red line represents the first
+principal component direction of the data, which is the direction along which
+there is greatest variability in the data. Projecting points onto this line
+(i.e. by finding the location on the line closest to the point) would give a
+vector of points with the greatest possible variance. The next highest amount
+of variability in the data is represented by the line perpendicular to the
+first principal component; this is the second principal component (green line).
+
+The second principal component is a linear combination of the variables that
+is uncorrelated with the first principal component. There are as many principal
+components as there are variables in your dataset, but as we'll see, some are
+more useful at explaining your data than others. By definition, the first
+principal component explains more variation than other principal components.
+
+```{r fig1, echo=FALSE, fig.cap="Biodiversity index versus percentage area left fallow, with the first (red) and second (green) principal component directions.", fig.alt="Scatter plot of biodiversity index against percentage area left fallow for 50 farms, with a red line showing the first principal component direction and a green line showing the second."}
+knitr::include_graphics("fig/bio_index_vs_percentage_fallow.png")
+```
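+
+We can check this "greatest possible variance" property numerically. The
+sketch below uses simulated two-dimensional data (not the farm data in the
+figure) and confirms that projecting onto the first principal component
+direction gives the projection with the largest variance:
+
+```{r projection-sketch}
+set.seed(1)
+x1 <- rnorm(50)
+x2 <- x1 + rnorm(50, sd = 0.5) # two correlated variables
+dat <- scale(cbind(x1, x2), center = TRUE, scale = FALSE)
+pca_demo <- prcomp(dat)
+## project the points onto each principal component direction
+projections <- dat %*% pca_demo$rotation
+apply(projections, 2, var) # the variance is largest for PC1
+```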
+
+The animation below illustrates how principal components are calculated from
+data. You can imagine that the black line is a rod and each red dashed line is
+a spring. The energy of each spring is proportional to its squared length. The
+direction of the first principal component is the one that minimises the total
+energy of all of the springs. In the animation below, the springs pull the rod,
+finding the direction of the first principal component when they reach
+equilibrium. We then use the length of the springs from the rod as the first
+principal component.
+This is explained in more detail on [this Q\&A website](https://stats.stackexchange.com/questions/2691/making-sense-of-principal-component-analysis-eigenvectors-eigenvalues).
+
+```{r pendulum, echo=FALSE, fig.cap="Animation showing the first principal component direction being found as the direction that minimises the total energy of springs connecting each point to a rod.", fig.alt="Animated scatter plot in which a rotating rod settles on the direction of greatest variability in the data, with dashed lines connecting each point to the rod."}
+knitr::include_graphics("fig/pendulum.gif")
+```
+
+The first principal component's scores ($Z_1$) are calculated using the equation:
+
+$$
+Z_1 = a_{11}X_1 + a_{21}X_2 + \dots + a_{p1}X_p
+$$
+
+$X_1 \dots X_p$ represent the variables in the original dataset and
+$a_{11} \dots a_{p1}$
+represent principal component loadings, which can be thought of as the degree to
+which each variable contributes to the calculation of the principal component.
+We will come back to principal component scores and loadings further below.
+
+## How do we perform a PCA?
+
+### A prostate cancer dataset
+
+The `prostate` dataset represents data from 97
+men who have prostate cancer. The data come from a study which examined the
+correlation between the level of prostate specific antigen and a number of
+clinical measures in men who were about to receive a radical prostatectomy.
+The data have 97 rows and 9 columns.
+
+Columns include:
+
+- `lcavol` (log-transformed cancer volume),
+- `lweight` (log-transformed prostate weight),
+- `lbph` (log-transformed amount of benign prostate enlargement),
+- `svi` (seminal vesicle invasion),
+- `lcp` (log-transformed capsular penetration; amount of spread of cancer in
+  outer walls of prostate),
+- `gleason` (Gleason score; grade of cancer cells),
+- `pgg45` (percentage Gleason scores 4 or 5),
+- `lpsa` (log-transformed prostate specific antigen; level of PSA in blood),
+- `age` (patient age in years).
+
+Here we will calculate principal component scores for each of the rows in this
+dataset, using five principal components (one for each variable included in the
+PCA). We will include five clinical variables in our PCA (the continuous
+variables in the prostate dataset), so that we can create fewer variables
+representing clinical markers of cancer progression. Standard PCAs are carried
+out using continuous variables only.
+
+First, we will examine the `prostate` dataset (originally part of the
+**`lasso2`** package):
+
+```{r prostate-load}
+prostate <- readRDS(here("data/prostate.rds"))
+```
+
+```{r prostate-head}
+head(prostate)
+```
+
+Note that each row of the dataset represents a single patient.
+
+We will create a subset of the data including only the clinical variables we
+want to use in the PCA.
+
+```{r pros2}
+pros2 <- prostate[, c("lcavol", "lweight", "lbph", "lcp", "lpsa")]
+head(pros2)
+```
+
+### Do we need to standardise the data?
+
+Now we compare the variances between variables in the dataset.
+
+```{r var-hist, fig.cap="Histograms of the lweight and lbph variables from the prostate dataset."}
+apply(pros2, 2, var)
+par(mfrow = c(1, 2))
+hist(pros2$lweight, breaks = "FD")
+hist(pros2$lbph, breaks = "FD")
+```
+
+Note that variance is greatest for `lbph` and lowest for `lweight`. It is clear
+from this output that we need to scale each of these variables before including
+them in a PCA to ensure that differences in variances between variables
+do not drive the calculation of principal components. In this example we
+standardise all five variables to have a mean of 0 and a standard
+deviation of 1.
+
+::::::::::::::::::::::::::::::::::::::: challenge
+
+### Challenge 2
+
+Why might it be necessary to standardise variables before performing a PCA?
+Can you think of datasets where it might not be necessary to standardise
+variables?
+Discuss.
+
+1. To make the results of the PCA interesting.
+2. If you want to ensure that variables with different ranges of values
+   contribute equally to analysis.
+3. To allow the feature matrix to be calculated faster, especially in cases
+   where there are a lot of input variables.
+4. To allow both continuous and categorical variables to be included in the PCA.
+5. All of the above.
+
+::::::::::::::: solution
+
+### Solution
+
+The answer is 2.
+Scaling the data isn't guaranteed to make the results more interesting.
+It also won't affect how quickly the output is calculated, whether
+continuous and categorical variables are present or not.
+
+It is done to ensure that all features have equal weighting in the resulting
+PCs.
+
+You may not want to standardise datasets which contain continuous variables
+all measured on the same scale (e.g. gene expression data or RNA sequencing
+data). In this case, variables with very little sample-to-sample variability
+may represent only random noise, and standardising the data would give
+these extra weight in the PCA.
+
+:::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+Next we will carry out a PCA using the `prcomp()` function in base R. The input
+data (`pros2`) is in the form of a matrix. Note that the `scale = TRUE` argument
+is used to standardise the variables to have a mean of 0 and a standard
+deviation of 1.
+
+```{r prcomp}
+pca.pros <- prcomp(pros2, scale = TRUE, center = TRUE)
+pca.pros
+```
+
+## How many principal components do we need?
+
+We have calculated one principal component for each variable in the original
+dataset. How do we choose how many of these are necessary to represent the true
+variation in the data, without having extra components that are unnecessary?
+
+Let's look at the relative importance of each component using `summary`.
+
+```{r summ}
+summary(pca.pros)
+```
+
+```{r, echo=FALSE}
+# Get proportions of variance explained by each PC (rounded to 2 DP)
+prop.var <- round(summary(pca.pros)$importance["Proportion of Variance", ], 2) *
+  100
+```
+
+This returns the proportion of variance in the data explained by each of the
+(p = 5) principal components. In this example, PC1 explains approximately
+`r prop.var[[1]]`% of variance in the data, PC2 `r prop.var[[2]]`% of variance,
+PC3 a further `r prop.var[[3]]`%, PC4 approximately `r prop.var[[4]]`% and PC5
+around `r prop.var[[5]]`%.
+
+Let us visualise this. A plot of the amount of variance accounted for by each
+PC is also called a scree plot. Note that the amount of variance accounted for
+by a principal component is also called the eigenvalue, and thus the y-axis in
+scree plots is often labelled "eigenvalue".
+
+Often, scree plots show a characteristic pattern where initially, the variance
+drops rapidly with each additional principal component. But then there is an
+"elbow" after which the variance decreases more slowly. The total variance
+explained up to the elbow point is sometimes interpreted as structural
+variance that is relevant and should be retained versus noise
+which may be discarded after the elbow.
+
+```{r varexp}
+# calculate percentage variance explained using output from the PCA
+varExp <- (pca.pros$sdev^2) / sum(pca.pros$sdev^2) * 100
+# create new dataframe with five rows, one for each principal component
+varDF <- data.frame(Dimensions = 1:length(varExp), varExp = varExp)
+```
+
+```{r vardf-plot, fig.cap="Scree plot showing the percentage of variance explained by each principal component of the prostate data."}
+plot(varDF)
+```
+
+The screeplot shows that the first principal component explains most of the
+variance in the data (>50%) and each subsequent principal component explains
+less and less of the total variance.
+The first two principal components
+explain >70% of variance in the data. But what do these two principal
+components mean?
+
+### What are loadings and principal component scores?
+
+Most PCA functions will produce two main output matrices: the
+*principal component scores* and the *loadings*. The matrix of principal
+component scores has as many rows as there were observations in the input
+matrix. These scores are what is usually visualised or used for downstream
+analyses. The matrix of loadings (also called the rotation matrix) has as many
+rows as there are features in the original data. It contains information about
+how the (usually centered and scaled) original data relate to the PC scores.
+
+When calling a PCA object generated with `prcomp()`, the loadings are printed by default:
+
+```{r pca-pros}
+pca.pros
+```
+
+The principal component scores are obtained by carrying out matrix
+multiplication of the (usually centered and scaled) original data times the
+loadings. The following callout demonstrates this.
+
+::::::::::::::::::::::::::::::::::::::::: callout
+
+### Computing a PCA "by hand"
+
+The rotation matrix obtained in a PCA is identical to the eigenvectors
+of the covariance matrix of the data. Multiplying these with the (centered and
+scaled) data yields the PC scores:
+
+```{r pca-by-hand}
+pros2.scaled <- scale(pros2) # centre and scale the Prostate data
+pros2.cov <- cov(pros2.scaled) # generate covariance matrix
+pros2.cov
+pros2.eigen <- eigen(pros2.cov) # perform eigen decomposition
+pros2.eigen # The slot $vectors = rotation of the PCA
+# generate PC scores by hand, using matrix multiplication
+my.pros2.pcs <- pros2.scaled %*% pros2.eigen$vectors
+# compare results
+par(mfrow = c(1, 2))
+plot(pca.pros$x[, 1:2], main = "prcomp()")
+abline(h = 0, v = 0, lty = 2)
+plot(my.pros2.pcs[, 1:2], main = "\"By hand\"", xlab = "PC1", ylab = "PC2")
+abline(h = 0, v = 0, lty = 2)
+par(mfrow = c(1, 1))
+# Note that the axis orientations may be swapped, but the relative positions
+# of the dots should be the same in both plots.
+```
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+One way to visualise how principal components relate to the original variables
+is by creating a biplot. Biplots usually show two principal components plotted
+against each other. Observations are sometimes labelled with numbers. The
+contribution of each original variable to the principal components displayed
+is then shown by arrows (generated from those two columns of the rotation
+matrix that correspond to the principal components shown). NB, there are
+several biplot implementations in different R libraries. It is thus a good
+idea to specify the desired package when calling `biplot()`. A biplot of the
+first two principal components can be generated as follows:
+
+```{r stats-biplot, fig.cap="Biplot of the first two principal components of the prostate data, with patient numbers and loading arrows for each variable."}
+stats::biplot(pca.pros, xlim = c(-0.3, 0.3))
+```
+
+This biplot shows the position of each patient on a 2-dimensional plot where
+loadings can be observed via the red arrows associated with each of
+the variables. The variables `lpsa`, `lcavol` and `lcp` are associated with
+positive values on PC1 while positive values on PC2 are associated with the
+variables `lbph` and `lweight`. The length of the arrows indicates how much
+each variable contributes to the calculation of each principal component.
+
+The left and bottom axes show normalised principal component scores.
+The axes
+on the top and right of the plot are used to interpret the loadings, where
+loadings are scaled by the standard deviation of the principal components
+(`pca.pros$sdev`) times the square root of the number of observations.
+
+Finally, you need to know that PC scores and rotations may have different slot
+names, depending on the PCA implementation you use. Here are some examples:
+
+| library::command() | PC scores | Loadings  |
+| ------------------ | --------- | --------- |
+| stats::prcomp()    | $x        | $rotation |
+| stats::princomp()  | $scores   | $loadings |
+| PCAtools::pca()    | $rotated  | $loadings |
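+
+As a quick check of the table above, we can pull the same scores out of the
+two base R implementations, reusing `pros2` from earlier. Signs may be
+flipped, and the two functions use slightly different scaling (divisors of
+$n$ versus $n - 1$), so the values can differ very slightly:
+
+```{r slot-names-sketch}
+pca_prcomp <- stats::prcomp(pros2, scale = TRUE)
+pca_princomp <- stats::princomp(pros2, cor = TRUE)
+head(pca_prcomp$x[, 1])        # PC1 scores from prcomp()
+head(pca_princomp$scores[, 1]) # equivalent PC1 scores from princomp()
+```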
+
+## Using PCA to analyse gene expression data
+
+In this section you will carry out your own PCA using the Bioconductor package **`PCAtools`**
+applied to gene expression data to explore the topics covered above.
+**`PCAtools`** provides functions that can be used to explore data via PCA and
+produce useful figures and analysis tools. The package is made for the somewhat unusual
+Bioconductor style of data tables (observations in columns, features in rows). When
+using Bioconductor data sets and **`PCAtools`**, it is thus not necessary to transpose the data.
+
+### A gene expression dataset of cancer patients
+
+The dataset we will be analysing in this lesson includes two subsets of data:
+
+- a matrix of gene expression data showing microarray results for different
+  probes used to examine gene expression profiles in 91 different breast
+  cancer patient samples.
+- metadata associated with the gene expression results detailing information
+  from patients from whom samples were taken.
+
+Let's load the **`PCAtools`** package and the data.
+
+```{r pcatools}
+library("PCAtools")
+```
+
+We will first load the microarray breast cancer gene expression data and
+associated metadata, downloaded from the
+[Gene Expression Omnibus](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE2990).
+
+```{r se}
+library("SummarizedExperiment")
+cancer <- readRDS(here::here("data/cancer_expression.rds"))
+mat <- assay(cancer)
+metadata <- colData(cancer)
+```
+
+```{r mat, eval=FALSE}
+View(mat)
+# nrow = 22215 probes
+# ncol = 91 samples
+```
+
+```{r meta, eval=FALSE}
+View(metadata)
+# nrow = 91
+# ncol = 8
+```
+
+```{r colnames}
+all(colnames(mat) == rownames(metadata))
+# Check that column names and row names match;
+# if they do, this should return TRUE
+```
+
+The `mat` object contains a matrix of gene expression profiles for each sample.
+Rows represent gene expression measurements and columns represent samples. The
+`metadata` object contains the metadata associated with the gene expression
+data, including the name of the study from which data originate, the age of the
+patient from which the sample was taken, whether or not an oestrogen receptor
+was involved in their cancer, and the grade and size of the cancer for each
+sample (represented by rows).
+
+Microarray data are difficult to analyse for several reasons. Firstly,
+they are typically high-dimensional and therefore are subject to the same
+difficulties associated with analysing high-dimensional data outlined above
+(i.e. *p* > *n*, large numbers of rows, multiple possible response variables,
+curse of dimensionality). Secondly, formulating a research question using
+microarray data can be difficult, especially if not much is known a priori
+about which genes code for particular phenotypes of interest. Finally,
+exploratory analysis, which can be used to help formulate research questions
+and display relationships, is difficult using microarray data due to the number
+of potentially interesting response variables (i.e. expression data from probes
+targeting different genes).
+
+If researchers hypothesise that groups of genes (e.g. biological pathways) may
+be associated with different phenotypic characteristics of cancers (e.g.
+histologic grade, tumour size), using statistical methods that reduce the
+number of columns in the microarray matrix to a smaller number of dimensions
+representing groups of genes would help visualise the data and address
+research questions regarding the effect different groups of genes have on
+disease progression.
+
+Using **`PCAtools`**, we will apply a PCA to the cancer
+gene expression data, plot the amount of variation in the data explained by
+each principal component, and plot the most important principal components
+against each other, as well as understanding what each principal component
+represents.
+
+::::::::::::::::::::::::::::::::::::::: challenge
+
+### Challenge 3
+
+Apply a PCA to the cancer gene expression data using the `pca()` function from
+**`PCAtools`**. You can use the help files in PCAtools to find out about the `pca()`
+function (type `help("pca")` or `?pca` in R).
+
+Let us assume we only care about the principal components accounting for the top
+80% of the variance in the dataset. Use the `removeVar` argument in `pca()` to remove
+the PCs accounting for the bottom 20%.
+
+As in the example using prostate data above, examine the first 5 rows and
+columns of rotated data and loadings from your PCA.
+
+::::::::::::::: solution
+
+### Solution
+
+```{r pca-ex}
+pc <- pca(mat, metadata = metadata)
+# Many PCs explain a very small amount of the total variance in the data
+# Remove the lower 20% of PCs with lower variance
+pc <- pca(mat, metadata = metadata, removeVar = 0.2)
+# Explore other arguments provided in pca
+pc$rotated[1:5, 1:5]
+pc$loadings[1:5, 1:5]
+
+which.max(pc$loadings[, 1])
+pc$loadings[49, ]
+
+which.max(pc$loadings[, 2])
+pc$loadings[27, ]
+```
+
+The function `pca()` is used to perform PCA, and uses as inputs a matrix
+(`mat`) containing continuous numerical data
+in which rows are data variables and columns are samples, and `metadata`
+associated with the matrix in which rows represent samples and columns
+represent data variables. It has options to centre or scale the input data
+before a PCA is performed, although in this case gene expression data do
+not need to be transformed prior to PCA being carried out as variables are
+measured on a similar scale (values are comparable between rows). The output
+of the `pca()` function includes a lot of information such as loading values
+for each variable (`loadings`), principal component scores (`rotated`)
+and the amount of variance in the data
+explained by each principal component.
+
+The rotated data show the principal component scores for each sample on each
+principal component. The loadings show the contribution each variable makes
+to each principal component.
+
+:::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::::::: callout
+
+### Scaling variables for PCA
+
+When running `pca()` above, we kept the default setting, `scale=FALSE`. That
+means genes with higher variation in their expression levels should have
+higher loadings, which is what we are interested in.
+Whether or not to scale variables for PCA will depend on your data and
+research question.
+
+Note that this is different from normalising gene expression data. Gene
+expression data have to be normalised before downstream analyses can be
+carried out. This is to reduce the effect of technical and other potentially
+confounding factors. We assume that the expression data we use have been
+normalised previously.
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+### Choosing how many components are important to explain the variance in the data
+
+As in the example using the `prostate` dataset, we can use a screeplot to
+compare the proportion of variance in the data explained by each principal
+component. This allows us to understand how much information in the microarray
+dataset is lost by projecting the observations onto the first few principal
+components and whether these principal components represent a reasonable
+amount of the variation. The proportion of variance explained should sum to one.
+
+There are no clear guidelines on how many principal components should be
+included in PCA: your choice depends on the total variability of the data and
+the size of the dataset. We often look at the "elbow" of the screeplot as an
+indicator that the addition of principal components does not drastically
+contribute to explaining the remaining variance, or we choose an arbitrary
+cut-off for the proportion of variance explained.
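+
+For the second approach, the cumulative variance explained is easy to query
+directly. For instance, assuming the `pc` object fitted in Challenge 3 above
+(**`PCAtools`** stores the percentage of variance explained by each retained
+principal component in `pc$variance`):
+
+```{r cutoff-sketch}
+## smallest number of PCs that together explain at least 75% of the variance
+min(which(cumsum(pc$variance) >= 75))
+```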
+
+::::::::::::::::::::::::::::::::::::::: challenge
+
+### Challenge 4
+
+Using the `screeplot()` function in **`PCAtools`**, create a screeplot to show
+the proportion of variance explained by each principal component. Explain the
+output of the screeplot in terms of the proportion of variance in the data
+explained by each principal component.
+
+::::::::::::::: solution
+
+### Solution
+
+```{r scree-ex, fig.cap="Screeplot showing the percentage of variance explained by each principal component, with the cumulative percentage in red."}
+screeplot(pc, axisLabSize = 5, titleLabSize = 8)
+```
+
+Note that the first principal component (PC1) explains more variation than
+other principal components (which is always the case in PCA). The screeplot
+shows that the first principal component only explains ~33% of the total
+variation in the microarray data and many principal components explain very
+little variation. The red line shows the cumulative percentage of explained
+variation with increasing principal components. Note that in this case 18
+principal components are needed to explain over 75% of variation in the
+data. This is not an unusual result for complex biological datasets
+including genetic information, as clear relationships between groups are
+sometimes difficult to observe in the data. The screeplot shows that using
+a PCA we have reduced 91 predictors to 18 in order to explain a significant
+amount of variation in the data. See the additional arguments of the
+`screeplot()` function for improving the appearance of the plot.
+
+:::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+### Investigating the principal components
+
+Once the most important principal components have been identified using
+`screeplot()`, these can be explored in more detail by plotting principal
+components against each other and highlighting points based on variables in
+the metadata. This will allow any potential clustering of points according to
+demographic or phenotypic variables to be seen.
+
+We can use biplots to look for patterns in the output from the PCA. Note that
+there are two functions called `biplot()`, one in the package **`PCAtools`**
+and one in **`stats`**. Both functions produce biplots but their scales are
+different!
+
+::::::::::::::::::::::::::::::::::::::: challenge
+
+### Challenge 5
+
+Create a biplot of the first two principal components from your PCA
+using the `biplot()` function in **`PCAtools`**. See `help("PCAtools::biplot")`
+for arguments and their meaning. For instance, `lab` or `colby` may be useful.
+
+Examine whether the data appear to form clusters. Explain your results.
+
+::::::::::::::: solution
+
+### Solution
+
+```{r biplot-ex, fig.cap="Biplot of PC1 versus PC2 of the gene expression data, coloured by tumour grade."}
+biplot(pc, lab = NULL, colby = 'Grade', legendPosition = 'top')
+```
+
+The biplot shows the position of patient samples relative to PC1 and PC2
+in a 2-dimensional plot. Note that two groups are apparent along the PC1
+axis according to expressions of different genes, while no separation can be
+seen along the PC2 axis. Labels for each sample are added by
+default, but can be removed if there is too much overlap in names. Note that
+**`PCAtools`** does not scale the biplot in the same way as the `biplot()`
+function in the **`stats`** package.
+
+:::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+Let's consider this biplot in more detail, and also display the loadings:
+
+::::::::::::::::::::::::::::::::::::::: challenge
+
+### Challenge 6
+
+Use the `colby` and `lab` arguments in `biplot()` to explore whether these two
+groups may cluster by patient age or by whether or not the sample expresses
+the oestrogen receptor gene (ER+ or ER-).
+
+Note: You may see a warning about `ggrepel`. This happens when there are many
+labels but little space for plotting. This is not usually a serious problem -
+not all labels will be shown.
+
+::::::::::::::: solution
+
+### Solution
+
+```{r pca-biplot-ex2, fig.cap="Biplot of PC1 versus PC2, labelled by patient age and coloured by ER status."}
+PCAtools::biplot(pc,
+  lab = paste0(pc$metadata$Age, 'years'),
+  colby = 'ER',
+  hline = 0, vline = 0,
+  legendPosition = 'right')
+```
+
+It appears that one cluster has more ER+ samples than the other group.
+
+:::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+So far, we have only looked at a biplot of PC1 versus PC2 which only gives part
+of the picture. The `pairsplot()` function in **`PCAtools`** can be used to
+create multiple biplots including different principal components.
+
+```{r pairsplot, fig.cap="Pairs plot showing biplots for combinations of the first five principal components."}
+pairsplot(pc)
+```
+
+The plots show two apparent clusters involving the first principal component
+only. No other clusters are found involving other principal components. Each
+dot is coloured differently along a gradient of blues. This can potentially
+help to identify the same observation/individual in several panels. Here too,
+the argument `colby` allows you to set custom colours.
+
+Finally, it can sometimes be of interest to compare how certain variables
+contribute to different principal components. This can be visualised with
+`plotloadings()` from the **`PCAtools`** package. The function checks the
+range of loadings for each principal component specified (default: first five
+PCs). It then selects the features in the top and bottom 5% of these ranges
+and displays their loadings. This behaviour can be adjusted with the
+`rangeRetain` argument, which has 0.1 as the default value (i.e. 5% on each
+end of the range). NB, if there are too many labels to be plotted, you will
+see a warning. This is not a serious problem.
+
+```{r loadingsplots}
+plotloadings(pc, c("PC1"), rangeRetain = 0.1)
+plotloadings(pc, c("PC2"), rangeRetain = 0.1)
+plotloadings(pc, c("PC1", "PC2"), rangeRetain = 0.1)
+```
+
+You can see how the third code line produces more dots, some of which do not have
+extreme loadings. This is because all loadings selected for any PC are shown for all
+other PCs. For instance, it is plausible that features which have high loadings on
+PC1 may have lower ones on PC2.
+
+## Using PCA output in further analysis
+
+The output of PCA can be used to interpret data or can be used in further
+analyses. For example, the PCA outputs new variables (principal components)
+which represent several variables in the original dataset. These new variables
+are useful for further exploring data, for example, comparing principal
+component scores between groups or including the new variables in linear
+regressions. Because the principal components are uncorrelated,
+they can be included together in a single linear regression.
+
+::::::::::::::::::::::::::::::::::::::::: callout
+
+### Principal component regression
+
+PCA is often used to reduce large numbers of correlated variables into fewer
+uncorrelated variables that can then be included in linear regression or
+other models. This technique is called principal component regression (PCR)
+and it allows researchers to examine the effect of several correlated
+explanatory variables on a single response variable in cases where a high
+degree of correlation initially prevents them from being included in the same
+model. PCR is just one
+example of how principal components can be used in further analysis of data.
+When carrying out PCR, the variable of interest (response/dependent variable)
+is regressed against the principal components calculated using PCA, rather
+than against each individual explanatory variable from the original dataset.
+As there are as many principal components created from PCA as there are variables
+in the dataset, we must select which principal components to include in PCR.
+This can be done by examining the amount of variation in the data explained
+by each principal component (see above).
+
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
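+
+To make this concrete, here is a minimal sketch of a PCR-style model. This is
+not part of the original lesson: we assume the `pc` object from earlier and,
+purely for illustration, treat `Age` from its metadata as a continuous response.
+
+```{r pcr-sketch}
+## Regress age on the first five principal component scores;
+## pc$rotated holds the PC scores for each sample (PCAtools).
+pcr_df <- data.frame(age = as.numeric(pc$metadata$Age), pc$rotated[, 1:5])
+fit_pcr <- lm(age ~ ., data = pcr_df)
+summary(fit_pcr)
+```
+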
+## Further reading
+
+- James, G., Witten, D., Hastie, T. \& Tibshirani, R. (2013) An Introduction to Statistical Learning with Applications in R.
+  Chapter 6.3 (Dimension Reduction Methods), Chapter 10 (Unsupervised Learning).
+- [Jolliffe, I.T. \& Cadima, J. (2016) Principal component analysis: a review and recent developments. Phil. Trans. R. Soc A 374.](https://dx.doi.org/10.1098/rsta.2015.0202).
+- [Johnstone, I.M. \& Titterington, D.M. (2009) Statistical challenges of high-dimensional data. Phil. Trans. R. Soc A 367.](https://doi.org/10.1098/rsta.2009.0159)
+- [PCA: A Practical Guide to Principal Component Analysis, Analytics Vidhya](https://www.analyticsvidhya.com/blog/2016/03/pca-practical-guide-principal-component-analysis-python/).
+- [A One-Stop Shop for Principal Component Analysis, Towards Data Science](https://towardsdatascience.com/a-one-stop-shop-for-principal-component-analysis-5582fb7e0a9c).
+
+
+
+:::::::::::::::::::::::::::::::::::::::: keypoints
+
+- A principal component analysis is a statistical approach used to reduce dimensionality in high-dimensional datasets (i.e. where $p$ is equal to or greater than $n$).
+- PCA may be used to create a low-dimensional set of features from a larger set of variables. Examples of when a PCA may be useful include reducing high-dimensional datasets to fewer variables for use in a linear regression and for identifying groups with similar features.
+- PCA is a useful dimensionality reduction technique used in the analysis of complex biological datasets (e.g. high throughput data or genetics data).
+- The first principal component represents the dimension along which there is maximum variation in the data. Subsequent principal components represent dimensions with progressively less variation.
+- Screeplots and biplots may be used to show: 1. how much variation in the data is explained by each principal component and 2. how data points cluster according to principal component scores and which variables are associated with these scores.
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+
diff --git a/episodes/05-factor-analysis.Rmd b/episodes/05-factor-analysis.Rmd
new file mode 100644
index 00000000..f957a35b
--- /dev/null
+++ b/episodes/05-factor-analysis.Rmd
@@ -0,0 +1,318 @@
+---
+title: Factor analysis
+author: GS Robertson
+source: Rmd
+teaching: 25
+exercises: 10
+math: yes
+---
+
+::::::::::::::::::::::::::::::::::::::: objectives
+
+- Perform a factor analysis on high-dimensional data.
+- Select an appropriate number of factors.
+- Interpret the output of factor analysis.
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+:::::::::::::::::::::::::::::::::::::::: questions
+
+- What is factor analysis and when can it be used?
+- What are communality and uniqueness in factor analysis?
+- How to decide on the number of factors to use?
+- How to interpret the output of factor analysis?
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+```{r setup, include=FALSE}
+library("here")
+source(here("bin/chunk-options.R"))
+```
+
+## Introduction
+
+Biologists often encounter high-dimensional datasets from which they wish
+to extract underlying features – they need to carry out dimensionality
+reduction. The last episode dealt with one method to achieve this,
+called principal component analysis (PCA). Here, we introduce a more general
+set of methods called factor analysis (FA).
+
+There are two types of FA, called exploratory and confirmatory factor analysis
+(EFA and CFA). Both EFA and CFA aim to reproduce the observed relationships
+among a group of features with a smaller set of latent variables. EFA
+is used in a descriptive, data-driven manner to uncover which
+measured variables are reasonable indicators of the various latent dimensions.
+In contrast, CFA is conducted in an a-priori,
+hypothesis-testing manner that requires strong empirical or theoretical foundations.
+We will mainly focus on EFA here, which is used to group features into a specified
+number of latent factors.
+
+Unlike with PCA, researchers using FA have to specify the number of latent
+variables (factors) at the point of running the analysis. Researchers may use
+exploratory data analysis methods (including PCA) to provide an initial estimate
+of how many factors adequately explain the variation observed in a dataset.
+In practice, a range of different values is usually tested.
+
+### An example
+
+One scenario for using FA would be testing whether student scores in different subjects
+can be summarised by certain subject categories. Take a look at the hypothetical
+dataset below. If we were to run an EFA on this, we might find that the scores
+can be summarised well by two factors, which we can then interpret.
+We have
+labelled these hypothetical factors "mathematical ability" and "writing ability".
+
+```{r table, echo=FALSE}
+knitr::include_graphics("fig/table_for_fa.png")
+```
+
+So, EFA is designed to identify a specified number of unobservable factors from
+observable features contained in the original dataset. This is slightly
+different from PCA, which does not do this directly. Just to recap, PCA creates
+as many principal components as there are features in the dataset, each
+component representing a different linear combination of features. The principal
+components are ordered by the amount of variance they account for.
+
+## Advantages and disadvantages of Factor Analysis
+
+There are several advantages and disadvantages of using FA as a
+dimensionality reduction method.
+
+Advantages:
+
+- FA is a useful way of combining different groups of data into known
+  representative factors, thus reducing dimensionality in a dataset.
+- FA can take into account researchers' expert knowledge when choosing
+  the number of factors to use, and can be used to identify latent or hidden
+  variables which may not be apparent from using other analysis methods.
+- It is easy to implement, with many software tools available to carry out FA.
+- Confirmatory FA can be used to test hypotheses.
+
+Disadvantages:
+
+- Justifying the choice of
+  number of factors to use may be difficult if little is known about the
+  structure of the data before analysis is carried out.
+- Sometimes, it can be difficult to interpret what factors mean after
+  analysis has been completed.
+- Like PCA, standard methods of carrying out FA assume that input variables
+  are continuous, although extensions to FA allow ordinal and binary
+  variables to be included (after transforming the input matrix).
+
+## Prostate cancer patient data
+
+The prostate dataset represents data from 97 men who have prostate cancer.
+The data come from a study which examined the correlation between the level
+of prostate specific antigen and a number of clinical measures in men who were
+about to receive a radical prostatectomy. The data have 97 rows and 9 columns.
+
+Columns are:
+
+- `lcavol`: log (cancer volume)
+- `lweight`: log (prostate weight)
+- `age`: age (years)
+- `lbph`: log (benign prostatic hyperplasia amount)
+- `svi`: seminal vesicle invasion
+- `lcp`: log (capsular penetration); amount of spread of cancer in outer walls
+  of prostate
+- `gleason`: [Gleason score](https://en.wikipedia.org/wiki/Gleason_grading_system)
+- `pgg45`: percentage Gleason scores 4 or 5
+- `lpsa`: log (prostate specific antigen)
+
+In this example, we use FA to identify latent factors underlying the
+clinical variables measured in prostate cancer patients. Two principal
+components have already been identified as explaining a large proportion
+of variance in the data when these data were analysed in the PCA episode.
+We may expect a similar number of factors to exist in the data.
+
+Let's subset the data to just include the log-transformed clinical variables
+for the purposes of this episode:
+
+```{r prostate}
+prostate <- readRDS(here("data/prostate.rds"))
+```
+
+```{r view, eval=FALSE}
+View(prostate)
+```
+
+```{r dims}
+nrow(prostate)
+head(prostate)
+
+#select five log-transformed clinical variables for further analysis
+pros2 <- prostate[, c(1, 2, 4, 6, 9)]
+head(pros2)
+```
+
+## Performing exploratory factor analysis
+
+EFA may be implemented in R using the `factanal()` function
+from the **`stats`** package (which is a built-in package in base R). This
+function fits a factor analysis by maximising the log-likelihood using a
+data matrix as input. The number of factors to be fitted in the analysis
+is specified by the user using the `factors` argument. Options for
+transforming the factors by rotating the data in different ways are
+available via the `rotation` argument (the default is 'varimax').
+
+::::::::::::::::::::::::::::::::::::::: challenge
+
+### Challenge 1 (3 mins)
+
+Use the `factanal()` function to identify the minimum number of factors
+necessary to explain most of the variation in the data.
+
+::::::::::::::: solution
+
+### Solution
+
+```{r ex1}
+# Include one factor only
+pros_fa <- factanal(pros2, factors = 1)
+pros_fa
+# p-value <0.05 suggests that one factor is not sufficient:
+# we reject the null hypothesis that one factor captures the full
+# dimensionality in the dataset
+
+# Include two factors
+pros_fa <- factanal(pros2, factors = 2)
+pros_fa
+# p-value >0.05 suggests that two factors are sufficient:
+# we cannot reject the null hypothesis that two factors capture the
+# full dimensionality in the dataset
+
+#Include three factors
+pros_fa <- factanal(pros2, factors = 3)
+# Error shows that fitting three factors is not appropriate
+# for only 5 variables (number of factors too high)
+```
+
+:::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+The output of `factanal()` shows the loadings for each of the input variables
+associated with each factor. The loadings are values between -1 and 1 which
+represent the relative contribution each input variable makes to the factors.
+Positive values show that these variables are positively related to the
+factors, while negative values show a negative relationship between variables
+and factors. Loading values are missing for some variables because R does not
+print loadings less than 0.1.
+
+There are numerous ways to select the "best" number of factors. One is to use
+the minimum number of factors that does not leave a significant amount of
+variance unaccounted for. In practice, we repeat the factor
+analysis using different values in the `factors` argument. If we have an
+idea of how many factors there will be before analysis, we can start with
+that number. The final section of the analysis output shows the results of
+a hypothesis test in which the null hypothesis is that the number of factors
+used in the model is sufficient to capture most of the variation in the
+dataset. If the p-value is less than 0.05, we reject the null hypothesis
+and conclude that the number of factors included is too small. If the p-value
+is greater than 0.05, we cannot reject the null hypothesis that the number of
+factors used is sufficient to capture the variation in the data.
+
+As with PCA, the fewer factors that can explain most of the variation in the
+dataset, the better. It is easier to explore and interpret results using a
+smaller number of factors which represent underlying features in the data.
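+
+One way to make this search systematic is to fit models with a range of factor
+numbers and collect the p-values of the sufficiency test. A minimal sketch, not
+part of the original lesson, using `pros2` from above (with only 5 variables,
+`factanal()` cannot fit three or more factors, as we saw in the challenge):
+
+```{r fa-pvalues-sketch}
+## p-values of the sufficiency test for models with 1 and 2 factors
+sapply(1:2, function(nf) factanal(pros2, factors = nf)$PVAL)
+```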
+
+## Variance accounted for by factors - communality and uniqueness
+
+The *communality* of a variable is the sum of its squared loadings. It
+represents the proportion of the variance in a variable that is accounted
+for by the FA model.
+
+*Uniqueness* is the opposite of communality and represents the amount of
+variation in a variable that is not accounted for by the FA model. Uniqueness is
+calculated by subtracting the communality value from 1. If uniqueness is high for
+a given variable, that means this variable is not well explained/accounted for
+by the factors identified.
+
+```{r common-unique}
+apply(pros_fa$loadings^2, 1, sum) #communality
+1 - apply(pros_fa$loadings^2, 1, sum) #uniqueness
+```
+
+## Visualising the contribution of each variable to the factors
+
+Similar to the biplot we produced in the PCA episode, we can "plot the
+loadings". This shows how each original variable contributes to each of
+the factors we chose to visualise.
+
+```{r biplot}
+#First, carry out factor analysis using two factors
+pros_fa <- factanal(pros2, factors = 2)
+
+#plot loadings for each factor
+plot(
+  pros_fa$loadings[, 1],
+  pros_fa$loadings[, 2],
+  xlab = "Factor 1",
+  ylab = "Factor 2",
+  ylim = c(-1, 1),
+  xlim = c(-1, 1),
+  main = "Factor analysis of prostate data"
+)
+abline(h = 0, v = 0)
+
+#add column names to each point
+text(
+  pros_fa$loadings[, 1] - 0.08,
+  pros_fa$loadings[, 2] + 0.08,
+  colnames(pros2),
+  col = "blue"
+)
+```
+
+::::::::::::::::::::::::::::::::::::::: challenge
+
+### Challenge 2 (3 mins)
+
+Use the output from your factor analysis and the plots above to interpret
+the results of your analysis.
+
+What variables are most important in explaining each factor? Do you think
+this makes sense biologically? Discuss in groups.
+
+::::::::::::::: solution
+
+### Solution
+
+This plot suggests that the variables `lweight` and `lbph` are associated with
+high values on factor 2 (but lower values on factor 1) and the variables
+`lcavol`, `lcp` and `lpsa` are associated with high values on factor 1
+(but lower values on factor 2). There appear to be two 'clusters' of
+variables which can be represented by the two factors.
+
+The grouping of weight and enlargement (`lweight` and `lbph`) makes sense
+biologically, as we would expect prostate enlargement to be associated
+with greater weight. The grouping of `lcavol`, `lcp`, and `lpsa` also makes
+sense biologically, as larger cancer volume may be expected to be
+associated with greater cancer spread and therefore higher PSA in the blood.
+
+
+
+:::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+## Further reading
+
+- Gundogdu et al. (2019) Comparison of performances of Principal Component Analysis (PCA) and Factor Analysis (FA) methods on the identification of cancerous and healthy colon tissues. International Journal of Mass Spectrometry 445:116204.
+- Kustra et al. (2006) A factor analysis model for functional genomics. BMC Bioinformatics 7: doi:10.1186/1471-2105-7-21.
+- Yong, A.G. \& Pearce, S. (2013) A beginner's guide to factor analysis: focusing on exploratory factor analysis. Tutorials in Quantitative Methods for Psychology 9(2):79-94.
+- Confirmatory factor analysis can be carried out with the package [lavaan](https://www.lavaan.ugent.be/index.html).
+- A more sophisticated implementation of EFA is available in the packages [EFA.dimensions](https://cran.r-project.org/web/packages/EFA.dimensions/index.html) and [psych](https://personality-project.org/r/psych/).
+
+
+
+:::::::::::::::::::::::::::::::::::::::: keypoints
+
+- Factor analysis is a method used for reducing dimensionality in a dataset by reducing variation contained in multiple variables into a smaller number of uncorrelated factors.
+- PCA can be used to identify the number of factors to initially use in factor analysis.
+- The `factanal()` function in R can be used to fit a factor analysis, where the number of factors is specified by the user.
+- Factor analysis can take into account expert knowledge when deciding on the number of factors to use, but a disadvantage is that the output requires careful interpretation.
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+
diff --git a/episodes/06-k-means.Rmd b/episodes/06-k-means.Rmd
new file mode 100644
index 00000000..813753d2
--- /dev/null
+++ b/episodes/06-k-means.Rmd
@@ -0,0 +1,601 @@
+---
+title: K-means
+source: Rmd
+teaching: 45
+exercises: 15
+math: yes
+---
+
+::::::::::::::::::::::::::::::::::::::: objectives
+
+- Understand the importance of clustering in high-dimensional data.
+- Understand and perform K-means clustering in `R`.
+- Assess clustering performance using silhouette scores.
+- Assess cluster robustness using bootstrapping.
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+:::::::::::::::::::::::::::::::::::::::: questions
+
+- How do we detect real clusters in high-dimensional data?
+- How does K-means work and when should it be used?
+- How can we perform K-means in `R`?
+- How can we appraise a clustering and test cluster robustness?
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+```{r settings, include=FALSE}
+library("here")
+source(here("bin/chunk-options.R"))
+```
+
+## Introduction
+
+High-dimensional data, especially in biological settings, have
+many sources of heterogeneity. Some of this heterogeneity is stochastic variation
+arising from measurement error or random differences between organisms.
+In some cases, a known grouping causes this heterogeneity (sex, treatment
+groups, etc). In other cases, it arises from the presence of
+unknown subgroups in the data. **Clustering** is a set of techniques that allows
+us to discover unknown groupings like this, which we can often use to
+discover the nature of the heterogeneity we're investigating.
+
+**Cluster analysis** involves finding groups of observations that are more
+similar to each other (according to some feature) than they are to observations
+in other groups. Cluster analysis is a useful statistical tool for exploring
+high-dimensional datasets, as
+visualising data with large numbers of features is difficult. It is commonly
+used in fields such as bioinformatics, genomics, and image processing in which
+large datasets that include many features are often produced. Once groups
+(or clusters) of observations have been identified using cluster analysis,
+further analyses or interpretation can be carried out on the groups, for
+example, using metadata to further explore groups.
+
+There are various ways to look for clusters of observations in a dataset using
+different *clustering algorithms*. One way of clustering data is to minimise
+the distance between observations within a cluster and maximise the distance between
+proposed clusters. Clusters can be updated in an iterative process so that over
+time we can become more confident in the size and shape of the clusters.
+
+## Believing in clusters
+
+When using clustering, it's important to realise that data may seem to
+group together even when these groups are created randomly.
+It's especially
+important to remember this when making plots that add extra visual aids to
+distinguish clusters.
+For example, if we cluster data from a single 2D normal distribution and draw
+ellipses around the points, these clusters suddenly become almost visually
+convincing. This is a somewhat extreme example, since there is genuinely no
+heterogeneity in the data, but it does reflect what can happen if you allow
+yourself to read too much into faint signals.
+
+Let's explore this further using an example. We create two columns of data
+('x' and 'y') and partition these data into three groups ('a', 'b', 'c')
+according to data values. We then plot these data and their allocated clusters
+and put ellipses around the clusters using the `stat_ellipse()` function
+in **`ggplot2`**.
+
+```{r fake-cluster, echo=FALSE}
+set.seed(11)
+library("MASS")
+library("ggplot2")
+data <- mvrnorm(n = 200, mu = rep(1, 2), Sigma = matrix(runif(4), ncol = 2))
+data <- as.data.frame(data)
+colnames(data) <- c("x", "y")
+
+data$cluster <- ifelse(
+  data$y < (data$x * -0.06 + 0.9),
+  "a",
+  ifelse(
+    data$y < 1.15,
+    "b",
+    "c"
+  )
+)
+ggplot(data, aes(x, y, colour = cluster)) +
+  geom_point() +
+  stat_ellipse()
+```
+
+The randomly created data used here appear to form three clusters when we
+plot the data. Putting ellipses around the clusters can further convince us
+that the clusters are 'real'. But how do we tell if clusters identified
+visually are 'real'?
+
+## What is K-means clustering?
+
+**K-means clustering** is a clustering method which groups data points into a
+user-defined number of distinct, non-overlapping clusters. In K-means clustering,
+we are interested in minimising the *within-cluster variation*. This is the amount that
+data points within a cluster differ from each other. In K-means clustering, the distance
+between data points within a cluster is used as a measure of within-cluster variation.
+Using a well-defined clustering algorithm like K-means gives us a more principled
+way to partition data into groups than relying on visual impressions alone.
+
+To carry out K-means clustering, we first pick $k$ initial points as centres or
+"centroids" of our clusters. There are a few ways to choose these initial "centroids",
+but for simplicity let's imagine we just pick three random co-ordinates.
+We then follow these two steps until convergence:
+
+1. Assign each data point to the cluster with the closest centroid
+2. Update centroid positions as the average of the points in that cluster
+
+We can see this process in action in this animation:
+
+```{r kmeans-animation, echo=FALSE, fig.cap="Cap", fig.alt="Alt"}
+knitr::include_graphics("fig/kmeans.gif")
+```
+
+While K-means has some advantages over other clustering methods (it is easy to
+implement and to understand), it does have some disadvantages, namely sensitivity
+to the choice of initial cluster centroids and the need for the user to specify the
+number of clusters that the data should be partitioned into.
+
+::::::::::::::::::::::::::::::::::::::::: callout
+
+### Initialisation
+
+The algorithm used in K-means clustering finds a *local* rather than a
+*global* optimum, so the results of clustering depend on the initial
+cluster that each observation is randomly assigned to. This initial
+configuration can have a significant effect on the final configuration of the
+clusters, so dealing with this limitation is an important part
+of K-means clustering.
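+
+In practice, a simple safeguard (complementary to the strategies listed next)
+is to run the algorithm from several random starting configurations and keep
+the best result; the `nstart` argument of `kmeans()` does exactly this. A
+minimal sketch on made-up data, not part of the original lesson:
+
+```{r kmeans-nstart-sketch}
+set.seed(1)
+toy <- matrix(rnorm(200), ncol = 2)
+fit1 <- kmeans(toy, centers = 3)               # a single random start
+fit25 <- kmeans(toy, centers = 3, nstart = 25) # best of 25 random starts
+c(fit1$tot.withinss, fit25$tot.withinss)       # lower is better
+```
+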
Some strategies to deal with this problem are: + +- Choose $K$ points at random from the data as the cluster centroids. +- Randomly split the data into $K$ groups, and then average these groups. +- Use the K-means++ algorithm to choose initial values. + +These each have advantages and disadvantages. In general, it's good to be +aware of this limitation of K-means clustering and that this limitation can +be addressed by choosing a good initialisation method, initialising clusters +manually, or running the algorithm from multiple different starting points. + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +## K-means clustering applied to single-cell RNAseq data + +Let's carry out K-means clustering in `R` using some real high-dimensional data. +We're going to work with single-cell RNAseq data in these clustering challenges, +which is often *very* high-dimensional. Commonly, experiments profile the +expression level of 10,000+ genes in thousands of cells. Even after filtering +the data to remove low quality observations, the dataset we're using in this +episode contains measurements for over 9,000 genes in over 3,000 cells. + +One way to get a handle on a dataset of this size is to use something we covered +earlier in the course - dimensionality reduction. Dimensionality reduction +allows us to visualise this incredibly complex data in a small number of +dimensions. In this case, we'll be using principal component analysis (PCA) to +compress the data by identifying the major axes of variation in the data, +before running our clustering algorithms on this lower-dimensional data. + +The `scater` package has some easy-to-use tools to calculate a PCA for +`SummarizedExperiment` objects. +Let's load the `scRNAseq` data and calculate some principal components. + +```{r data} +library("SingleCellExperiment") +library("scater") + +scrnaseq <- readRDS(here::here("data/scrnaseq.rds")) +scrnaseq <- runPCA(scrnaseq, ncomponents = 15) +pcs <- reducedDim(scrnaseq)[, 1:2] +``` + +The first two principal components capture almost 50% of the variation within +the data. For now, we'll work with just these two principal components, since +we can visualise those easily, and they're a quantitative representation of +the underlying data, representing the two largest axes of variation. + +We can now run K-means clustering on the first and second principal components +of the `scRNAseq` data using the `kmeans` function. + +```{r kmeans, fig.cap="Title", fig.alt="Alt"} +set.seed(42) +cluster <- kmeans(pcs, centers = 4) +scrnaseq$kmeans <- as.character(cluster$cluster) +plotReducedDim(scrnaseq, "PCA", colour_by = "kmeans") +``` + +We can see that this produces a sensible-looking partition of the data. +However, is it totally clear whether there might be more or fewer clusters +here? + +::::::::::::::::::::::::::::::::::::::: challenge + +### Challenge 1 + +Cluster the data using a $K$ of 5, and plot it using `plotReducedDim`. +Save this with a variable name that's different to what we just used, +because we'll use this again later. 
+
+::::::::::::::: solution
+
+### Solution
+
+```{r kmeans-ex}
+set.seed(42)
+cluster5 <- kmeans(pcs, centers = 5)
+scrnaseq$kmeans5 <- as.character(cluster5$cluster)
+plotReducedDim(scrnaseq, "PCA", colour_by = "kmeans5")
+```
+
+:::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::::::: callout
+
+### K-medoids (PAM)
+
+One problem with K-means is that using the mean to define cluster centroids
+means that clusters can be very sensitive to outlying observations.
+K-medoids, also known as "partitioning around medoids" (PAM), is similar to
+K-means, but uses the median rather than the mean as the method for defining
+cluster centroids. Using the median rather than the mean reduces the sensitivity of
+clusters to outliers in the data. K-medoids has had popular application in
+genomics, for example the well-known PAM50 gene set in breast cancer, which has seen some
+prognostic applications.
+The following example shows how cluster centroids differ when created using
+medians rather than means.
+
+```{r}
+x <- rnorm(20)
+y <- rnorm(20)
+x[10] <- x[10] + 10
+plot(x, y, pch = 16)
+points(mean(x), mean(y), pch = 16, col = "firebrick")
+points(median(x), median(y), pch = 16, col = "dodgerblue")
+```
+
+PAM can be carried out using `pam()` from the **`cluster`** package.
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+## Cluster separation
+
+When performing clustering, it is important for us to be able to measure
+how well our clusters are separated. One measure to test this is *silhouette width*.
+This is a number that is computed for every observation. It can range from -1 to 1.
+A high silhouette width means an observation is closer to other observations
+within the same cluster. For each cluster, the silhouette widths can then be
+averaged, or an overall average can be taken.
+
+::::::::::::::::::::::::::::::::::::::::: callout
+
+### More detail on silhouette widths
+
+In more detail, each observation's silhouette width is computed as follows:
+
+1. Compute the average distance between the focal observation and all other
+   observations in the same cluster.
+2. For each of the other clusters, compute the average distance between the
+   focal observation and all observations in the other cluster. Keep the
+   smallest of these average distances.
+3. Subtract (1.) from (2.), then divide by whichever of (1.) and (2.) is larger.
+
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+Ideally, we would have only large positive silhouette widths, indicating
+that each data point is much more similar to points within its cluster than it
+is to the points in any other cluster. However, this is rarely the case. Often,
+clusters are very fuzzy, and even if we are relatively sure about the existence
+of discrete groupings in the data, observations on the boundaries can be difficult
+to confidently place in either cluster.
+
+Here we use the `silhouette()` function from the **`cluster`** package to calculate the
+silhouette width of our K-means clustering using a distance matrix of distances
+between points in the clusters.
+
+```{r silhouette}
+library("cluster")
+dist_mat <- dist(pcs)
+sil <- silhouette(cluster$cluster, dist = dist_mat)
+plot(sil, border = NA)
+```
+
+Let's plot the silhouette score on the original dimensions used to cluster
+the data. Here, we're mapping cluster membership to point shape, and silhouette
+width to colour.
+
+```{r plot-silhouette}
+pc <- as.data.frame(pcs)
+colnames(pc) <- c("x", "y")
+pc$sil <- sil[, "sil_width"]
+pc$clust <- factor(cluster$cluster)
+mean(sil[, "sil_width"])
+
+ggplot(pc) +
+  aes(x, y, shape = clust, colour = sil) +
+  geom_point() +
+  scale_colour_gradient2(
+    low = "dodgerblue", high = "firebrick"
+  ) +
+  scale_shape_manual(
+    values = setNames(1:4, 1:4)
+  )
+```
+
+This plot shows that silhouette values for individual observations tend to be
+very high in the centre of clusters, but become quite low towards the edges.
+This makes sense, as points that are "between" two clusters may be more similar
+to points in another cluster than they are to the points in the cluster they
+belong to.
+
+::::::::::::::::::::::::::::::::::::::: challenge
+
+### Challenge 2
+
+Calculate the silhouette width for the K of 5 clustering we did earlier.
+Is it better or worse than before?
+
+Can you identify where the differences lie?
+
+::::::::::::::: solution
+
+### Solution
+
+```{r silhouette-ex}
+sil5 <- silhouette(cluster5$cluster, dist = dist_mat)
+scrnaseq$kmeans5 <- as.character(cluster5$cluster)
+plotReducedDim(scrnaseq, "PCA", colour_by = "kmeans5")
+mean(sil5[, "sil_width"])
+```
+
+The average silhouette width is lower when k=5.
+
+```{r, eval=FALSE}
+plot(sil5, border = NA)
+```
+
+```{r, eval=FALSE, echo=FALSE}
+## this needs to be plotted manually or it gets an unmapped memory crash
+Cairo::Cairo("fig/silhouette5.png")
+plot(sil5, border = NA)
+dev.off()
+```
+
+```{r, echo=FALSE}
+knitr::include_graphics("fig/silhouette5.png")
+```
+
+This appears to be because some observations in clusters 3 and 5 are
+more similar to other clusters than the one they have been assigned to.
+This may indicate that K is too high.
+
+
+
+:::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::::::: callout
+
+### Gap statistic
+
+Another measure of the quality of our clustering is the "gap statistic".
+This compares the observed squared distance between observations in a cluster
+and the centre of the cluster to the "expected" squared distance under a
+null model.
+The expected distances are calculated by randomly distributing points within
+the range of the original data. Larger values represent lower
+squared distances within clusters, and thus better clustering.
+We can see how this is calculated in the following example.
+
+```{r clusgap-call, eval=FALSE}
+library("cluster")
+gaps <- clusGap(pcs, kmeans, K.max = 20, iter.max = 20)
+best_k <- maxSE(gaps$Tab[, "gap"], gaps$Tab[, "SE.sim"])
+best_k
+plot(gaps$Tab[,"gap"], xlab = "Number of clusters", ylab = "Gap statistic")
+abline(v = best_k, col = "red")
+```
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+## Cluster robustness
+
+When we cluster data, we want to be sure that the clusters we identify are
+not a result of the exact properties of the input data. That is, if the
+data we observed were slightly different, the clusters we would identify
+in this different data would be very similar. This makes it more
+likely that our findings can be reproduced.
+
+To assess this, we can use the *bootstrap*. What we do here is to take a sample
+from the data with replacement. Sampling with replacement means that in the
+sample that we take, we can include points from the input data more than once.
+This is perhaps easier to see with an example.
+First, we define some data:
+
+```{r bs-data}
+data <- 1:5
+```
+
+Then, we can take a sample from this data without replacement:
+
+```{r bs-sample}
+sample(data, 5)
+```
+
+Because we sampled all five values without replacement, this sample contains
+every point from the original data exactly once, just in a different order.
+This is the case every time, even if we repeat the sampling many times:
+
+```{r bs-sample-rep}
+## Each column is a sample
+replicate(10, sample(data, 5))
+```
+
+However, if we sample *with replacement*, then sometimes individual data points
+are present more than once.
+
+```{r bs-sample-replace}
+replicate(10, sample(data, 5, replace = TRUE))
+```
+
+::::::::::::::::::::::::::::::::::::::::: callout
+
+### Bootstrapping
+
+The bootstrap is a powerful and common statistical technique.
+
+We would like to know about the sampling distribution of a statistic,
+but we don't have any knowledge of its behaviour under the null hypothesis.
+
+For example, we might want to understand the uncertainty around an estimate
+of the mean of our data. To do this, we could resample the data with
+replacement and calculate the mean of each resample.
+
+```{r boots}
+boots <- replicate(1000, mean(sample(data, 5, replace = TRUE)))
+hist(boots,
+  breaks = "FD",
+  main = "1,000 bootstrap samples",
+  xlab = "Mean of sample"
+)
+```
+
+In this case, the example is simple, but it's possible to
+devise more complex statistical tests using this kind of approach.
+
+The bootstrap, along with permutation testing, can be a very flexible and
+general solution to many statistical problems.
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+In applying the bootstrap to clustering, we want to see two things:
+
+1. Will observations within a cluster consistently cluster together in
+   different bootstrap replicates?
+2. Will observations frequently swap between clusters?
+
+In the plot below, the diagonal shows how often the clusters
+are reproduced in bootstrap replicates. High scores on
+the diagonal mean that the clusters are consistently reproduced in each
+bootstrap replicate. Similarly, the off-diagonal elements represent how often
+observations swap between clusters in bootstrap replicates. High scores
+indicate that observations rarely swap between clusters.
+
+```{r bs-heatmap}
+library("pheatmap")
+library("bluster")
+library("viridis")
+
+km_fun <- function(x) {
+  kmeans(x, centers = 4)$cluster
+}
+ratios <- bootstrapStability(pcs, FUN = km_fun, clusters = cluster$cluster)
+pheatmap(ratios,
+  cluster_rows = FALSE, cluster_cols = FALSE,
+  col = viridis(10),
+  breaks = seq(0, 1, length.out = 10)
+)
+```
+
+Values slightly greater than 1 may sometimes be observed (shown here as yellow
+boxes). These also indicate good stability, even though they fall outside the
+range of the colour bar.
+
+::::::::::::::::::::::::::::::::::::::: challenge
+
+### Challenge 3
+
+Repeat the bootstrapping process with K=5. Are the results better or worse?
+Can you identify where the differences occur on the `plotReducedDim`?
+
+::::::::::::::: solution
+
+### Solution
+
+```{r bs-ex}
+km_fun5 <- function(x) {
+  kmeans(x, centers = 5)$cluster
+}
+set.seed(42)
+ratios5 <- bootstrapStability(pcs, FUN = km_fun5, clusters = cluster5$cluster)
+pheatmap(ratios5,
+  cluster_rows = FALSE, cluster_cols = FALSE,
+  col = viridis(10),
+  breaks = seq(0, 1, length.out = 10)
+)
+```
+
+When K=5, we can see that the values on the diagonal of the matrix are
+smaller, indicating that the clusters aren't exactly reproducible in the
+bootstrap samples.
+
+Similarly, the off-diagonal elements are considerably lower for some
+elements.
+This indicates that observations are "swapping" between these clusters
+in bootstrap replicates.
+
+
+
+:::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::::::: callout
+
+### Consensus clustering
+
+One useful and generic method of clustering is *consensus clustering*.
+This method can use k-means, or other clustering methods.
+
+The idea behind this is to bootstrap the data repeatedly, and cluster
+it each time, perhaps using different numbers of clusters.
+If a pair of data points always end up in the same cluster,
+it's likely that they really belong to the same underlying cluster.
+
+This is really computationally demanding, but has been shown to perform very
+well in some situations. It also allows you to visualise how cluster
+membership changes over different values of K.
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::::::: callout
+
+### Speed
+
+It's worth noting that a lot of the methods we've discussed here are very
+computationally demanding.
+When clustering data, we may have to compare points to each other many times.
+This becomes more and more difficult when we have many observations and
+many features. This is especially problematic when we want to do things like
+bootstrapping, which requires us to cluster the data over and over.
+
+As a result, there are a lot of approximate methods for finding clusters
+in the data. For example, the
+[mbkmeans](https://www.bioconductor.org/packages/3.13/bioc/html/mbkmeans.html)
+package includes an algorithm for clustering extremely large data. The idea
+behind this algorithm
+is that if the clusters we find are robust, we don't need to look at all of
+the data every time. This is very helpful because it reduces the amount of
+data that needs to be held in memory at once, but also because it minimises the
+computational cost.
+
+Similarly, approximate nearest neighbour methods like
+[Annoy](https://pypi.org/project/annoy/) can be used to identify what the
+$K$ closest points are in the data, and this can be used in some clustering
+methods (for example, graph-based clustering).
+
+Generally, these methods sacrifice a bit of accuracy for a big gain in speed.
+
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+## Further reading
+
+- [Wu, J. (2012) Cluster analysis and K-means clustering: An Introduction. In: Advances in K-means Clustering. Springer Berlin, Heidelberg.](https://doi.org/10.1007/978-3-642-29807-3_1).
+- [Modern statistics for modern biology, Susan Holmes and Wolfgang Huber (Chapter 5)](https://web.stanford.edu/class/bios221/book/Chap-Clustering.html).
+- [Understanding K-means clustering in machine learning, Towards Data Science](https://towardsdatascience.com/understanding-k-means-clustering-in-machine-learning-6a6e67336aa1).
+
+
+
+:::::::::::::::::::::::::::::::::::::::: keypoints
+
+- K-means is an intuitive algorithm for clustering data.
+- K-means has various advantages but can be computationally intensive.
+- Apparent clusters in high-dimensional data should always be treated with some scepticism.
+- Silhouette width and bootstrapping can be used to assess how well our clustering algorithm has worked.
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+
diff --git a/episodes/07-hierarchical.Rmd b/episodes/07-hierarchical.Rmd
new file mode 100644
index 00000000..4d8cf7d2
--- /dev/null
+++ b/episodes/07-hierarchical.Rmd
@@ -0,0 +1,741 @@
+---
+title: Hierarchical clustering
+source: Rmd
+teaching: 60
+exercises: 10
+math: yes
+---
+
+::::::::::::::::::::::::::::::::::::::: objectives
+
+- Understand when to use hierarchical clustering on high-dimensional data.
+- Perform hierarchical clustering on high-dimensional data and evaluate dendrograms.
+- Explore different distance matrix and linkage methods.
+- Use the Dunn index to validate clustering methods.
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+:::::::::::::::::::::::::::::::::::::::: questions
+
+- What is hierarchical clustering and how does it differ from other clustering methods?
+- How do we carry out hierarchical clustering in R?
+- What distance matrix and linkage methods should we use?
+- How can we validate identified clusters?
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+```{r settings, include=FALSE}
+library("here")
+source(here("bin/chunk-options.R"))
+```
+
+## Why use hierarchical clustering on high-dimensional data?
+
+When analysing high-dimensional data in the life sciences, it is often useful
+to identify groups of similar data points to understand more about the relationships
+within the dataset. In *hierarchical clustering* an algorithm groups similar
+data points (or observations) into groups (or clusters). This results in a set
+of clusters, where each cluster is distinct, and the data points within each
+cluster have similar characteristics. The clustering algorithm works by iteratively
+grouping data points so that different clusters may exist at different stages
+of the algorithm's progression.
+
+Unlike K-means clustering, *hierarchical clustering* does not require the
+number of clusters $k$ to be specified by the user before the analysis is carried
+out. Hierarchical clustering also provides an attractive *dendrogram*, a
+tree-like diagram showing the degree of similarity between clusters.
+
+The dendrogram is a key feature of hierarchical clustering. This tree-shaped graph allows
+the similarity between data points in a dataset to be visualised and the
+arrangement of clusters produced by the analysis to be illustrated. Dendrograms are created
+using a distance (or dissimilarity) measure that quantifies how different pairs of
+observations are, and a clustering algorithm that fuses groups of similar data points together.
+
+In this episode we will explore hierarchical clustering for identifying
+clusters in high-dimensional data. We will use *agglomerative* hierarchical
+clustering (see box) in this episode.
+
+::::::::::::::::::::::::::::::::::::::::: callout
+
+### Agglomerative and divisive hierarchical clustering
+
+There are two main methods of carrying out hierarchical clustering:
+agglomerative clustering and divisive clustering.
+The former is a 'bottom-up' approach in which clustering begins
+with each data point (or observation)
+regarded as its own separate cluster. Pairs of data points are
+merged as we move up the tree.
+Divisive clustering is a 'top-down' approach in which all data points start
+in a single cluster and an algorithm is used to split groups of data points
+from this main group.
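+
+Divisive clustering is less commonly used, but it is available in R, for
+example via `diana()` (DIvisive ANAlysis) from the **`cluster`** package.
+A minimal sketch on made-up data, not used elsewhere in this episode:
+
+```{r diana-sketch}
+library("cluster")
+set.seed(66)
+x <- matrix(rnorm(40), ncol = 2)
+dv <- diana(dist(x))
+plot(dv, which.plots = 2, main = "Divisive hierarchical clustering")
+```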
+
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+## The agglomerative hierarchical clustering algorithm
+
+To start with, we measure the distance
+(or dissimilarity) between pairs of observations. Initially, and at the bottom
+of the dendrogram, each observation is considered to be in its own individual
+cluster. We start the clustering procedure by fusing the two observations that
+are most similar according to a distance matrix. Next, the next-most similar observations are fused
+so that the total number of clusters is *number of observations* - 2 (see
+panel below). Groups of observations may then be merged into a larger cluster
+(see next panel below, green box). This process continues until all the observations are included
+in a single cluster.
+
+```{r hclustfig1, echo=FALSE, out.width="500px", fig.cap="Figure 1a: Example data showing two clusters of observation pairs"}
+knitr::include_graphics("fig/hierarchical_clustering_1.png")
+```
+
+```{r hclustfig2, echo=FALSE, out.width="500px", fig.cap="Figure 1b: Example data showing fusing of one observation into larger cluster"}
+knitr::include_graphics("fig/hierarchical_clustering_2.png")
+```
+
+## A motivating example
+
+To motivate this lesson, let's first look at an example where hierarchical
+clustering is really useful, and then we can understand how to apply it in more
+detail. To do this, we'll return to the large methylation dataset we worked
+with in the regression lessons. Let's load the data and look at it.
+
+```{r}
+library("minfi")
+library("here")
+library("ComplexHeatmap")
+
+methyl <- readRDS(here("data/methylation.rds"))
+
+# transpose this Bioconductor dataset to show features in columns
+methyl_mat <- t(assay(methyl))
+```
+
+Looking at a heatmap of these data, we may spot some patterns -- many columns
+appear to have similar methylation levels across all rows. However, they are
+all quite jumbled at the moment, so it's hard to tell how many line up exactly.
+
+```{r heatmap-noclust, echo=FALSE}
+
+Heatmap(methyl_mat,
+  name = "Methylation level",
+  cluster_rows = FALSE, cluster_columns = FALSE,
+  show_row_names = FALSE, show_column_names = FALSE,
+  row_title="Individuals", column_title = "Methylation sites"
+)
+```
+
+We can order these data to make the patterns more clear using hierarchical
+clustering. To do this, we can change the arguments we pass to
+`Heatmap()` from the **`ComplexHeatmap`** package. `Heatmap()`
+groups features based on dissimilarity (here, Euclidean distance) and orders
+rows and columns to show clustering of features and observations.
+
+```{r heatmap-clust}
+Heatmap(methyl_mat,
+  name = "Methylation level",
+  cluster_rows = TRUE, cluster_columns = TRUE,
+  row_dend_width = unit(0.2, "npc"),
+  column_dend_height = unit(0.2, "npc"),
+  show_row_names = FALSE, show_column_names = FALSE,
+  row_title="Individuals", column_title = "Methylation sites"
+)
+```
+
+We can see that clustering the features (CpG sites) results in an overall
+gradient of high to low methylation levels from left to right. Maybe more
+interesting is the fact that the rows (corresponding to individuals) are now
+grouped according to methylation patterns. For example, 12 samples seem to have
+lower methylation levels for a small subset of CpG sites in the middle, relative
+to all the other samples. It's not clear without investigating further what the
+cause of this is -- it could be a batch effect, or a known grouping (e.g., old
+vs young samples).
+However, clustering like this can be a useful part of
+exploratory analysis of data to build hypotheses.
+
+Now, let's cover the inner workings of hierarchical clustering in more detail.
+There are two things to consider before carrying out clustering:
+
+- how to define dissimilarity between observations using a distance matrix, and
+- how to define dissimilarity between clusters and when to fuse separate clusters.
+
+## Creating the distance matrix
+
+Agglomerative hierarchical clustering is performed in two steps: calculating
+the distance matrix (containing distances between pairs of observations) and
+iteratively grouping observations into clusters using this matrix.
+
+There are different ways to
+specify a distance matrix for clustering:
+
+- Specify distance as a pre-defined option using the `method` argument in
+  `dist()`. Methods include `euclidean` (default), `maximum` and `manhattan`.
+- Create a self-defined function which calculates distance from a matrix or
+  from two vectors. The function should only contain one argument.
+
+Of the pre-defined methods of calculating the distance matrix, Euclidean is one of
+the most commonly used. This method calculates the shortest straight-line
+distances between pairs of observations.
+
+Another option is to use a correlation matrix as the input matrix to the
+clustering algorithm. The type of distance matrix used in hierarchical
+clustering can have a big effect on the resulting tree. The decision of which
+distance matrix to use before carrying out hierarchical clustering depends on the
+type of data and question to be addressed.
+
+## Linkage methods
+
+The second step in performing hierarchical clustering, after defining the
+distance matrix (or another function defining similarity between data points),
+is determining how to fuse different clusters.
+
+*Linkage* is used to define dissimilarity between groups of observations
+(or clusters) and is used to create the hierarchical structure in the
+dendrogram. Different linkage methods of creating a dendrogram are discussed
+below.
+
+`hclust()` supports various linkage methods (e.g. `complete`,
+`single`, `ward.D`, `ward.D2`, `average`, `median`) and these are also supported
+within the `Heatmap()` function. The method used to perform hierarchical
+clustering in `Heatmap()` can be specified by the arguments
+`clustering_method_rows` and `clustering_method_columns`. Each linkage method
+uses a slightly different algorithm to calculate how clusters are fused together,
+and therefore different clustering decisions are made depending on the linkage
+method used.
+
+Complete linkage (the default in `hclust()`) works by computing all pairwise
+dissimilarities between data points in different clusters. For each pair of clusters,
+it sets their dissimilarity ($d$) to the maximum dissimilarity value observed
+between any of these clusters' constituent points. The two clusters
+with the smallest value of $d$ are then fused.
+
+## Computing a dendrogram
+
+Dendrograms are useful tools for visualising the grouping of points and clusters into bigger clusters.
+We can create and plot dendrograms in R using `hclust()`, which takes
+a distance matrix as input and creates the associated tree using hierarchical
+clustering. Here we create some example data to carry out hierarchical
+clustering.
+
+Let's generate 20 data points in 2D space. Each
+point belongs to one of three classes. Suppose we did not know which class
+data points belonged to and we want to identify these via cluster analysis.
+
+Hierarchical clustering carried out on the data can be used to produce a
+dendrogram showing how the data is partitioned into clusters. But how do we
+interpret this dendrogram? Let's explore this using our example data.
+
+```{r plotexample}
+#First, create some example data with two variables x1 and x2
+set.seed(450)
+example_data <- data.frame(
+  x1 = rnorm(20, 8, 4.5),
+  x2 = rnorm(20, 6, 3.4)
+)
+
+#plot the data and name data points by row numbers
+plot(example_data$x1, example_data$x2, type = "n")
+text(
+  example_data$x1,
+  example_data$x2,
+  labels = rownames(example_data),
+  cex = 0.7
+)
+
+## calculate distance matrix using euclidean distance
+dist_m <- dist(example_data, method = "euclidean")
+```
+
+::::::::::::::::::::::::::::::::::::::: challenge
+
+### Challenge 1
+
+Use `hclust()` to implement hierarchical clustering using the
+distance matrix `dist_m` and
+the `complete` linkage method and plot the results as a dendrogram using
+`plot()`.
+
+::::::::::::::: solution
+
+### Solution:
+
+```{r plotclustex}
+clust <- hclust(dist_m, method = "complete")
+plot(clust)
+```
+
+:::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+This dendrogram shows similarities/differences in distances between data points.
+Each leaf of the dendrogram represents one of the 20 data points. These leaves
+fuse into branches as the height increases. Observations that are similar fuse into
+the same branches. The height at which any two
+data points fuse indicates how different these two points are. Points that fuse
+at the top of the tree are very different from each other compared with two
+points that fuse at the bottom of the tree, which are quite similar. You can
+see this by comparing the position of similar/dissimilar points according to
+the scatterplot with their position on the tree.
+
+## Identifying clusters based on the dendrogram
+
+To identify clusters, we can make a horizontal cut through the dendrogram at a user-defined height.
+The sets of observations beneath this cut can be thought of as distinct clusters. For
+example, a cut at height 10 produces two downstream clusters while a cut at
+height 4 produces six downstream clusters.
+
+We can cut the dendrogram to determine the number of clusters at different heights
+using `cutree()`. This function cuts a dendrogram into several
+groups (or clusters) where the number of desired groups is controlled by the
+user, by defining either `k` (number of groups) or `h` (height at which the tree is
+cut).
+
+```{r cutree}
+## k is a user-defined parameter specifying
+## the desired number of clusters
+cutree(clust, k = 3)
+## h is a user-defined parameter specifying
+## the height at which to cut the tree
+cutree(clust, h = 10)
+## both give the same results
+
+four_cut <- cutree(clust, h = 4)
+
+## we can list the cluster each observation belongs to and count cluster sizes
+## using the mutate and count functions from dplyr
+library(dplyr)
+example_cl <- mutate(example_data, cluster = four_cut)
+count(example_cl, cluster)
+
+#plot cluster each point belongs to on original scatterplot
+library(ggplot2)
+ggplot(example_cl, aes(x = x2, y = x1, color = factor(cluster))) + geom_point()
+```
+
+Note that this cut produces eight clusters in total (two above the cut and
+another six below it).
+
+::::::::::::::::::::::::::::::::::::::: challenge
+
+### Challenge 2
+
+Identify the value of `k` in `cutree()` that gives the same
+output as `h = 5`.
+
+::::::::::::::: solution
+
+### Solution:
+
+```{r h-k-ex-plot}
+plot(clust)
+## create horizontal line at height = 5
+abline(h = 5, lty = 2)
+
+cutree(clust, h = 5)
+cutree(clust, k = 7)
+
+five_cut <- cutree(clust, h = 5)
+
+library(dplyr)
+example_cl <- mutate(example_data, cluster = five_cut)
+count(example_cl, cluster)
+
+library(ggplot2)
+ggplot(example_cl, aes(x=x2, y = x1, color = factor(cluster))) + geom_point()
+```
+
+Seven clusters (`k = 7`) gives similar results to `h = 5`. You can plot a
+horizontal line on the dendrogram at `h = 5` to help identify the
+corresponding value of `k`.
+
+
+
+:::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+## Highlighting dendrogram branches
+
+In addition to visualising cluster identity in scatter plots, it is also possible to
+highlight branches in dendrograms. In this example, we calculate a distance matrix between
+samples in the `methyl_mat` dataset. We then draw boxes around clusters obtained with `cutree()`.
+
+```{r plot-clust-method}
+## create a distance matrix using euclidean method
+distmat <- dist(methyl_mat)
+## hierarchical clustering using complete method
+clust <- hclust(distmat)
+## plot resulting dendrogram
+plot(clust)
+
+## draw border around three clusters
+rect.hclust(clust, k = 3, border = 2:6)
+## draw border around two clusters
+rect.hclust(clust, k = 2, border = 2:6)
+```
+
+We can also colour clusters downstream of a specified cut using `color_branches()`
+from the **`dendextend`** package.
+
+```{r plot-coloured-branches}
+## cut tree at height = 50
+cut <- cutree(clust, h = 50)
+
+library("dendextend")
+avg_dend_obj <- as.dendrogram(clust)
+## colour branches of dendrogram depending on clusters
+plot(color_branches(avg_dend_obj, h = 50))
+```
+
+## The effect of different linkage methods
+
+Now let us look into changing the default behaviour of `hclust()`. Imagine we have two crescent-shaped point clouds as shown below.
+
+```{r crescents}
+# These two functions are to help us make crescents. Don't worry if you do not understand all this code.
+# The important bit is the object "cres", which consists of two columns (x and y coordinates of two crescents).
+is.insideCircle <- function(co, r=0.5, offs=c(0,0)){
+  sqrt((co[,1]+offs[1])^2 + (co[,2]+offs[2])^2) <= r
+}
+make.crescent <- function(n){
+  raw <- cbind(x=runif(n)-0.5, y=runif(n)-0.5)
+  raw[is.insideCircle(raw) & !is.insideCircle(raw, offs=c(0, -0.2)),]
+}
+# make x/y data in the shape of two crescents
+set.seed(123)
+cres1 <- make.crescent(1000) # 1st crescent
+cres2 <- make.crescent(1000) # 2nd crescent
+cres2[,2] <- -cres2[,2] -0.1 # flip 2nd crescent upside-down and shift down
+cres2[,1] <- cres2[,1] + 0.5 # shift second crescent to the right
+
+cres <- rbind(cres1, cres2) # concatenate x/y values
+plot(cres)
+```
+
+We might expect that the crescents are resolved into separate clusters. But if we
+run hierarchical clustering with the default arguments, we get this:
+
+```{r cresClustDefault}
+cresClass <- cutree(hclust(dist(cres)), k=2) # save partition for colouring
+plot(cres, col=cresClass) # colour scatterplot by partition
+```
+
+::::::::::::::::::::::::::::::::::::::: challenge
+
+### Challenge 3
+
+Carry out hierarchical clustering on the `cres` data that we generated above.
+Try out different linkage methods and use `cutree()` to split each resulting
+dendrogram into two clusters. Plot the results, colouring the dots according
+to their inferred cluster identity.
+
+Which method(s) give you the expected clustering outcome?
+
+Hint: Check `?hclust` to see the possible values of the argument `method`
+(the linkage method used).
+
+::::::::::::::: solution
+
+### Solution:
+
+```{r plot-clust-comp}
+## ?hclust
+## methods: "complete", "single", "ward.D", "ward.D2", "average", "mcquitty", "median" or "centroid"
+cresClassSingle <- cutree(hclust(dist(cres), method = "single"), k = 2)
+plot(cres, col = cresClassSingle, main = "single")
+```
+
+```{r plot-clust-wardD}
+cresClassWard.D <- cutree(hclust(dist(cres), method = "ward.D"), k = 2)
+plot(cres, col = cresClassWard.D, main = "ward.D")
+```
+
+```{r plot-clust-wardD2}
+cresClassWard.D2 <- cutree(hclust(dist(cres), method = "ward.D2"), k = 2)
+plot(cres, col = cresClassWard.D2, main = "ward.D2")
+```
+
+```{r plot-clust-average}
+cresClassAverage <- cutree(hclust(dist(cres), method = "average"), k = 2)
+plot(cres, col = cresClassAverage, main = "average")
+```
+
+```{r plot-clust-mcq}
+cresClassMcquitty <- cutree(hclust(dist(cres), method = "mcquitty"), k = 2)
+plot(cres, col = cresClassMcquitty, main = "mcquitty")
+```
+
+```{r plot-clust-median}
+cresClassMedian <- cutree(hclust(dist(cres), method = "median"), k = 2)
+plot(cres, col = cresClassMedian, main = "median")
+```
+
+```{r plot-clust-centroid}
+cresClassCentroid <- cutree(hclust(dist(cres), method = "centroid"), k = 2)
+plot(cres, col = cresClassCentroid, main = "centroid")
+```
+
+The linkage methods `single`, `ward.D`, and `average` resolve each crescent as
+a separate cluster.
+
+:::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
+
+The help page of `hclust()` gives some intuition on linkage methods. It
+describes `complete` (the default) and `single` as opposite ends of a
+spectrum, with all other methods in between. When using complete linkage, the
+distance between two clusters is taken to be the distance between the two
+clusters' most distant points. The opposite is true for single linkage, where
+the minimum distance between any two points, one from each of the two
+clusters, is used. Single linkage is described as a friends-of-friends
+approach: it groups all close-together points into the same cluster (thus
+resolving one cluster per crescent). Complete linkage, on the other hand,
+recognises that some points at the tip of a crescent are much closer to points
+in the other crescent, and so it splits both crescents.
+
+## Using different distance methods
+
+So far, we've been using Euclidean distance to define the dissimilarity
+or distance between observations. However, this isn't always the best
+metric for how dissimilar different observations are.
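+
+As an aside, `dist()` itself already offers several alternatives to the
+Euclidean metric through its `method` argument (for example `"manhattan"` and
+`"maximum"`). A minimal sketch, reusing the `cres` data from the previous
+section:
+
+```{r dist-methods}
+## compare the first few pairwise distances under two built-in metrics
+d_euc <- dist(cres, method = "euclidean")
+d_man <- dist(cres, method = "manhattan")
+as.matrix(d_euc)[1:3, 1:3]
+as.matrix(d_man)[1:3, 1:3]
+```
+
+None of these built-in metrics captures similarity of *pattern* across
+features, however, which is where correlation comes in.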
+
+Let's make an example to demonstrate. Here, we're creating two samples, each
+with ten observations of random noise:
+
+```{r create-cor-example}
+set.seed(20)
+cor_example <- data.frame(
+  sample_a = rnorm(10),
+  sample_b = rnorm(10)
+)
+rownames(cor_example) <- paste(
+  "Feature", 1:nrow(cor_example)
+)
+```
+
+Now, let's create a new sample that has exactly the same pattern across all
+our features as `sample_a`, just offset by 5:
+
+```{r sample-c}
+cor_example$sample_c <- cor_example$sample_a + 5
+```
+
+You can see that this is a lot like the `assay()` of our methylation object
+from earlier, where columns are observations or samples, and rows are features:
+
+```{r head-cor-example}
+head(cor_example)
+```
+
+If we plot a heatmap of this, we can see that `sample_a` and `sample_b` are
+grouped together because they have a small distance to each other, despite
+being quite different in their pattern across the different features.
+In contrast, `sample_a` and `sample_c` are very distant, despite having
+*exactly* the same pattern across the different features.
+
+```{r heatmap-cor-example}
+Heatmap(as.matrix(cor_example))
+```
+
+We can see this more clearly if we draw a line plot:
+
+```{r lineplot-cor-example}
+## create a blank plot (type = "n" means don't draw anything)
+## with an x range to hold the number of features we have.
+## the range of y needs to be enough to show all the values for every feature
+plot(
+  1:nrow(cor_example),
+  rep(range(cor_example), 5),
+  type = "n"
+)
+## draw a red line for sample_a
+lines(cor_example$sample_a, col = "firebrick")
+## draw a blue line for sample_b
+lines(cor_example$sample_b, col = "dodgerblue")
+## draw a green line for sample_c
+lines(cor_example$sample_c, col = "forestgreen")
+```
+
+We can see that `sample_a` and `sample_c` have exactly the same pattern across
+all of the different features. However, due to the overall difference between
+their values, they have a high distance to each other.
+We can see this if we cluster and plot the data ourselves using Euclidean
+distance:
+
+```{r clust-euc-cor-example}
+clust_dist <- hclust(dist(t(cor_example)))
+plot(clust_dist)
+```
+
+In some cases, we might want to ensure that samples with similar patterns,
+whether of gene expression or DNA methylation, have small distances to each
+other. Correlation is a measure of this kind of similarity in pattern.
+However, high correlations indicate similarity, while for a distance measure
+high values indicate dissimilarity. Therefore, if we want to cluster
+observations based on correlation, or the similarity of patterns, we can use
+`1 - cor(x)` as the distance metric. The input to `hclust()` must be a `dist`
+object, so we also need to call `as.dist()` on the result before passing it in.
+
+```{r clust-cor-cor-example}
+cor_as_dist <- as.dist(1 - cor(cor_example))
+clust_cor <- hclust(cor_as_dist)
+plot(clust_cor)
+```
+
+Now, `sample_a` and `sample_c`, which have identical patterns across the
+features, are grouped together, while `sample_b` is seen as distant because it
+has a different pattern, even though its values are closer to those of
+`sample_a`. Using your own distance function is often useful, especially if
+you have missing or unusual data.
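+
+If you find yourself reusing this pattern, it can help to wrap it in a small
+function. This is a sketch of our own making (the name `cor_dist()` is
+hypothetical, not part of any package):
+
+```{r cor-dist-helper}
+## helper: turn column-wise correlations into a dist object
+## (the columns of x are the observations to be clustered)
+cor_dist <- function(x) {
+  as.dist(1 - cor(x))
+}
+## reproduces the correlation-based clustering above
+plot(hclust(cor_dist(cor_example)))
+```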
+
+It's often possible to pass correlation-based and other custom distance
+functions to functions that perform hierarchical clustering, such as
+`pheatmap()` and `stats::heatmap()`:
+
+```{r heatmap-cor-cor-example}
+## pheatmap allows you to select correlation directly
+pheatmap(as.matrix(cor_example), clustering_distance_cols = "correlation")
+## the distfun for the built-in stats::heatmap must return distances
+## between the rows of its input matrix, hence the cor(t(x))
+heatmap(
+  as.matrix(cor_example),
+  distfun = function(x) as.dist(1 - cor(t(x)))
+)
+```
+
+## Validating clusters
+
+Now that we know how to carry out hierarchical clustering, how do we know how
+many clusters are optimal for the dataset?
+
+Hierarchical clustering carried out on any dataset will produce clusters,
+even when there are no 'real' clusters in the data! We need to be able to
+determine whether identified clusters represent true groups in the data, or
+whether clusters have been identified just due to chance. In the last episode,
+we introduced silhouette scores as a measure of cluster compactness and
+bootstrapping as a way to assess cluster robustness. Such tests can be used to
+compare different clustering algorithms, for example, those fitted using
+different linkage methods.
+
+Here, we introduce the Dunn index, which is a measure of cluster compactness.
+The Dunn index is the ratio of the smallest distance between observations in
+different clusters to the largest intra-cluster distance found within any
+cluster. This can be seen as a family of indices, which differ depending on
+the method used to compute these distances. The Dunn index penalises
+clusterings that have larger intra-cluster variance and smaller inter-cluster
+variance. The higher the Dunn index, the better defined the clusters.
+
+Let's calculate the Dunn index for the clustering carried out on the
+`methyl_mat` dataset using the **`clValid`** package.
+
+```{r plot-clust-dunn}
+## calculate the Dunn index
+## (ratio of the smallest distance between observations not in the same
+## cluster to the largest intra-cluster distance)
+library("clValid")
+## calculate the Euclidean distance between points
+distmat <- dist(methyl_mat)
+clust <- hclust(distmat, method = "complete")
+plot(clust)
+
+cut <- cutree(clust, h = 50)
+
+## retrieve Dunn's index for the given distance matrix and clusters
+dunn(distance = distmat, cut)
+```
+
+The value of the Dunn index has no meaning in itself, but it is used to
+compare between sets of clusters, with larger values being preferred.
+
+::::::::::::::::::::::::::::::::::::::: challenge
+
+### Challenge 4
+
+Examine how changing the `h` or `k` arguments in `cutree()`
+affects the value of the Dunn index.
+
+::::::::::::::: solution
+
+### Solution:
+
+```{r dunn-ex}
+library("clValid")
+
+distmat <- dist(methyl_mat)
+clust <- hclust(distmat, method = "complete")
+plot(clust)
+
+## Varying h
+## Obtaining the clusters
+cut_h_20 <- cutree(clust, h = 20)
+cut_h_30 <- cutree(clust, h = 30)
+
+## How many clusters?
+length(table(cut_h_20))
+length(table(cut_h_30))
+
+dunn(distance = distmat, cut_h_20)
+dunn(distance = distmat, cut_h_30)
+
+## Varying k
+## Obtaining the clusters
+cut_k_10 <- cutree(clust, k = 10)
+cut_k_5 <- cutree(clust, k = 5)
+
+## How many clusters?
+length(table(cut_k_5))
+length(table(cut_k_10))
+
+dunn(distance = distmat, cut_k_5)
+dunn(distance = distmat, cut_k_10)
+```
+
+:::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::::::::::::::::
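+
+To make the definition concrete, here is a hand-rolled version of the index.
+This is a sketch of our own (the helper name `dunn_manual()` is hypothetical),
+and it assumes the variant that, to our knowledge, `clValid`'s `dunn()` uses
+by default: the minimum between-cluster distance divided by the maximum
+within-cluster distance.
+
+```{r dunn-manual}
+## Dunn index from first principles:
+## smallest distance between observations in different clusters,
+## divided by the largest distance within any single cluster
+dunn_manual <- function(distmat, clusters) {
+  d <- as.matrix(distmat)
+  same <- outer(clusters, clusters, "==") # TRUE if a pair shares a cluster
+  diag(d) <- NA                           # ignore self-distances
+  min(d[!same]) / max(d[same], na.rm = TRUE)
+}
+dunn_manual(distmat, cut) # compare with dunn(distance = distmat, cut) above
+```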
+
+The figures below show in a more systematic way how changing the values of
+`k` and `h` in `cutree()` affects the Dunn index.
+
+```{r hclust-fig3, echo=TRUE, fig.cap="Figure 3: Dunn index"}
+h_seq <- 70:10
+h_dunn <- sapply(h_seq, function(x) dunn(distance = distmat, cutree(clust, h = x)))
+k_seq <- seq(2, 10)
+k_dunn <- sapply(k_seq, function(x) dunn(distance = distmat, cutree(clust, k = x)))
+plot(h_seq, h_dunn, xlab = "Height (h)", ylab = "Dunn index")
+grid()
+```
+
+You can see that at low values of `h`, the Dunn index can be high. But this is
+not very useful: cutting the tree at a low value of `h`, such as 15, leads to
+almost all observations ending up in their own clusters. More relevant is the
+second maximum in the plot, around `h = 55`. Looking at the dendrogram, this
+corresponds to `k = 4`.
+
+```{r hclust-fig4, echo=TRUE, fig.cap="Figure 4: Dunn index continued"}
+plot(k_seq, k_dunn, xlab = "Number of clusters (k)", ylab = "Dunn index")
+grid()
+```
+
+For the range of `k` values explored, we obtain the highest Dunn index with
+`k = 4`. This is in agreement with the previous plot.
+
+There have been criticisms of the use of the Dunn index for validating
+clustering results, due to its high sensitivity to noise in the dataset.
+An alternative is to use silhouette scores (see the k-means clustering
+episode).
+
+As we said before (see the previous episode), clustering is a non-trivial
+task. It is important to think about the nature of your data and your
+expectations rather than blindly applying an algorithm for clustering or
+cluster validation.
+
+## Further reading
+
+- Dunn, J. C. (1974) Well-separated clusters and optimal fuzzy partitions. Journal of Cybernetics 4(1):95–104.
+- Halkidi, M., Batistakis, Y. \& Vazirgiannis, M. (2001) On clustering validation techniques. Journal of Intelligent Information Systems 17(2/3):107-145.
+- James, G., Witten, D., Hastie, T. \& Tibshirani, R. (2013) An Introduction to Statistical Learning with Applications in R.
+  Section 10.3.2 (Hierarchical Clustering).
+- [Understanding the concept of Hierarchical clustering Technique (Towards Data Science blog)](https://towardsdatascience.com/understanding-the-concept-of-hierarchical-clustering-technique-c6e8243758ec).
+
+
+
+:::::::::::::::::::::::::::::::::::::::: keypoints
+
+- Hierarchical clustering uses an algorithm to group similar data points into clusters. A dendrogram is used to plot relationships between clusters (using the `hclust()` function in R).
+- Hierarchical clustering differs from k-means clustering in that it does not require the user to specify the expected number of clusters in advance.
+- The distance (dissimilarity) matrix can be calculated in various ways, and different clustering algorithms (linkage methods) can affect the resulting dendrogram.
+- The Dunn index can be used to validate clusters using the original dataset.
+ +:::::::::::::::::::::::::::::::::::::::::::::::::: + + diff --git a/_extras/.gitkeep b/episodes/data/.gitkeep similarity index 100% rename from _extras/.gitkeep rename to episodes/data/.gitkeep diff --git a/data/cancer_expression.R b/episodes/data/cancer_expression.R similarity index 96% rename from data/cancer_expression.R rename to episodes/data/cancer_expression.R index 46eff657..657328b3 100644 --- a/data/cancer_expression.R +++ b/episodes/data/cancer_expression.R @@ -1,95 +1,95 @@ -pkgs <- c("PCAtools", "Biobase", "GEOquery") -BiocManager::install(pkgs, upgrade = FALSE, ask = FALSE) -for (pkg in pkgs) { - suppressPackageStartupMessages(library(pkg, character.only = TRUE)) -} - - -Sys.setenv(VROOM_CONNECTION_SIZE = 500072) - -# load series and platform data from GEO -gset <- getGEO("GSE2990", GSEMatrix = TRUE, getGPL = FALSE) #eSet object -mat <- exprs(gset[[1]]) #access expression and error measurements of assay data - -# remove Affymetrix control probes from the data -mat <- mat[-grep("^AFFX", rownames(mat)), ] # see vignette for why this is done -#nrow = 22215 -#ncol = 189 - -# extract information of interest from the phenotype data (pdata) -idx <- which(colnames(pData(gset[[1]])) %in% - c( - "relation", "age:ch1", "distant rfs:ch1", "er:ch1", - "ggi:ch1", "grade:ch1", "size:ch1", - "time rfs:ch1" - ) -) -#keep these as this follows what researchers did in example -metadata <- data.frame( - pData(gset[[1]])[, idx], row.names = rownames(pData(gset[[1]])) -) -# nrow: 189 -# ncol: 8 - -#redefine column names -colnames(metadata) <- c( - "Study", "Age", "Distant.RFS", "ER", "GGI", "Grade","Size", "Time.RFS" -) - -#Prepare certain phenotypes of interest -#Follows example online -metadata$Study <- gsub("Reanalyzed by: ", "", as.character(metadata$Study)) -metadata$Age <- as.numeric(gsub("^KJ", NA, as.character(metadata$Age))) -metadata$Distant.RFS <- factor( - metadata$Distant.RFS, - levels = c(0, 1) -) -metadata$ER <- factor( - gsub("\\?", NA, as.character(metadata$ER)), - levels = c(0,1) -) -metadata$ER <- factor( - ifelse(metadata$ER == 1, "ER+", "ER-"), - levels = c("ER-", "ER+") -) -metadata$GGI <- as.numeric(as.character(metadata$GGI)) -metadata$Grade <- factor(gsub("\\?", NA, as.character(metadata$Grade)), - levels = c(1,2,3)) -metadata$Grade <- gsub( - 1, - "Grade 1", - gsub( - 2, - "Grade 2", - gsub(3, "Grade 3", metadata$Grade) - ) -) -metadata$Grade <- factor( - metadata$Grade, - levels = c("Grade 1", "Grade 2", "Grade 3") -) -metadata$Size <- as.numeric(as.character(metadata$Size)) -metadata$Time.RFS <- as.numeric(gsub("^KJX|^KJ", NA, metadata$Time.RFS)) - -# remove samples from the pdata that have any NA value -discard <- apply(metadata, 1, function(x) any(is.na(x))) -metadata <- metadata[!discard,] -# nrow: 91 -# ncol: 8 - -# filter the expression data to match the samples in pdata -mat <- mat[,which(colnames(mat) %in% rownames(metadata))] -# nrow: 22215 -# ncol: 91 - -# check that sample names match exactly between pdata and expression data -all(colnames(mat) == rownames(metadata)) - -#colnames in may are equal to rownames in metadata, so that"s good! 
- -# save datasets mat and metadata -se <- SummarizedExperiment::SummarizedExperiment( - assays = list(expression = mat), - colData = metadata -) -saveRDS(se, file = here::here("data/cancer_expression.rds")) +pkgs <- c("PCAtools", "Biobase", "GEOquery") +BiocManager::install(pkgs, upgrade = FALSE, ask = FALSE) +for (pkg in pkgs) { + suppressPackageStartupMessages(library(pkg, character.only = TRUE)) +} + + +Sys.setenv(VROOM_CONNECTION_SIZE = 500072) + +# load series and platform data from GEO +gset <- getGEO("GSE2990", GSEMatrix = TRUE, getGPL = FALSE) #eSet object +mat <- exprs(gset[[1]]) #access expression and error measurements of assay data + +# remove Affymetrix control probes from the data +mat <- mat[-grep("^AFFX", rownames(mat)), ] # see vignette for why this is done +#nrow = 22215 +#ncol = 189 + +# extract information of interest from the phenotype data (pdata) +idx <- which(colnames(pData(gset[[1]])) %in% + c( + "relation", "age:ch1", "distant rfs:ch1", "er:ch1", + "ggi:ch1", "grade:ch1", "size:ch1", + "time rfs:ch1" + ) +) +#keep these as this follows what researchers did in example +metadata <- data.frame( + pData(gset[[1]])[, idx], row.names = rownames(pData(gset[[1]])) +) +# nrow: 189 +# ncol: 8 + +#redefine column names +colnames(metadata) <- c( + "Study", "Age", "Distant.RFS", "ER", "GGI", "Grade","Size", "Time.RFS" +) + +#Prepare certain phenotypes of interest +#Follows example online +metadata$Study <- gsub("Reanalyzed by: ", "", as.character(metadata$Study)) +metadata$Age <- as.numeric(gsub("^KJ", NA, as.character(metadata$Age))) +metadata$Distant.RFS <- factor( + metadata$Distant.RFS, + levels = c(0, 1) +) +metadata$ER <- factor( + gsub("\\?", NA, as.character(metadata$ER)), + levels = c(0,1) +) +metadata$ER <- factor( + ifelse(metadata$ER == 1, "ER+", "ER-"), + levels = c("ER-", "ER+") +) +metadata$GGI <- as.numeric(as.character(metadata$GGI)) +metadata$Grade <- factor(gsub("\\?", NA, as.character(metadata$Grade)), + levels = c(1,2,3)) +metadata$Grade <- gsub( + 1, + "Grade 1", + gsub( + 2, + "Grade 2", + gsub(3, "Grade 3", metadata$Grade) + ) +) +metadata$Grade <- factor( + metadata$Grade, + levels = c("Grade 1", "Grade 2", "Grade 3") +) +metadata$Size <- as.numeric(as.character(metadata$Size)) +metadata$Time.RFS <- as.numeric(gsub("^KJX|^KJ", NA, metadata$Time.RFS)) + +# remove samples from the pdata that have any NA value +discard <- apply(metadata, 1, function(x) any(is.na(x))) +metadata <- metadata[!discard,] +# nrow: 91 +# ncol: 8 + +# filter the expression data to match the samples in pdata +mat <- mat[,which(colnames(mat) %in% rownames(metadata))] +# nrow: 22215 +# ncol: 91 + +# check that sample names match exactly between pdata and expression data +all(colnames(mat) == rownames(metadata)) + +#colnames in may are equal to rownames in metadata, so that"s good! 
+ +# save datasets mat and metadata +se <- SummarizedExperiment::SummarizedExperiment( + assays = list(expression = mat), + colData = metadata +) +saveRDS(se, file = here::here("data/cancer_expression.rds")) diff --git a/data/cancer_expression.rds b/episodes/data/cancer_expression.rds similarity index 100% rename from data/cancer_expression.rds rename to episodes/data/cancer_expression.rds diff --git a/data/coefHorvath-raw.txt b/episodes/data/coefHorvath-raw.txt similarity index 100% rename from data/coefHorvath-raw.txt rename to episodes/data/coefHorvath-raw.txt diff --git a/data/coefHorvath.R b/episodes/data/coefHorvath.R similarity index 100% rename from data/coefHorvath.R rename to episodes/data/coefHorvath.R diff --git a/data/coefHorvath.rds b/episodes/data/coefHorvath.rds similarity index 100% rename from data/coefHorvath.rds rename to episodes/data/coefHorvath.rds diff --git a/data/methylation.R b/episodes/data/methylation.R similarity index 100% rename from data/methylation.R rename to episodes/data/methylation.R diff --git a/data/methylation.rds b/episodes/data/methylation.rds similarity index 100% rename from data/methylation.rds rename to episodes/data/methylation.rds diff --git a/data/prostate.R b/episodes/data/prostate.R similarity index 100% rename from data/prostate.R rename to episodes/data/prostate.R diff --git a/data/prostate.csv b/episodes/data/prostate.csv similarity index 100% rename from data/prostate.csv rename to episodes/data/prostate.csv diff --git a/data/prostate.rds b/episodes/data/prostate.rds similarity index 100% rename from data/prostate.rds rename to episodes/data/prostate.rds diff --git a/data/scrnaseq.R b/episodes/data/scrnaseq.R similarity index 100% rename from data/scrnaseq.R rename to episodes/data/scrnaseq.R diff --git a/data/scrnaseq.rds b/episodes/data/scrnaseq.rds similarity index 100% rename from data/scrnaseq.rds rename to episodes/data/scrnaseq.rds diff --git a/data/.gitkeep b/episodes/fig/.gitkeep similarity index 100% rename from data/.gitkeep rename to episodes/fig/.gitkeep diff --git a/fig/bio_index_vs_percentage_fallow.png b/episodes/fig/bio_index_vs_percentage_fallow.png similarity index 100% rename from fig/bio_index_vs_percentage_fallow.png rename to episodes/fig/bio_index_vs_percentage_fallow.png diff --git a/fig/bs_fs.png b/episodes/fig/bs_fs.png similarity index 100% rename from fig/bs_fs.png rename to episodes/fig/bs_fs.png diff --git a/fig/bs_fs_lasso.png b/episodes/fig/bs_fs_lasso.png similarity index 100% rename from fig/bs_fs_lasso.png rename to episodes/fig/bs_fs_lasso.png diff --git a/fig/cross_validation.png b/episodes/fig/cross_validation.png similarity index 100% rename from fig/cross_validation.png rename to episodes/fig/cross_validation.png diff --git a/fig/cross_validation.svg b/episodes/fig/cross_validation.svg similarity index 100% rename from fig/cross_validation.svg rename to episodes/fig/cross_validation.svg diff --git a/fig/hierarchical_clustering_1.png b/episodes/fig/hierarchical_clustering_1.png similarity index 100% rename from fig/hierarchical_clustering_1.png rename to episodes/fig/hierarchical_clustering_1.png diff --git a/fig/hierarchical_clustering_2.png b/episodes/fig/hierarchical_clustering_2.png similarity index 100% rename from fig/hierarchical_clustering_2.png rename to episodes/fig/hierarchical_clustering_2.png diff --git a/fig/hierarchical_clustering_3.png b/episodes/fig/hierarchical_clustering_3.png similarity index 100% rename from fig/hierarchical_clustering_3.png rename to 
episodes/fig/hierarchical_clustering_3.png diff --git a/fig/intro-scatterplot.png b/episodes/fig/intro-scatterplot.png similarity index 100% rename from fig/intro-scatterplot.png rename to episodes/fig/intro-scatterplot.png diff --git a/fig/intro-table.png b/episodes/fig/intro-table.png similarity index 100% rename from fig/intro-table.png rename to episodes/fig/intro-table.png diff --git a/fig/kmeans.R b/episodes/fig/kmeans.R similarity index 100% rename from fig/kmeans.R rename to episodes/fig/kmeans.R diff --git a/fig/kmeans.gif b/episodes/fig/kmeans.gif similarity index 100% rename from fig/kmeans.gif rename to episodes/fig/kmeans.gif diff --git a/fig/microbiome_schematic.png b/episodes/fig/microbiome_schematic.png similarity index 100% rename from fig/microbiome_schematic.png rename to episodes/fig/microbiome_schematic.png diff --git a/fig/one_dimension_species_vs_site.png b/episodes/fig/one_dimension_species_vs_site.png similarity index 100% rename from fig/one_dimension_species_vs_site.png rename to episodes/fig/one_dimension_species_vs_site.png diff --git a/fig/pca-animation.R b/episodes/fig/pca-animation.R similarity index 100% rename from fig/pca-animation.R rename to episodes/fig/pca-animation.R diff --git a/fig/pendulum.gif b/episodes/fig/pendulum.gif similarity index 100% rename from fig/pendulum.gif rename to episodes/fig/pendulum.gif diff --git a/fig/rmd-01-pairs-prostate-1.png b/episodes/fig/rmd-01-pairs-prostate-1.png similarity index 100% rename from fig/rmd-01-pairs-prostate-1.png rename to episodes/fig/rmd-01-pairs-prostate-1.png diff --git a/fig/rmd-01-plot-lm-1.png b/episodes/fig/rmd-01-plot-lm-1.png similarity index 100% rename from fig/rmd-01-plot-lm-1.png rename to episodes/fig/rmd-01-plot-lm-1.png diff --git a/fig/rmd-01-plot-lm-2.png b/episodes/fig/rmd-01-plot-lm-2.png similarity index 100% rename from fig/rmd-01-plot-lm-2.png rename to episodes/fig/rmd-01-plot-lm-2.png diff --git a/fig/rmd-01-plot-lm-3.png b/episodes/fig/rmd-01-plot-lm-3.png similarity index 100% rename from fig/rmd-01-plot-lm-3.png rename to episodes/fig/rmd-01-plot-lm-3.png diff --git a/fig/rmd-01-plot-lm-4.png b/episodes/fig/rmd-01-plot-lm-4.png similarity index 100% rename from fig/rmd-01-plot-lm-4.png rename to episodes/fig/rmd-01-plot-lm-4.png diff --git a/fig/rmd-01-plot-random-1.png b/episodes/fig/rmd-01-plot-random-1.png similarity index 100% rename from fig/rmd-01-plot-random-1.png rename to episodes/fig/rmd-01-plot-random-1.png diff --git a/fig/rmd-01-plot-random-2.png b/episodes/fig/rmd-01-plot-random-2.png similarity index 100% rename from fig/rmd-01-plot-random-2.png rename to episodes/fig/rmd-01-plot-random-2.png diff --git a/fig/rmd-01-plot-random-3.png b/episodes/fig/rmd-01-plot-random-3.png similarity index 100% rename from fig/rmd-01-plot-random-3.png rename to episodes/fig/rmd-01-plot-random-3.png diff --git a/fig/rmd-02-example1-1.png b/episodes/fig/rmd-02-example1-1.png similarity index 100% rename from fig/rmd-02-example1-1.png rename to episodes/fig/rmd-02-example1-1.png diff --git a/fig/rmd-02-example2-1.png b/episodes/fig/rmd-02-example2-1.png similarity index 100% rename from fig/rmd-02-example2-1.png rename to episodes/fig/rmd-02-example2-1.png diff --git a/fig/rmd-02-example3-1.png b/episodes/fig/rmd-02-example3-1.png similarity index 100% rename from fig/rmd-02-example3-1.png rename to episodes/fig/rmd-02-example3-1.png diff --git a/fig/rmd-02-heatmap-1.png b/episodes/fig/rmd-02-heatmap-1.png similarity index 100% rename from fig/rmd-02-heatmap-1.png rename to 
episodes/fig/rmd-02-heatmap-1.png diff --git a/fig/rmd-02-histx-1.png b/episodes/fig/rmd-02-histx-1.png similarity index 100% rename from fig/rmd-02-histx-1.png rename to episodes/fig/rmd-02-histx-1.png diff --git a/fig/rmd-02-limmavolc1-1.png b/episodes/fig/rmd-02-limmavolc1-1.png similarity index 100% rename from fig/rmd-02-limmavolc1-1.png rename to episodes/fig/rmd-02-limmavolc1-1.png diff --git a/fig/rmd-02-limmavolc2-1.png b/episodes/fig/rmd-02-limmavolc2-1.png similarity index 100% rename from fig/rmd-02-limmavolc2-1.png rename to episodes/fig/rmd-02-limmavolc2-1.png diff --git a/fig/rmd-02-p-fdr-1.png b/episodes/fig/rmd-02-p-fdr-1.png similarity index 100% rename from fig/rmd-02-p-fdr-1.png rename to episodes/fig/rmd-02-p-fdr-1.png diff --git a/fig/rmd-02-p-fwer-1.png b/episodes/fig/rmd-02-p-fwer-1.png similarity index 100% rename from fig/rmd-02-p-fwer-1.png rename to episodes/fig/rmd-02-p-fwer-1.png diff --git a/fig/rmd-02-plot-fdr-fwer-1.png b/episodes/fig/rmd-02-plot-fdr-fwer-1.png similarity index 100% rename from fig/rmd-02-plot-fdr-fwer-1.png rename to episodes/fig/rmd-02-plot-fdr-fwer-1.png diff --git a/fig/rmd-02-plot-limma-lm-effect-1.png b/episodes/fig/rmd-02-plot-limma-lm-effect-1.png similarity index 100% rename from fig/rmd-02-plot-limma-lm-effect-1.png rename to episodes/fig/rmd-02-plot-limma-lm-effect-1.png diff --git a/fig/rmd-02-plot-limma-lm-pval-1.png b/episodes/fig/rmd-02-plot-limma-lm-pval-1.png similarity index 100% rename from fig/rmd-02-plot-limma-lm-pval-1.png rename to episodes/fig/rmd-02-plot-limma-lm-pval-1.png diff --git a/fig/rmd-02-plot-lm-methyl1-1.png b/episodes/fig/rmd-02-plot-lm-methyl1-1.png similarity index 100% rename from fig/rmd-02-plot-lm-methyl1-1.png rename to episodes/fig/rmd-02-plot-lm-methyl1-1.png diff --git a/fig/rmd-02-screening-cor-1.png b/episodes/fig/rmd-02-screening-cor-1.png similarity index 100% rename from fig/rmd-02-screening-cor-1.png rename to episodes/fig/rmd-02-screening-cor-1.png diff --git a/fig/rmd-02-screening-var-1.png b/episodes/fig/rmd-02-screening-var-1.png similarity index 100% rename from fig/rmd-02-screening-var-1.png rename to episodes/fig/rmd-02-screening-var-1.png diff --git a/fig/rmd-02-tdist-1.png b/episodes/fig/rmd-02-tdist-1.png similarity index 100% rename from fig/rmd-02-tdist-1.png rename to episodes/fig/rmd-02-tdist-1.png diff --git a/fig/rmd-02-volcplotfake-1.png b/episodes/fig/rmd-02-volcplotfake-1.png similarity index 100% rename from fig/rmd-02-volcplotfake-1.png rename to episodes/fig/rmd-02-volcplotfake-1.png diff --git a/fig/rmd-03-binomial-1.png b/episodes/fig/rmd-03-binomial-1.png similarity index 100% rename from fig/rmd-03-binomial-1.png rename to episodes/fig/rmd-03-binomial-1.png diff --git a/fig/rmd-03-chooselambda-1.png b/episodes/fig/rmd-03-chooselambda-1.png similarity index 100% rename from fig/rmd-03-chooselambda-1.png rename to episodes/fig/rmd-03-chooselambda-1.png diff --git a/fig/rmd-03-coef-ridge-lm-1.png b/episodes/fig/rmd-03-coef-ridge-lm-1.png similarity index 100% rename from fig/rmd-03-coef-ridge-lm-1.png rename to episodes/fig/rmd-03-coef-ridge-lm-1.png diff --git a/fig/rmd-03-corr-mat-meth-1.png b/episodes/fig/rmd-03-corr-mat-meth-1.png similarity index 100% rename from fig/rmd-03-corr-mat-meth-1.png rename to episodes/fig/rmd-03-corr-mat-meth-1.png diff --git a/fig/rmd-03-corr-mat-prostate-1.png b/episodes/fig/rmd-03-corr-mat-prostate-1.png similarity index 100% rename from fig/rmd-03-corr-mat-prostate-1.png rename to episodes/fig/rmd-03-corr-mat-prostate-1.png diff 
--git a/fig/rmd-03-elastic-1.png b/episodes/fig/rmd-03-elastic-1.png similarity index 100% rename from fig/rmd-03-elastic-1.png rename to episodes/fig/rmd-03-elastic-1.png diff --git a/fig/rmd-03-elastic-contour-1.png b/episodes/fig/rmd-03-elastic-contour-1.png similarity index 100% rename from fig/rmd-03-elastic-contour-1.png rename to episodes/fig/rmd-03-elastic-contour-1.png diff --git a/fig/rmd-03-elastic-cv-1.png b/episodes/fig/rmd-03-elastic-cv-1.png similarity index 100% rename from fig/rmd-03-elastic-cv-1.png rename to episodes/fig/rmd-03-elastic-cv-1.png diff --git a/fig/rmd-03-elastic-plot-1.png b/episodes/fig/rmd-03-elastic-plot-1.png similarity index 100% rename from fig/rmd-03-elastic-plot-1.png rename to episodes/fig/rmd-03-elastic-plot-1.png diff --git a/fig/rmd-03-heatmap-lasso-1.png b/episodes/fig/rmd-03-heatmap-lasso-1.png similarity index 100% rename from fig/rmd-03-heatmap-lasso-1.png rename to episodes/fig/rmd-03-heatmap-lasso-1.png diff --git a/fig/rmd-03-lasso-cv-1.png b/episodes/fig/rmd-03-lasso-cv-1.png similarity index 100% rename from fig/rmd-03-lasso-cv-1.png rename to episodes/fig/rmd-03-lasso-cv-1.png diff --git a/fig/rmd-03-plot-ridge-1.png b/episodes/fig/rmd-03-plot-ridge-1.png similarity index 100% rename from fig/rmd-03-plot-ridge-1.png rename to episodes/fig/rmd-03-plot-ridge-1.png diff --git a/fig/rmd-03-plot-ridge-prediction-1.png b/episodes/fig/rmd-03-plot-ridge-prediction-1.png similarity index 100% rename from fig/rmd-03-plot-ridge-prediction-1.png rename to episodes/fig/rmd-03-plot-ridge-prediction-1.png diff --git a/fig/rmd-03-plotlas-1.png b/episodes/fig/rmd-03-plotlas-1.png similarity index 100% rename from fig/rmd-03-plotlas-1.png rename to episodes/fig/rmd-03-plotlas-1.png diff --git a/fig/rmd-03-regplot-1.png b/episodes/fig/rmd-03-regplot-1.png similarity index 100% rename from fig/rmd-03-regplot-1.png rename to episodes/fig/rmd-03-regplot-1.png diff --git a/fig/rmd-03-ridgeplot-1.png b/episodes/fig/rmd-03-ridgeplot-1.png similarity index 100% rename from fig/rmd-03-ridgeplot-1.png rename to episodes/fig/rmd-03-ridgeplot-1.png diff --git a/fig/rmd-03-shrink-lasso-1.png b/episodes/fig/rmd-03-shrink-lasso-1.png similarity index 100% rename from fig/rmd-03-shrink-lasso-1.png rename to episodes/fig/rmd-03-shrink-lasso-1.png diff --git a/fig/rmd-03-test-plot-lm-1.png b/episodes/fig/rmd-03-test-plot-lm-1.png similarity index 100% rename from fig/rmd-03-test-plot-lm-1.png rename to episodes/fig/rmd-03-test-plot-lm-1.png diff --git a/fig/rmd-04-fit-scale-1.png b/episodes/fig/rmd-04-fit-scale-1.png similarity index 100% rename from fig/rmd-04-fit-scale-1.png rename to episodes/fig/rmd-04-fit-scale-1.png diff --git a/fig/rmd-04-likelihood-1.png b/episodes/fig/rmd-04-likelihood-1.png similarity index 100% rename from fig/rmd-04-likelihood-1.png rename to episodes/fig/rmd-04-likelihood-1.png diff --git a/fig/rmd-04-residuals-1.png b/episodes/fig/rmd-04-residuals-1.png similarity index 100% rename from fig/rmd-04-residuals-1.png rename to episodes/fig/rmd-04-residuals-1.png diff --git a/fig/rmd-05-biplot-ex-1.png b/episodes/fig/rmd-05-biplot-ex-1.png similarity index 100% rename from fig/rmd-05-biplot-ex-1.png rename to episodes/fig/rmd-05-biplot-ex-1.png diff --git a/fig/rmd-05-pairsplot-1.png b/episodes/fig/rmd-05-pairsplot-1.png similarity index 100% rename from fig/rmd-05-pairsplot-1.png rename to episodes/fig/rmd-05-pairsplot-1.png diff --git a/fig/rmd-05-pca-biplot-1.png b/episodes/fig/rmd-05-pca-biplot-1.png similarity index 100% rename from 
fig/rmd-05-pca-biplot-1.png rename to episodes/fig/rmd-05-pca-biplot-1.png diff --git a/fig/rmd-05-pca-biplot-ex2-1.png b/episodes/fig/rmd-05-pca-biplot-ex2-1.png similarity index 100% rename from fig/rmd-05-pca-biplot-ex2-1.png rename to episodes/fig/rmd-05-pca-biplot-ex2-1.png diff --git a/fig/rmd-05-pca-loadings-1.png b/episodes/fig/rmd-05-pca-loadings-1.png similarity index 100% rename from fig/rmd-05-pca-loadings-1.png rename to episodes/fig/rmd-05-pca-loadings-1.png diff --git a/fig/rmd-05-scree-ex-1.png b/episodes/fig/rmd-05-scree-ex-1.png similarity index 100% rename from fig/rmd-05-scree-ex-1.png rename to episodes/fig/rmd-05-scree-ex-1.png diff --git a/fig/rmd-05-stats-biplot-1.png b/episodes/fig/rmd-05-stats-biplot-1.png similarity index 100% rename from fig/rmd-05-stats-biplot-1.png rename to episodes/fig/rmd-05-stats-biplot-1.png diff --git a/fig/rmd-05-var-hist-1.png b/episodes/fig/rmd-05-var-hist-1.png similarity index 100% rename from fig/rmd-05-var-hist-1.png rename to episodes/fig/rmd-05-var-hist-1.png diff --git a/fig/rmd-05-vardf-plot-1.png b/episodes/fig/rmd-05-vardf-plot-1.png similarity index 100% rename from fig/rmd-05-vardf-plot-1.png rename to episodes/fig/rmd-05-vardf-plot-1.png diff --git a/fig/rmd-06-biplot-1.png b/episodes/fig/rmd-06-biplot-1.png similarity index 100% rename from fig/rmd-06-biplot-1.png rename to episodes/fig/rmd-06-biplot-1.png diff --git a/fig/rmd-07-k-ex-1.png b/episodes/fig/rmd-07-k-ex-1.png similarity index 100% rename from fig/rmd-07-k-ex-1.png rename to episodes/fig/rmd-07-k-ex-1.png diff --git a/fig/rmd-07-ordplot-ex-1.png b/episodes/fig/rmd-07-ordplot-ex-1.png similarity index 100% rename from fig/rmd-07-ordplot-ex-1.png rename to episodes/fig/rmd-07-ordplot-ex-1.png diff --git a/fig/rmd-07-ordplot-ex-2.png b/episodes/fig/rmd-07-ordplot-ex-2.png similarity index 100% rename from fig/rmd-07-ordplot-ex-2.png rename to episodes/fig/rmd-07-ordplot-ex-2.png diff --git a/fig/rmd-07-ordplot-ex-3.png b/episodes/fig/rmd-07-ordplot-ex-3.png similarity index 100% rename from fig/rmd-07-ordplot-ex-3.png rename to episodes/fig/rmd-07-ordplot-ex-3.png diff --git a/fig/rmd-07-ordplot-ex-4.png b/episodes/fig/rmd-07-ordplot-ex-4.png similarity index 100% rename from fig/rmd-07-ordplot-ex-4.png rename to episodes/fig/rmd-07-ordplot-ex-4.png diff --git a/fig/rmd-07-ordplot1-1.png b/episodes/fig/rmd-07-ordplot1-1.png similarity index 100% rename from fig/rmd-07-ordplot1-1.png rename to episodes/fig/rmd-07-ordplot1-1.png diff --git a/fig/rmd-07-ordplot2-1.png b/episodes/fig/rmd-07-ordplot2-1.png similarity index 100% rename from fig/rmd-07-ordplot2-1.png rename to episodes/fig/rmd-07-ordplot2-1.png diff --git a/fig/rmd-07-ordplot3-1.png b/episodes/fig/rmd-07-ordplot3-1.png similarity index 100% rename from fig/rmd-07-ordplot3-1.png rename to episodes/fig/rmd-07-ordplot3-1.png diff --git a/fig/rmd-07-ordplots-123-1.png b/episodes/fig/rmd-07-ordplots-123-1.png similarity index 100% rename from fig/rmd-07-ordplots-123-1.png rename to episodes/fig/rmd-07-ordplots-123-1.png diff --git a/fig/rmd-07-ordplots-123-2.png b/episodes/fig/rmd-07-ordplots-123-2.png similarity index 100% rename from fig/rmd-07-ordplots-123-2.png rename to episodes/fig/rmd-07-ordplots-123-2.png diff --git a/fig/rmd-07-ordplots-123-3.png b/episodes/fig/rmd-07-ordplots-123-3.png similarity index 100% rename from fig/rmd-07-ordplots-123-3.png rename to episodes/fig/rmd-07-ordplots-123-3.png diff --git a/fig/rmd-07-stressplot-1.png b/episodes/fig/rmd-07-stressplot-1.png similarity index 100% 
rename from fig/rmd-07-stressplot-1.png rename to episodes/fig/rmd-07-stressplot-1.png diff --git a/fig/rmd-07-vegan-3d-1.png b/episodes/fig/rmd-07-vegan-3d-1.png similarity index 100% rename from fig/rmd-07-vegan-3d-1.png rename to episodes/fig/rmd-07-vegan-3d-1.png diff --git a/fig/rmd-08-boots-1.png b/episodes/fig/rmd-08-boots-1.png similarity index 100% rename from fig/rmd-08-boots-1.png rename to episodes/fig/rmd-08-boots-1.png diff --git a/fig/rmd-08-bs-ex-1.png b/episodes/fig/rmd-08-bs-ex-1.png similarity index 100% rename from fig/rmd-08-bs-ex-1.png rename to episodes/fig/rmd-08-bs-ex-1.png diff --git a/fig/rmd-08-bs-heatmap-1.png b/episodes/fig/rmd-08-bs-heatmap-1.png similarity index 100% rename from fig/rmd-08-bs-heatmap-1.png rename to episodes/fig/rmd-08-bs-heatmap-1.png diff --git a/fig/rmd-08-fake-cluster-1.png b/episodes/fig/rmd-08-fake-cluster-1.png similarity index 100% rename from fig/rmd-08-fake-cluster-1.png rename to episodes/fig/rmd-08-fake-cluster-1.png diff --git a/fig/rmd-08-kmeans-1.png b/episodes/fig/rmd-08-kmeans-1.png similarity index 100% rename from fig/rmd-08-kmeans-1.png rename to episodes/fig/rmd-08-kmeans-1.png diff --git a/fig/rmd-08-kmeans-ex-1.png b/episodes/fig/rmd-08-kmeans-ex-1.png similarity index 100% rename from fig/rmd-08-kmeans-ex-1.png rename to episodes/fig/rmd-08-kmeans-ex-1.png diff --git a/fig/rmd-08-plot-silhouette-1.png b/episodes/fig/rmd-08-plot-silhouette-1.png similarity index 100% rename from fig/rmd-08-plot-silhouette-1.png rename to episodes/fig/rmd-08-plot-silhouette-1.png diff --git a/fig/rmd-08-silhouette-1.png b/episodes/fig/rmd-08-silhouette-1.png similarity index 100% rename from fig/rmd-08-silhouette-1.png rename to episodes/fig/rmd-08-silhouette-1.png diff --git a/fig/rmd-08-silhouette-ex-1.png b/episodes/fig/rmd-08-silhouette-ex-1.png similarity index 100% rename from fig/rmd-08-silhouette-ex-1.png rename to episodes/fig/rmd-08-silhouette-ex-1.png diff --git a/fig/rmd-08-unnamed-chunk-1-1.png b/episodes/fig/rmd-08-unnamed-chunk-1-1.png similarity index 100% rename from fig/rmd-08-unnamed-chunk-1-1.png rename to episodes/fig/rmd-08-unnamed-chunk-1-1.png diff --git a/fig/rmd-09-clust-cor-cor-example-1.png b/episodes/fig/rmd-09-clust-cor-cor-example-1.png similarity index 100% rename from fig/rmd-09-clust-cor-cor-example-1.png rename to episodes/fig/rmd-09-clust-cor-cor-example-1.png diff --git a/fig/rmd-09-clust-euc-cor-example-1.png b/episodes/fig/rmd-09-clust-euc-cor-example-1.png similarity index 100% rename from fig/rmd-09-clust-euc-cor-example-1.png rename to episodes/fig/rmd-09-clust-euc-cor-example-1.png diff --git a/fig/rmd-09-cutree-1.png b/episodes/fig/rmd-09-cutree-1.png similarity index 100% rename from fig/rmd-09-cutree-1.png rename to episodes/fig/rmd-09-cutree-1.png diff --git a/fig/rmd-09-dunn-ex-1.png b/episodes/fig/rmd-09-dunn-ex-1.png similarity index 100% rename from fig/rmd-09-dunn-ex-1.png rename to episodes/fig/rmd-09-dunn-ex-1.png diff --git a/fig/rmd-09-h-k-ex-plot-1.png b/episodes/fig/rmd-09-h-k-ex-plot-1.png similarity index 100% rename from fig/rmd-09-h-k-ex-plot-1.png rename to episodes/fig/rmd-09-h-k-ex-plot-1.png diff --git a/fig/rmd-09-h-k-ex-plot-2.png b/episodes/fig/rmd-09-h-k-ex-plot-2.png similarity index 100% rename from fig/rmd-09-h-k-ex-plot-2.png rename to episodes/fig/rmd-09-h-k-ex-plot-2.png diff --git a/fig/rmd-09-hclust-fig3-1.png b/episodes/fig/rmd-09-hclust-fig3-1.png similarity index 100% rename from fig/rmd-09-hclust-fig3-1.png rename to episodes/fig/rmd-09-hclust-fig3-1.png 
diff --git a/fig/rmd-09-hclust-fig3-2.png b/episodes/fig/rmd-09-hclust-fig3-2.png similarity index 100% rename from fig/rmd-09-hclust-fig3-2.png rename to episodes/fig/rmd-09-hclust-fig3-2.png diff --git a/fig/rmd-09-heatmap-clust-1.png b/episodes/fig/rmd-09-heatmap-clust-1.png similarity index 100% rename from fig/rmd-09-heatmap-clust-1.png rename to episodes/fig/rmd-09-heatmap-clust-1.png diff --git a/fig/rmd-09-heatmap-cor-cor-example-1.png b/episodes/fig/rmd-09-heatmap-cor-cor-example-1.png similarity index 100% rename from fig/rmd-09-heatmap-cor-cor-example-1.png rename to episodes/fig/rmd-09-heatmap-cor-cor-example-1.png diff --git a/fig/rmd-09-heatmap-cor-cor-example-2.png b/episodes/fig/rmd-09-heatmap-cor-cor-example-2.png similarity index 100% rename from fig/rmd-09-heatmap-cor-cor-example-2.png rename to episodes/fig/rmd-09-heatmap-cor-cor-example-2.png diff --git a/fig/rmd-09-heatmap-cor-example-1.png b/episodes/fig/rmd-09-heatmap-cor-example-1.png similarity index 100% rename from fig/rmd-09-heatmap-cor-example-1.png rename to episodes/fig/rmd-09-heatmap-cor-example-1.png diff --git a/fig/rmd-09-heatmap-noclust-1.png b/episodes/fig/rmd-09-heatmap-noclust-1.png similarity index 100% rename from fig/rmd-09-heatmap-noclust-1.png rename to episodes/fig/rmd-09-heatmap-noclust-1.png diff --git a/fig/rmd-09-lineplot-cor-example-1.png b/episodes/fig/rmd-09-lineplot-cor-example-1.png similarity index 100% rename from fig/rmd-09-lineplot-cor-example-1.png rename to episodes/fig/rmd-09-lineplot-cor-example-1.png diff --git a/fig/rmd-09-plot-clust-average-1.png b/episodes/fig/rmd-09-plot-clust-average-1.png similarity index 100% rename from fig/rmd-09-plot-clust-average-1.png rename to episodes/fig/rmd-09-plot-clust-average-1.png diff --git a/fig/rmd-09-plot-clust-centroid-1.png b/episodes/fig/rmd-09-plot-clust-centroid-1.png similarity index 100% rename from fig/rmd-09-plot-clust-centroid-1.png rename to episodes/fig/rmd-09-plot-clust-centroid-1.png diff --git a/fig/rmd-09-plot-clust-comp-1.png b/episodes/fig/rmd-09-plot-clust-comp-1.png similarity index 100% rename from fig/rmd-09-plot-clust-comp-1.png rename to episodes/fig/rmd-09-plot-clust-comp-1.png diff --git a/fig/rmd-09-plot-clust-dunn-1.png b/episodes/fig/rmd-09-plot-clust-dunn-1.png similarity index 100% rename from fig/rmd-09-plot-clust-dunn-1.png rename to episodes/fig/rmd-09-plot-clust-dunn-1.png diff --git a/fig/rmd-09-plot-clust-mcq-1.png b/episodes/fig/rmd-09-plot-clust-mcq-1.png similarity index 100% rename from fig/rmd-09-plot-clust-mcq-1.png rename to episodes/fig/rmd-09-plot-clust-mcq-1.png diff --git a/fig/rmd-09-plot-clust-median-1.png b/episodes/fig/rmd-09-plot-clust-median-1.png similarity index 100% rename from fig/rmd-09-plot-clust-median-1.png rename to episodes/fig/rmd-09-plot-clust-median-1.png diff --git a/fig/rmd-09-plot-clust-method-1.png b/episodes/fig/rmd-09-plot-clust-method-1.png similarity index 100% rename from fig/rmd-09-plot-clust-method-1.png rename to episodes/fig/rmd-09-plot-clust-method-1.png diff --git a/fig/rmd-09-plot-clust-method-2.png b/episodes/fig/rmd-09-plot-clust-method-2.png similarity index 100% rename from fig/rmd-09-plot-clust-method-2.png rename to episodes/fig/rmd-09-plot-clust-method-2.png diff --git a/fig/rmd-09-plot-clust-single-1.png b/episodes/fig/rmd-09-plot-clust-single-1.png similarity index 100% rename from fig/rmd-09-plot-clust-single-1.png rename to episodes/fig/rmd-09-plot-clust-single-1.png diff --git a/fig/rmd-09-plot-clust-ward-1.png 
b/episodes/fig/rmd-09-plot-clust-ward-1.png similarity index 100% rename from fig/rmd-09-plot-clust-ward-1.png rename to episodes/fig/rmd-09-plot-clust-ward-1.png diff --git a/fig/rmd-09-plotclustex-1.png b/episodes/fig/rmd-09-plotclustex-1.png similarity index 100% rename from fig/rmd-09-plotclustex-1.png rename to episodes/fig/rmd-09-plotclustex-1.png diff --git a/fig/rmd-09-plotexample-1.png b/episodes/fig/rmd-09-plotexample-1.png similarity index 100% rename from fig/rmd-09-plotexample-1.png rename to episodes/fig/rmd-09-plotexample-1.png diff --git a/fig/rmd-10-fit-dlnorm-1.png b/episodes/fig/rmd-10-fit-dlnorm-1.png similarity index 100% rename from fig/rmd-10-fit-dlnorm-1.png rename to episodes/fig/rmd-10-fit-dlnorm-1.png diff --git a/fig/rmd-10-fit-mixem-1.png b/episodes/fig/rmd-10-fit-mixem-1.png similarity index 100% rename from fig/rmd-10-fit-mixem-1.png rename to episodes/fig/rmd-10-fit-mixem-1.png diff --git a/fig/rmd-10-fit-univar-1.png b/episodes/fig/rmd-10-fit-univar-1.png similarity index 100% rename from fig/rmd-10-fit-univar-1.png rename to episodes/fig/rmd-10-fit-univar-1.png diff --git a/fig/rmd-10-mix-converged-1.png b/episodes/fig/rmd-10-mix-converged-1.png similarity index 100% rename from fig/rmd-10-mix-converged-1.png rename to episodes/fig/rmd-10-mix-converged-1.png diff --git a/fig/rmd-10-mix-expt-1.png b/episodes/fig/rmd-10-mix-expt-1.png similarity index 100% rename from fig/rmd-10-mix-expt-1.png rename to episodes/fig/rmd-10-mix-expt-1.png diff --git a/fig/rmd-10-mix2-1.png b/episodes/fig/rmd-10-mix2-1.png similarity index 100% rename from fig/rmd-10-mix2-1.png rename to episodes/fig/rmd-10-mix2-1.png diff --git a/fig/rmd-10-mix3-1.png b/episodes/fig/rmd-10-mix3-1.png similarity index 100% rename from fig/rmd-10-mix3-1.png rename to episodes/fig/rmd-10-mix3-1.png diff --git a/fig/rmd-10-mix3_2-1.png b/episodes/fig/rmd-10-mix3_2-1.png similarity index 100% rename from fig/rmd-10-mix3_2-1.png rename to episodes/fig/rmd-10-mix3_2-1.png diff --git a/fig/rmd-10-mixture-1.png b/episodes/fig/rmd-10-mixture-1.png similarity index 100% rename from fig/rmd-10-mixture-1.png rename to episodes/fig/rmd-10-mixture-1.png diff --git a/fig/rmd-10-mixture-2.png b/episodes/fig/rmd-10-mixture-2.png similarity index 100% rename from fig/rmd-10-mixture-2.png rename to episodes/fig/rmd-10-mixture-2.png diff --git a/fig/rmd-10-mixture-animation-1.png b/episodes/fig/rmd-10-mixture-animation-1.png similarity index 100% rename from fig/rmd-10-mixture-animation-1.png rename to episodes/fig/rmd-10-mixture-animation-1.png diff --git a/fig/rmd-10-mixture-data-1.png b/episodes/fig/rmd-10-mixture-data-1.png similarity index 100% rename from fig/rmd-10-mixture-data-1.png rename to episodes/fig/rmd-10-mixture-data-1.png diff --git a/fig/rmd-10-mvnorm-1.png b/episodes/fig/rmd-10-mvnorm-1.png similarity index 100% rename from fig/rmd-10-mvnorm-1.png rename to episodes/fig/rmd-10-mvnorm-1.png diff --git a/fig/rmd-10-mvnormcor-1.png b/episodes/fig/rmd-10-mvnormcor-1.png similarity index 100% rename from fig/rmd-10-mvnormcor-1.png rename to episodes/fig/rmd-10-mvnormcor-1.png diff --git a/fig/rmd-10-norms-1.png b/episodes/fig/rmd-10-norms-1.png similarity index 100% rename from fig/rmd-10-norms-1.png rename to episodes/fig/rmd-10-norms-1.png diff --git a/fig/rmd-10-pcs-1.png b/episodes/fig/rmd-10-pcs-1.png similarity index 100% rename from fig/rmd-10-pcs-1.png rename to episodes/fig/rmd-10-pcs-1.png diff --git a/fig/rmd-10-reddim-1.png b/episodes/fig/rmd-10-reddim-1.png similarity index 100% rename 
from fig/rmd-10-reddim-1.png rename to episodes/fig/rmd-10-reddim-1.png diff --git a/fig/rmd-10-tsne-1.png b/episodes/fig/rmd-10-tsne-1.png similarity index 100% rename from fig/rmd-10-tsne-1.png rename to episodes/fig/rmd-10-tsne-1.png diff --git a/fig/rmd-10-unimodal-1.png b/episodes/fig/rmd-10-unimodal-1.png similarity index 100% rename from fig/rmd-10-unimodal-1.png rename to episodes/fig/rmd-10-unimodal-1.png diff --git a/fig/rotating.gif b/episodes/fig/rotating.gif similarity index 100% rename from fig/rotating.gif rename to episodes/fig/rotating.gif diff --git a/fig/silhouette5.png b/episodes/fig/silhouette5.png similarity index 100% rename from fig/silhouette5.png rename to episodes/fig/silhouette5.png diff --git a/fig/table_for_fa.png b/episodes/fig/table_for_fa.png similarity index 100% rename from fig/table_for_fa.png rename to episodes/fig/table_for_fa.png diff --git a/fig/training_test.png b/episodes/fig/training_test.png similarity index 100% rename from fig/training_test.png rename to episodes/fig/training_test.png diff --git a/fig/two_dimension_species_vs_site.png b/episodes/fig/two_dimension_species_vs_site.png similarity index 100% rename from fig/two_dimension_species_vs_site.png rename to episodes/fig/two_dimension_species_vs_site.png diff --git a/fig/validation.png b/episodes/fig/validation.png similarity index 100% rename from fig/validation.png rename to episodes/fig/validation.png diff --git a/fig/validation.tex b/episodes/fig/validation.tex similarity index 100% rename from fig/validation.tex rename to episodes/fig/validation.tex diff --git a/fig/.gitkeep b/episodes/files/.gitkeep similarity index 100% rename from fig/.gitkeep rename to episodes/files/.gitkeep diff --git a/files/.gitkeep b/files/.gitkeep deleted file mode 100644 index e69de29b..00000000 diff --git a/index.md b/index.md index c75b50a5..f9111b94 100644 --- a/index.md +++ b/index.md @@ -1,15 +1,23 @@ --- -layout: lesson -root: . # Is the only page that doesn't follow the pattern /:path/index.html -permalink: index.html # Is the only page that doesn't follow the pattern /:path/index.html +permalink: index.html +site: sandpaper::sandpaper_site --- -> ## Prerequisites -> -> - Knowledge of R programming (eg, [a data carpentries course](https://datacarpentry.org/lessons/)) -> - Knowledge of basic statistical techniques -> (eg, [an introduction to linear regression for health sciences](https://carpentries-incubator.github.io/simple-linear-regression-public-health/)) -{: .prereq} +> **ATTENTION** This is an experimental test of [The Carpentries Workbench](https://carpentries.github.io/workbench) lesson infrastructure. +> It was automatically converted from the source lesson via [the lesson transition script](https://github.com/carpentries/lesson-transition/). +> +> If anything seems off, please contact Zhian Kamvar [zkamvar@carpentries.org](mailto:zkamvar@carpentries.org) + +:::::::::::::::::::::::::::::::::::::::::: prereq + +## Prerequisites + +- Knowledge of R programming (eg, [a data carpentries course](https://datacarpentry.org/lessons/)) +- Knowledge of basic statistical techniques + (eg, [an introduction to linear regression for health sciences](https://carpentries-incubator.github.io/simple-linear-regression-public-health/)) + + +:::::::::::::::::::::::::::::::::::::::::::::::::: ## Extra resources @@ -23,4 +31,6 @@ to learn even broader topics! 
Some of these are listed here: - [Introduction to statistical learning](https://www.statlearning.com/) - [Elements of statistical learning (advanced)](https://web.stanford.edu/~hastie/ElemStatLearn/) -{% include links.md %} + + + diff --git a/_extras/about.md b/instructors/about.md similarity index 69% rename from _extras/about.md rename to instructors/about.md index 5f07f659..ad7d08c4 100644 --- a/_extras/about.md +++ b/instructors/about.md @@ -1,5 +1,8 @@ --- title: About --- + {% include carpentries.html %} -{% include links.md %} + + + diff --git a/instructors/instructor-notes.md b/instructors/instructor-notes.md new file mode 100644 index 00000000..6d7699e5 --- /dev/null +++ b/instructors/instructor-notes.md @@ -0,0 +1,9 @@ +--- +title: Instructor Notes +--- + +Coming soon. + + + + diff --git a/instructors/slides.md b/instructors/slides.md new file mode 100644 index 00000000..9bedcc72 --- /dev/null +++ b/instructors/slides.md @@ -0,0 +1,16 @@ +--- +title: Lecture slides +--- + + + + + +{% for p in site.slides %} + +- [{{p.title}}]({{p.url | replace: "Rmd", "html"}}) + {% endfor %} + + + + diff --git a/reference.md b/learners/reference.md similarity index 67% rename from reference.md rename to learners/reference.md index 0ed3c171..3be82d57 100644 --- a/reference.md +++ b/learners/reference.md @@ -1,9 +1,11 @@ --- -layout: reference +title: 'Glossary' --- ## Glossary Coming soon. Feel free to suggest entries via GitHub Issues! -{% include links.md %} + + + diff --git a/setup.md b/learners/setup.md similarity index 95% rename from setup.md rename to learners/setup.md index 72ab4e5b..033ecb91 100644 --- a/setup.md +++ b/learners/setup.md @@ -1,5 +1,4 @@ --- -layout: page title: Setup --- @@ -40,7 +39,7 @@ for (file in data_files) { } ``` -On Linux systems, part of the above may fail due to the **`bluster`** package and you may receive error messages after running `BiocManager::install(table[[1]])`, indicating that the package `igraph` was not installed successfully. +On Linux systems, part of the above may fail due to the **`bluster`** package and you may receive error messages after running `BiocManager::install(table[[1]])`, indicating that the package `igraph` was not installed successfully. Detailed installation instructions for `igraph` can be found at [https://r.igraph.org/](https://r.igraph.org/), but the following workaround code may resolve the issue: @@ -51,4 +50,6 @@ install.packages('igraph', repos=c(igraph = 'https://igraph.r-universe.dev', BiocManager::install('bluster') ``` -{% include links.md %} + + + diff --git a/profiles/learner-profiles.md b/profiles/learner-profiles.md new file mode 100644 index 00000000..434e335a --- /dev/null +++ b/profiles/learner-profiles.md @@ -0,0 +1,5 @@ +--- +title: FIXME +--- + +This is a placeholder file. Please add content here. 
diff --git a/renv/activate.R b/renv/activate.R new file mode 100644 index 00000000..cb5401f9 --- /dev/null +++ b/renv/activate.R @@ -0,0 +1,1180 @@ + +local({ + + # the requested version of renv + version <- "1.0.3" + attr(version, "sha") <- NULL + + # the project directory + project <- getwd() + + # use start-up diagnostics if enabled + diagnostics <- Sys.getenv("RENV_STARTUP_DIAGNOSTICS", unset = "FALSE") + if (diagnostics) { + start <- Sys.time() + profile <- tempfile("renv-startup-", fileext = ".Rprof") + utils::Rprof(profile) + on.exit({ + utils::Rprof(NULL) + elapsed <- signif(difftime(Sys.time(), start, units = "auto"), digits = 2L) + writeLines(sprintf("- renv took %s to run the autoloader.", format(elapsed))) + writeLines(sprintf("- Profile: %s", profile)) + print(utils::summaryRprof(profile)) + }, add = TRUE) + } + + # figure out whether the autoloader is enabled + enabled <- local({ + + # first, check config option + override <- getOption("renv.config.autoloader.enabled") + if (!is.null(override)) + return(override) + + # next, check environment variables + # TODO: prefer using the configuration one in the future + envvars <- c( + "RENV_CONFIG_AUTOLOADER_ENABLED", + "RENV_AUTOLOADER_ENABLED", + "RENV_ACTIVATE_PROJECT" + ) + + for (envvar in envvars) { + envval <- Sys.getenv(envvar, unset = NA) + if (!is.na(envval)) + return(tolower(envval) %in% c("true", "t", "1")) + } + + # enable by default + TRUE + + }) + + if (!enabled) + return(FALSE) + + # avoid recursion + if (identical(getOption("renv.autoloader.running"), TRUE)) { + warning("ignoring recursive attempt to run renv autoloader") + return(invisible(TRUE)) + } + + # signal that we're loading renv during R startup + options(renv.autoloader.running = TRUE) + on.exit(options(renv.autoloader.running = NULL), add = TRUE) + + # signal that we've consented to use renv + options(renv.consent = TRUE) + + # load the 'utils' package eagerly -- this ensures that renv shims, which + # mask 'utils' packages, will come first on the search path + library(utils, lib.loc = .Library) + + # unload renv if it's already been loaded + if ("renv" %in% loadedNamespaces()) + unloadNamespace("renv") + + # load bootstrap tools + `%||%` <- function(x, y) { + if (is.null(x)) y else x + } + + catf <- function(fmt, ..., appendLF = TRUE) { + + quiet <- getOption("renv.bootstrap.quiet", default = FALSE) + if (quiet) + return(invisible()) + + msg <- sprintf(fmt, ...) + cat(msg, file = stdout(), sep = if (appendLF) "\n" else "") + + invisible(msg) + + } + + header <- function(label, + ..., + prefix = "#", + suffix = "-", + n = min(getOption("width"), 78)) + { + label <- sprintf(label, ...) + n <- max(n - nchar(label) - nchar(prefix) - 2L, 8L) + if (n <= 0) + return(paste(prefix, label)) + + tail <- paste(rep.int(suffix, n), collapse = "") + paste0(prefix, " ", label, " ", tail) + + } + + startswith <- function(string, prefix) { + substring(string, 1, nchar(prefix)) == prefix + } + + bootstrap <- function(version, library) { + + friendly <- renv_bootstrap_version_friendly(version) + section <- header(sprintf("Bootstrapping renv %s", friendly)) + catf(section) + + # attempt to download renv + catf("- Downloading renv ... ", appendLF = FALSE) + withCallingHandlers( + tarball <- renv_bootstrap_download(version), + error = function(err) { + catf("FAILED") + stop("failed to download:\n", conditionMessage(err)) + } + ) + catf("OK") + on.exit(unlink(tarball), add = TRUE) + + # now attempt to install + catf("- Installing renv ... 
", appendLF = FALSE) + withCallingHandlers( + status <- renv_bootstrap_install(version, tarball, library), + error = function(err) { + catf("FAILED") + stop("failed to install:\n", conditionMessage(err)) + } + ) + catf("OK") + + # add empty line to break up bootstrapping from normal output + catf("") + + return(invisible()) + } + + renv_bootstrap_tests_running <- function() { + getOption("renv.tests.running", default = FALSE) + } + + renv_bootstrap_repos <- function() { + + # get CRAN repository + cran <- getOption("renv.repos.cran", "https://cloud.r-project.org") + + # check for repos override + repos <- Sys.getenv("RENV_CONFIG_REPOS_OVERRIDE", unset = NA) + if (!is.na(repos)) { + + # check for RSPM; if set, use a fallback repository for renv + rspm <- Sys.getenv("RSPM", unset = NA) + if (identical(rspm, repos)) + repos <- c(RSPM = rspm, CRAN = cran) + + return(repos) + + } + + # check for lockfile repositories + repos <- tryCatch(renv_bootstrap_repos_lockfile(), error = identity) + if (!inherits(repos, "error") && length(repos)) + return(repos) + + # retrieve current repos + repos <- getOption("repos") + + # ensure @CRAN@ entries are resolved + repos[repos == "@CRAN@"] <- cran + + # add in renv.bootstrap.repos if set + default <- c(FALLBACK = "https://cloud.r-project.org") + extra <- getOption("renv.bootstrap.repos", default = default) + repos <- c(repos, extra) + + # remove duplicates that might've snuck in + dupes <- duplicated(repos) | duplicated(names(repos)) + repos[!dupes] + + } + + renv_bootstrap_repos_lockfile <- function() { + + lockpath <- Sys.getenv("RENV_PATHS_LOCKFILE", unset = "renv.lock") + if (!file.exists(lockpath)) + return(NULL) + + lockfile <- tryCatch(renv_json_read(lockpath), error = identity) + if (inherits(lockfile, "error")) { + warning(lockfile) + return(NULL) + } + + repos <- lockfile$R$Repositories + if (length(repos) == 0) + return(NULL) + + keys <- vapply(repos, `[[`, "Name", FUN.VALUE = character(1)) + vals <- vapply(repos, `[[`, "URL", FUN.VALUE = character(1)) + names(vals) <- keys + + return(vals) + + } + + renv_bootstrap_download <- function(version) { + + sha <- attr(version, "sha", exact = TRUE) + + methods <- if (!is.null(sha)) { + + # attempting to bootstrap a development version of renv + c( + function() renv_bootstrap_download_tarball(sha), + function() renv_bootstrap_download_github(sha) + ) + + } else { + + # attempting to bootstrap a release version of renv + c( + function() renv_bootstrap_download_tarball(version), + function() renv_bootstrap_download_cran_latest(version), + function() renv_bootstrap_download_cran_archive(version) + ) + + } + + for (method in methods) { + path <- tryCatch(method(), error = identity) + if (is.character(path) && file.exists(path)) + return(path) + } + + stop("All download methods failed") + + } + + renv_bootstrap_download_impl <- function(url, destfile) { + + mode <- "wb" + + # https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17715 + fixup <- + Sys.info()[["sysname"]] == "Windows" && + substring(url, 1L, 5L) == "file:" + + if (fixup) + mode <- "w+b" + + args <- list( + url = url, + destfile = destfile, + mode = mode, + quiet = TRUE + ) + + if ("headers" %in% names(formals(utils::download.file))) + args$headers <- renv_bootstrap_download_custom_headers(url) + + do.call(utils::download.file, args) + + } + + renv_bootstrap_download_custom_headers <- function(url) { + + headers <- getOption("renv.download.headers") + if (is.null(headers)) + return(character()) + + if (!is.function(headers)) + 
stopf("'renv.download.headers' is not a function") + + headers <- headers(url) + if (length(headers) == 0L) + return(character()) + + if (is.list(headers)) + headers <- unlist(headers, recursive = FALSE, use.names = TRUE) + + ok <- + is.character(headers) && + is.character(names(headers)) && + all(nzchar(names(headers))) + + if (!ok) + stop("invocation of 'renv.download.headers' did not return a named character vector") + + headers + + } + + renv_bootstrap_download_cran_latest <- function(version) { + + spec <- renv_bootstrap_download_cran_latest_find(version) + type <- spec$type + repos <- spec$repos + + baseurl <- utils::contrib.url(repos = repos, type = type) + ext <- if (identical(type, "source")) + ".tar.gz" + else if (Sys.info()[["sysname"]] == "Windows") + ".zip" + else + ".tgz" + name <- sprintf("renv_%s%s", version, ext) + url <- paste(baseurl, name, sep = "/") + + destfile <- file.path(tempdir(), name) + status <- tryCatch( + renv_bootstrap_download_impl(url, destfile), + condition = identity + ) + + if (inherits(status, "condition")) + return(FALSE) + + # report success and return + destfile + + } + + renv_bootstrap_download_cran_latest_find <- function(version) { + + # check whether binaries are supported on this system + binary <- + getOption("renv.bootstrap.binary", default = TRUE) && + !identical(.Platform$pkgType, "source") && + !identical(getOption("pkgType"), "source") && + Sys.info()[["sysname"]] %in% c("Darwin", "Windows") + + types <- c(if (binary) "binary", "source") + + # iterate over types + repositories + for (type in types) { + for (repos in renv_bootstrap_repos()) { + + # retrieve package database + db <- tryCatch( + as.data.frame( + utils::available.packages(type = type, repos = repos), + stringsAsFactors = FALSE + ), + error = identity + ) + + if (inherits(db, "error")) + next + + # check for compatible entry + entry <- db[db$Package %in% "renv" & db$Version %in% version, ] + if (nrow(entry) == 0) + next + + # found it; return spec to caller + spec <- list(entry = entry, type = type, repos = repos) + return(spec) + + } + } + + # if we got here, we failed to find renv + fmt <- "renv %s is not available from your declared package repositories" + stop(sprintf(fmt, version)) + + } + + renv_bootstrap_download_cran_archive <- function(version) { + + name <- sprintf("renv_%s.tar.gz", version) + repos <- renv_bootstrap_repos() + urls <- file.path(repos, "src/contrib/Archive/renv", name) + destfile <- file.path(tempdir(), name) + + for (url in urls) { + + status <- tryCatch( + renv_bootstrap_download_impl(url, destfile), + condition = identity + ) + + if (identical(status, 0L)) + return(destfile) + + } + + return(FALSE) + + } + + renv_bootstrap_download_tarball <- function(version) { + + # if the user has provided the path to a tarball via + # an environment variable, then use it + tarball <- Sys.getenv("RENV_BOOTSTRAP_TARBALL", unset = NA) + if (is.na(tarball)) + return() + + # allow directories + if (dir.exists(tarball)) { + name <- sprintf("renv_%s.tar.gz", version) + tarball <- file.path(tarball, name) + } + + # bail if it doesn't exist + if (!file.exists(tarball)) { + + # let the user know we weren't able to honour their request + fmt <- "- RENV_BOOTSTRAP_TARBALL is set (%s) but does not exist." 
+ msg <- sprintf(fmt, tarball) + warning(msg) + + # bail + return() + + } + + catf("- Using local tarball '%s'.", tarball) + tarball + + } + + renv_bootstrap_download_github <- function(version) { + + enabled <- Sys.getenv("RENV_BOOTSTRAP_FROM_GITHUB", unset = "TRUE") + if (!identical(enabled, "TRUE")) + return(FALSE) + + # prepare download options + pat <- Sys.getenv("GITHUB_PAT") + if (nzchar(Sys.which("curl")) && nzchar(pat)) { + fmt <- "--location --fail --header \"Authorization: token %s\"" + extra <- sprintf(fmt, pat) + saved <- options("download.file.method", "download.file.extra") + options(download.file.method = "curl", download.file.extra = extra) + on.exit(do.call(base::options, saved), add = TRUE) + } else if (nzchar(Sys.which("wget")) && nzchar(pat)) { + fmt <- "--header=\"Authorization: token %s\"" + extra <- sprintf(fmt, pat) + saved <- options("download.file.method", "download.file.extra") + options(download.file.method = "wget", download.file.extra = extra) + on.exit(do.call(base::options, saved), add = TRUE) + } + + url <- file.path("https://api.github.com/repos/rstudio/renv/tarball", version) + name <- sprintf("renv_%s.tar.gz", version) + destfile <- file.path(tempdir(), name) + + status <- tryCatch( + renv_bootstrap_download_impl(url, destfile), + condition = identity + ) + + if (!identical(status, 0L)) + return(FALSE) + + renv_bootstrap_download_augment(destfile) + + return(destfile) + + } + + # Add Sha to DESCRIPTION. This is stop gap until #890, after which we + # can use renv::install() to fully capture metadata. + renv_bootstrap_download_augment <- function(destfile) { + sha <- renv_bootstrap_git_extract_sha1_tar(destfile) + if (is.null(sha)) { + return() + } + + # Untar + tempdir <- tempfile("renv-github-") + on.exit(unlink(tempdir, recursive = TRUE), add = TRUE) + untar(destfile, exdir = tempdir) + pkgdir <- dir(tempdir, full.names = TRUE)[[1]] + + # Modify description + desc_path <- file.path(pkgdir, "DESCRIPTION") + desc_lines <- readLines(desc_path) + remotes_fields <- c( + "RemoteType: github", + "RemoteHost: api.github.com", + "RemoteRepo: renv", + "RemoteUsername: rstudio", + "RemotePkgRef: rstudio/renv", + paste("RemoteRef: ", sha), + paste("RemoteSha: ", sha) + ) + writeLines(c(desc_lines[desc_lines != ""], remotes_fields), con = desc_path) + + # Re-tar + local({ + old <- setwd(tempdir) + on.exit(setwd(old), add = TRUE) + + tar(destfile, compression = "gzip") + }) + invisible() + } + + # Extract the commit hash from a git archive. Git archives include the SHA1 + # hash as the comment field of the tarball pax extended header + # (see https://www.kernel.org/pub/software/scm/git/docs/git-archive.html) + # For GitHub archives this should be the first header after the default one + # (512 byte) header. 
+ renv_bootstrap_git_extract_sha1_tar <- function(bundle) { + + # open the bundle for reading + # We use gzcon for everything because (from ?gzcon) + # > Reading from a connection which does not supply a 'gzip' magic + # > header is equivalent to reading from the original connection + conn <- gzcon(file(bundle, open = "rb", raw = TRUE)) + on.exit(close(conn)) + + # The default pax header is 512 bytes long and the first pax extended header + # with the comment should be 51 bytes long + # `52 comment=` (11 chars) + 40 byte SHA1 hash + len <- 0x200 + 0x33 + res <- rawToChar(readBin(conn, "raw", n = len)[0x201:len]) + + if (grepl("^52 comment=", res)) { + sub("52 comment=", "", res) + } else { + NULL + } + } + + renv_bootstrap_install <- function(version, tarball, library) { + + # attempt to install it into project library + dir.create(library, showWarnings = FALSE, recursive = TRUE) + output <- renv_bootstrap_install_impl(library, tarball) + + # check for successful install + status <- attr(output, "status") + if (is.null(status) || identical(status, 0L)) + return(status) + + # an error occurred; report it + header <- "installation of renv failed" + lines <- paste(rep.int("=", nchar(header)), collapse = "") + text <- paste(c(header, lines, output), collapse = "\n") + stop(text) + + } + + renv_bootstrap_install_impl <- function(library, tarball) { + + # invoke using system2 so we can capture and report output + bin <- R.home("bin") + exe <- if (Sys.info()[["sysname"]] == "Windows") "R.exe" else "R" + R <- file.path(bin, exe) + + args <- c( + "--vanilla", "CMD", "INSTALL", "--no-multiarch", + "-l", shQuote(path.expand(library)), + shQuote(path.expand(tarball)) + ) + + system2(R, args, stdout = TRUE, stderr = TRUE) + + } + + renv_bootstrap_platform_prefix <- function() { + + # construct version prefix + version <- paste(R.version$major, R.version$minor, sep = ".") + prefix <- paste("R", numeric_version(version)[1, 1:2], sep = "-") + + # include SVN revision for development versions of R + # (to avoid sharing platform-specific artefacts with released versions of R) + devel <- + identical(R.version[["status"]], "Under development (unstable)") || + identical(R.version[["nickname"]], "Unsuffered Consequences") + + if (devel) + prefix <- paste(prefix, R.version[["svn rev"]], sep = "-r") + + # build list of path components + components <- c(prefix, R.version$platform) + + # include prefix if provided by user + prefix <- renv_bootstrap_platform_prefix_impl() + if (!is.na(prefix) && nzchar(prefix)) + components <- c(prefix, components) + + # build prefix + paste(components, collapse = "/") + + } + + renv_bootstrap_platform_prefix_impl <- function() { + + # if an explicit prefix has been supplied, use it + prefix <- Sys.getenv("RENV_PATHS_PREFIX", unset = NA) + if (!is.na(prefix)) + return(prefix) + + # if the user has requested an automatic prefix, generate it + auto <- Sys.getenv("RENV_PATHS_PREFIX_AUTO", unset = NA) + if (auto %in% c("TRUE", "True", "true", "1")) + return(renv_bootstrap_platform_prefix_auto()) + + # empty string on failure + "" + + } + + renv_bootstrap_platform_prefix_auto <- function() { + + prefix <- tryCatch(renv_bootstrap_platform_os(), error = identity) + if (inherits(prefix, "error") || prefix %in% "unknown") { + + msg <- paste( + "failed to infer current operating system", + "please file a bug report at https://github.com/rstudio/renv/issues", + sep = "; " + ) + + warning(msg) + + } + + prefix + + } + + renv_bootstrap_platform_os <- function() { + + sysinfo <- 
Sys.info() + sysname <- sysinfo[["sysname"]] + + # handle Windows + macOS up front + if (sysname == "Windows") + return("windows") + else if (sysname == "Darwin") + return("macos") + + # check for os-release files + for (file in c("/etc/os-release", "/usr/lib/os-release")) + if (file.exists(file)) + return(renv_bootstrap_platform_os_via_os_release(file, sysinfo)) + + # check for redhat-release files + if (file.exists("/etc/redhat-release")) + return(renv_bootstrap_platform_os_via_redhat_release()) + + "unknown" + + } + + renv_bootstrap_platform_os_via_os_release <- function(file, sysinfo) { + + # read /etc/os-release + release <- utils::read.table( + file = file, + sep = "=", + quote = c("\"", "'"), + col.names = c("Key", "Value"), + comment.char = "#", + stringsAsFactors = FALSE + ) + + vars <- as.list(release$Value) + names(vars) <- release$Key + + # get os name + os <- tolower(sysinfo[["sysname"]]) + + # read id + id <- "unknown" + for (field in c("ID", "ID_LIKE")) { + if (field %in% names(vars) && nzchar(vars[[field]])) { + id <- vars[[field]] + break + } + } + + # read version + version <- "unknown" + for (field in c("UBUNTU_CODENAME", "VERSION_CODENAME", "VERSION_ID", "BUILD_ID")) { + if (field %in% names(vars) && nzchar(vars[[field]])) { + version <- vars[[field]] + break + } + } + + # join together + paste(c(os, id, version), collapse = "-") + + } + + renv_bootstrap_platform_os_via_redhat_release <- function() { + + # read /etc/redhat-release + contents <- readLines("/etc/redhat-release", warn = FALSE) + + # infer id + id <- if (grepl("centos", contents, ignore.case = TRUE)) + "centos" + else if (grepl("redhat", contents, ignore.case = TRUE)) + "redhat" + else + "unknown" + + # try to find a version component (very hacky) + version <- "unknown" + + parts <- strsplit(contents, "[[:space:]]")[[1L]] + for (part in parts) { + + nv <- tryCatch(numeric_version(part), error = identity) + if (inherits(nv, "error")) + next + + version <- nv[1, 1] + break + + } + + paste(c("linux", id, version), collapse = "-") + + } + + renv_bootstrap_library_root_name <- function(project) { + + # use project name as-is if requested + asis <- Sys.getenv("RENV_PATHS_LIBRARY_ROOT_ASIS", unset = "FALSE") + if (asis) + return(basename(project)) + + # otherwise, disambiguate based on project's path + id <- substring(renv_bootstrap_hash_text(project), 1L, 8L) + paste(basename(project), id, sep = "-") + + } + + renv_bootstrap_library_root <- function(project) { + + prefix <- renv_bootstrap_profile_prefix() + + path <- Sys.getenv("RENV_PATHS_LIBRARY", unset = NA) + if (!is.na(path)) + return(paste(c(path, prefix), collapse = "/")) + + path <- renv_bootstrap_library_root_impl(project) + if (!is.null(path)) { + name <- renv_bootstrap_library_root_name(project) + return(paste(c(path, prefix, name), collapse = "/")) + } + + renv_bootstrap_paths_renv("library", project = project) + + } + + renv_bootstrap_library_root_impl <- function(project) { + + root <- Sys.getenv("RENV_PATHS_LIBRARY_ROOT", unset = NA) + if (!is.na(root)) + return(root) + + type <- renv_bootstrap_project_type(project) + if (identical(type, "package")) { + userdir <- renv_bootstrap_user_dir() + return(file.path(userdir, "library")) + } + + } + + renv_bootstrap_validate_version <- function(version, description = NULL) { + + # resolve description file + # + # avoid passing lib.loc to `packageDescription()` below, since R will + # use the loaded version of the package by default anyhow. 
note that + # this function should only be called after 'renv' is loaded + # https://github.com/rstudio/renv/issues/1625 + description <- description %||% packageDescription("renv") + + # check whether requested version 'version' matches loaded version of renv + sha <- attr(version, "sha", exact = TRUE) + valid <- if (!is.null(sha)) + renv_bootstrap_validate_version_dev(sha, description) + else + renv_bootstrap_validate_version_release(version, description) + + if (valid) + return(TRUE) + + # the loaded version of renv doesn't match the requested version; + # give the user instructions on how to proceed + remote <- if (!is.null(description[["RemoteSha"]])) { + paste("rstudio/renv", description[["RemoteSha"]], sep = "@") + } else { + paste("renv", description[["Version"]], sep = "@") + } + + # display both loaded version + sha if available + friendly <- renv_bootstrap_version_friendly( + version = description[["Version"]], + sha = description[["RemoteSha"]] + ) + + fmt <- paste( + "renv %1$s was loaded from project library, but this project is configured to use renv %2$s.", + "- Use `renv::record(\"%3$s\")` to record renv %1$s in the lockfile.", + "- Use `renv::restore(packages = \"renv\")` to install renv %2$s into the project library.", + sep = "\n" + ) + catf(fmt, friendly, renv_bootstrap_version_friendly(version), remote) + + FALSE + + } + + renv_bootstrap_validate_version_dev <- function(version, description) { + expected <- description[["RemoteSha"]] + is.character(expected) && startswith(expected, version) + } + + renv_bootstrap_validate_version_release <- function(version, description) { + expected <- description[["Version"]] + is.character(expected) && identical(expected, version) + } + + renv_bootstrap_hash_text <- function(text) { + + hashfile <- tempfile("renv-hash-") + on.exit(unlink(hashfile), add = TRUE) + + writeLines(text, con = hashfile) + tools::md5sum(hashfile) + + } + + renv_bootstrap_load <- function(project, libpath, version) { + + # try to load renv from the project library + if (!requireNamespace("renv", lib.loc = libpath, quietly = TRUE)) + return(FALSE) + + # warn if the version of renv loaded does not match + renv_bootstrap_validate_version(version) + + # execute renv load hooks, if any + hooks <- getHook("renv::autoload") + for (hook in hooks) + if (is.function(hook)) + tryCatch(hook(), error = warnify) + + # load the project + renv::load(project) + + TRUE + + } + + renv_bootstrap_profile_load <- function(project) { + + # if RENV_PROFILE is already set, just use that + profile <- Sys.getenv("RENV_PROFILE", unset = NA) + if (!is.na(profile) && nzchar(profile)) + return(profile) + + # check for a profile file (nothing to do if it doesn't exist) + path <- renv_bootstrap_paths_renv("profile", profile = FALSE, project = project) + if (!file.exists(path)) + return(NULL) + + # read the profile, and set it if it exists + contents <- readLines(path, warn = FALSE) + if (length(contents) == 0L) + return(NULL) + + # set RENV_PROFILE + profile <- contents[[1L]] + if (!profile %in% c("", "default")) + Sys.setenv(RENV_PROFILE = profile) + + profile + + } + + renv_bootstrap_profile_prefix <- function() { + profile <- renv_bootstrap_profile_get() + if (!is.null(profile)) + return(file.path("profiles", profile, "renv")) + } + + renv_bootstrap_profile_get <- function() { + profile <- Sys.getenv("RENV_PROFILE", unset = "") + renv_bootstrap_profile_normalize(profile) + } + + renv_bootstrap_profile_set <- function(profile) { + profile <- 
renv_bootstrap_profile_normalize(profile) + if (is.null(profile)) + Sys.unsetenv("RENV_PROFILE") + else + Sys.setenv(RENV_PROFILE = profile) + } + + renv_bootstrap_profile_normalize <- function(profile) { + + if (is.null(profile) || profile %in% c("", "default")) + return(NULL) + + profile + + } + + renv_bootstrap_path_absolute <- function(path) { + + substr(path, 1L, 1L) %in% c("~", "/", "\\") || ( + substr(path, 1L, 1L) %in% c(letters, LETTERS) && + substr(path, 2L, 3L) %in% c(":/", ":\\") + ) + + } + + renv_bootstrap_paths_renv <- function(..., profile = TRUE, project = NULL) { + renv <- Sys.getenv("RENV_PATHS_RENV", unset = "renv") + root <- if (renv_bootstrap_path_absolute(renv)) NULL else project + prefix <- if (profile) renv_bootstrap_profile_prefix() + components <- c(root, renv, prefix, ...) + paste(components, collapse = "/") + } + + renv_bootstrap_project_type <- function(path) { + + descpath <- file.path(path, "DESCRIPTION") + if (!file.exists(descpath)) + return("unknown") + + desc <- tryCatch( + read.dcf(descpath, all = TRUE), + error = identity + ) + + if (inherits(desc, "error")) + return("unknown") + + type <- desc$Type + if (!is.null(type)) + return(tolower(type)) + + package <- desc$Package + if (!is.null(package)) + return("package") + + "unknown" + + } + + renv_bootstrap_user_dir <- function() { + dir <- renv_bootstrap_user_dir_impl() + path.expand(chartr("\\", "/", dir)) + } + + renv_bootstrap_user_dir_impl <- function() { + + # use local override if set + override <- getOption("renv.userdir.override") + if (!is.null(override)) + return(override) + + # use R_user_dir if available + tools <- asNamespace("tools") + if (is.function(tools$R_user_dir)) + return(tools$R_user_dir("renv", "cache")) + + # try using our own backfill for older versions of R + envvars <- c("R_USER_CACHE_DIR", "XDG_CACHE_HOME") + for (envvar in envvars) { + root <- Sys.getenv(envvar, unset = NA) + if (!is.na(root)) + return(file.path(root, "R/renv")) + } + + # use platform-specific default fallbacks + if (Sys.info()[["sysname"]] == "Windows") + file.path(Sys.getenv("LOCALAPPDATA"), "R/cache/R/renv") + else if (Sys.info()[["sysname"]] == "Darwin") + "~/Library/Caches/org.R-project.R/R/renv" + else + "~/.cache/R/renv" + + } + + renv_bootstrap_version_friendly <- function(version, shafmt = NULL, sha = NULL) { + sha <- sha %||% attr(version, "sha", exact = TRUE) + parts <- c(version, sprintf(shafmt %||% " [sha: %s]", substring(sha, 1L, 7L))) + paste(parts, collapse = "") + } + + renv_bootstrap_exec <- function(project, libpath, version) { + if (!renv_bootstrap_load(project, libpath, version)) + renv_bootstrap_run(version, libpath) + } + + renv_bootstrap_run <- function(version, libpath) { + + # perform bootstrap + bootstrap(version, libpath) + + # exit early if we're just testing bootstrap + if (!is.na(Sys.getenv("RENV_BOOTSTRAP_INSTALL_ONLY", unset = NA))) + return(TRUE) + + # try again to load + if (requireNamespace("renv", lib.loc = libpath, quietly = TRUE)) { + return(renv::load(project = getwd())) + } + + # failed to download or load renv; warn the user + msg <- c( + "Failed to find an renv installation: the project will not be loaded.", + "Use `renv::activate()` to re-initialize the project." + ) + + warning(paste(msg, collapse = "\n"), call. 
= FALSE) + + } + + renv_json_read <- function(file = NULL, text = NULL) { + + jlerr <- NULL + + # if jsonlite is loaded, use that instead + if ("jsonlite" %in% loadedNamespaces()) { + + json <- catch(renv_json_read_jsonlite(file, text)) + if (!inherits(json, "error")) + return(json) + + jlerr <- json + + } + + # otherwise, fall back to the default JSON reader + json <- catch(renv_json_read_default(file, text)) + if (!inherits(json, "error")) + return(json) + + # report an error + if (!is.null(jlerr)) + stop(jlerr) + else + stop(json) + + } + + renv_json_read_jsonlite <- function(file = NULL, text = NULL) { + text <- paste(text %||% read(file), collapse = "\n") + jsonlite::fromJSON(txt = text, simplifyVector = FALSE) + } + + renv_json_read_default <- function(file = NULL, text = NULL) { + + # find strings in the JSON + text <- paste(text %||% read(file), collapse = "\n") + pattern <- '["](?:(?:\\\\.)|(?:[^"\\\\]))*?["]' + locs <- gregexpr(pattern, text, perl = TRUE)[[1]] + + # if any are found, replace them with placeholders + replaced <- text + strings <- character() + replacements <- character() + + if (!identical(c(locs), -1L)) { + + # get the string values + starts <- locs + ends <- locs + attr(locs, "match.length") - 1L + strings <- substring(text, starts, ends) + + # only keep those requiring escaping + strings <- grep("[[\\]{}:]", strings, perl = TRUE, value = TRUE) + + # compute replacements + replacements <- sprintf('"\032%i\032"', seq_along(strings)) + + # replace the strings + mapply(function(string, replacement) { + replaced <<- sub(string, replacement, replaced, fixed = TRUE) + }, strings, replacements) + + } + + # transform the JSON into something the R parser understands + transformed <- replaced + transformed <- gsub("{}", "`names<-`(list(), character())", transformed, fixed = TRUE) + transformed <- gsub("[[{]", "list(", transformed, perl = TRUE) + transformed <- gsub("[]}]", ")", transformed, perl = TRUE) + transformed <- gsub(":", "=", transformed, fixed = TRUE) + text <- paste(transformed, collapse = "\n") + + # parse it + json <- parse(text = text, keep.source = FALSE, srcfile = NULL)[[1L]] + + # construct map between source strings, replaced strings + map <- as.character(parse(text = strings)) + names(map) <- as.character(parse(text = replacements)) + + # convert to list + map <- as.list(map) + + # remap strings in object + remapped <- renv_json_remap(json, map) + + # evaluate + eval(remapped, envir = baseenv()) + + } + + renv_json_remap <- function(json, map) { + + # fix names + if (!is.null(names(json))) { + lhs <- match(names(json), names(map), nomatch = 0L) + rhs <- match(names(map), names(json), nomatch = 0L) + names(json)[rhs] <- map[lhs] + } + + # fix values + if (is.character(json)) + return(map[[json]] %||% json) + + # handle true, false, null + if (is.name(json)) { + text <- as.character(json) + if (text == "true") + return(TRUE) + else if (text == "false") + return(FALSE) + else if (text == "null") + return(NULL) + } + + # recurse + if (is.recursive(json)) { + for (i in seq_along(json)) { + json[i] <- list(renv_json_remap(json[[i]], map)) + } + } + + json + + } + + # load the renv profile, if any + renv_bootstrap_profile_load(project) + + # construct path to library root + root <- renv_bootstrap_library_root(project) + + # construct library prefix for platform + prefix <- renv_bootstrap_platform_prefix() + + # construct full libpath + libpath <- file.path(root, prefix) + + # run bootstrap code + renv_bootstrap_exec(project, libpath, version) + + invisible() + 
+}) diff --git a/renv/profile b/renv/profile new file mode 100644 index 00000000..6d4023b5 --- /dev/null +++ b/renv/profile @@ -0,0 +1 @@ +lesson-requirements diff --git a/renv/profiles/lesson-requirements/renv.lock b/renv/profiles/lesson-requirements/renv.lock new file mode 100644 index 00000000..e17623e3 --- /dev/null +++ b/renv/profiles/lesson-requirements/renv.lock @@ -0,0 +1,407 @@ +{ + "R": { + "Version": "4.3.2", + "Repositories": [ + { + "Name": "carpentries", + "URL": "https://carpentries.r-universe.dev" + }, + { + "Name": "carpentries_archive", + "URL": "https://carpentries.github.io/drat" + }, + { + "Name": "CRAN", + "URL": "https://cran.rstudio.com" + } + ] + }, + "Packages": { + "R6": { + "Package": "R6", + "Version": "2.5.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "470851b6d5d0ac559e9d01bb352b4021" + }, + "base64enc": { + "Package": "base64enc", + "Version": "0.1-3", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "543776ae6848fde2f48ff3816d0628bc" + }, + "bslib": { + "Package": "bslib", + "Version": "0.6.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "base64enc", + "cachem", + "grDevices", + "htmltools", + "jquerylib", + "jsonlite", + "lifecycle", + "memoise", + "mime", + "rlang", + "sass" + ], + "Hash": "c0d8599494bc7fb408cd206bbdd9cab0" + }, + "cachem": { + "Package": "cachem", + "Version": "1.0.8", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "fastmap", + "rlang" + ], + "Hash": "c35768291560ce302c0a6589f92e837d" + }, + "cli": { + "Package": "cli", + "Version": "3.6.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "utils" + ], + "Hash": "89e6d8219950eac806ae0c489052048a" + }, + "digest": { + "Package": "digest", + "Version": "0.6.33", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "utils" + ], + "Hash": "b18a9cf3c003977b0cc49d5e76ebe48d" + }, + "ellipsis": { + "Package": "ellipsis", + "Version": "0.3.2", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "rlang" + ], + "Hash": "bb0eec2fe32e88d9e2836c2f73ea2077" + }, + "evaluate": { + "Package": "evaluate", + "Version": "0.23", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "methods" + ], + "Hash": "daf4a1246be12c1fa8c7705a0935c1a0" + }, + "fastmap": { + "Package": "fastmap", + "Version": "1.1.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "f7736a18de97dea803bde0a2daaafb27" + }, + "fontawesome": { + "Package": "fontawesome", + "Version": "0.5.2", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "htmltools", + "rlang" + ], + "Hash": "c2efdd5f0bcd1ea861c2d4e2a883a67d" + }, + "fs": { + "Package": "fs", + "Version": "1.6.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "methods" + ], + "Hash": "47b5f30c720c23999b913a1a635cf0bb" + }, + "glue": { + "Package": "glue", + "Version": "1.6.2", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "methods" + ], + "Hash": "4f2596dfb05dac67b9dc558e5c6fba2e" + }, + "highr": { + "Package": "highr", + "Version": "0.10", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "xfun" + ], + "Hash": "06230136b2d2b9ba5805e1963fa6e890" + }, + "htmltools": { + "Package": "htmltools", + "Version": "0.5.7", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + 
"base64enc", + "digest", + "ellipsis", + "fastmap", + "grDevices", + "rlang", + "utils" + ], + "Hash": "2d7b3857980e0e0d0a1fd6f11928ab0f" + }, + "jquerylib": { + "Package": "jquerylib", + "Version": "0.1.4", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "htmltools" + ], + "Hash": "5aab57a3bd297eee1c1d862735972182" + }, + "jsonlite": { + "Package": "jsonlite", + "Version": "1.8.8", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "methods" + ], + "Hash": "e1b9c55281c5adc4dd113652d9e26768" + }, + "knitr": { + "Package": "knitr", + "Version": "1.45", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "evaluate", + "highr", + "methods", + "tools", + "xfun", + "yaml" + ], + "Hash": "1ec462871063897135c1bcbe0fc8f07d" + }, + "lifecycle": { + "Package": "lifecycle", + "Version": "1.0.4", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cli", + "glue", + "rlang" + ], + "Hash": "b8552d117e1b808b09a832f589b79035" + }, + "magrittr": { + "Package": "magrittr", + "Version": "2.0.3", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "7ce2733a9826b3aeb1775d56fd305472" + }, + "memoise": { + "Package": "memoise", + "Version": "2.0.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "cachem", + "rlang" + ], + "Hash": "e2817ccf4a065c5d9d7f2cfbe7c1d78c" + }, + "mime": { + "Package": "mime", + "Version": "0.12", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "tools" + ], + "Hash": "18e9c28c1d3ca1560ce30658b22ce104" + }, + "rappdirs": { + "Package": "rappdirs", + "Version": "0.3.3", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "5e3c5dc0b071b21fa128676560dbe94d" + }, + "renv": { + "Package": "renv", + "Version": "1.0.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "utils" + ], + "Hash": "41b847654f567341725473431dd0d5ab" + }, + "rlang": { + "Package": "rlang", + "Version": "1.1.2", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "utils" + ], + "Hash": "50a6dbdc522936ca35afc5e2082ea91b" + }, + "rmarkdown": { + "Package": "rmarkdown", + "Version": "2.25", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "bslib", + "evaluate", + "fontawesome", + "htmltools", + "jquerylib", + "jsonlite", + "knitr", + "methods", + "stringr", + "tinytex", + "tools", + "utils", + "xfun", + "yaml" + ], + "Hash": "d65e35823c817f09f4de424fcdfa812a" + }, + "sass": { + "Package": "sass", + "Version": "0.4.7", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R6", + "fs", + "htmltools", + "rappdirs", + "rlang" + ], + "Hash": "6bd4d33b50ff927191ec9acbf52fd056" + }, + "stringi": { + "Package": "stringi", + "Version": "1.8.2", + "Source": "Repository", + "Repository": "https://carpentries.r-universe.dev", + "Requirements": [ + "R", + "stats", + "tools", + "utils" + ], + "Hash": "e68c45f81639001af5f1b15cd3599bbd" + }, + "stringr": { + "Package": "stringr", + "Version": "1.5.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cli", + "glue", + "lifecycle", + "magrittr", + "rlang", + "stringi", + "vctrs" + ], + "Hash": "960e2ae9e09656611e0b8214ad543207" + }, + "tinytex": { + "Package": "tinytex", + "Version": "0.49", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "xfun" + ], + "Hash": "5ac22900ae0f386e54f1c307eca7d843" + }, + "vctrs": { + 
"Package": "vctrs", + "Version": "0.6.5", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cli", + "glue", + "lifecycle", + "rlang" + ], + "Hash": "c03fa420630029418f7e6da3667aac4a" + }, + "xfun": { + "Package": "xfun", + "Version": "0.41", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "stats", + "tools" + ], + "Hash": "460a5e0fe46a80ef87424ad216028014" + }, + "yaml": { + "Package": "yaml", + "Version": "2.3.7", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "0d0056cc5383fbc240ccd0cb584bf436" + } + } +} diff --git a/site/README.md b/site/README.md new file mode 100644 index 00000000..42997e3d --- /dev/null +++ b/site/README.md @@ -0,0 +1,2 @@ +This directory contains rendered lesson materials. Please do not edit files +here.