Improve examples (#615)

Signed-off-by: Daena Rys <[email protected]> Co-authored-by: Tuomas Borman <[email protected]> Co-authored-by: TuomasBorman <[email protected]>
microbiome · Oct 2, 2024 · 239e338 · 239e338
1 parent 3296421
commit 239e338
Show file tree

Hide file tree

Showing 29 changed files with 327 additions and 378 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: OMA
 Title: Orchestrating Microbiome Analysis with Bioconductor
-Version: 0.98.25
-Date: 2024-04-26
+Version: 0.98.26
+Date: 2024-10-01
 Authors@R:
     c(person("Leo", "Lahti", role = c("aut"),
              comment = c(ORCID = "0000-0001-5537-637X")),
@@ -62,7 +62,6 @@ Suggests:
     gtools,
     gsEasy,
     igraph,
-    kableExtra,
     knitr,
     Maaslin2,
     mia,

diff --git a/inst/pages/agglomeration.qmd b/inst/pages/agglomeration.qmd
@@ -24,10 +24,10 @@ One of the main applications of taxonomic information in regards to count data
 is to agglomerate count data on taxonomic levels and track the influence of
 changing conditions through these levels. For this `mia` contains the
 `agglomerateByRank()` function. The ideal location to store the agglomerated data
-is as an alternative experiment.
+is as an alternative experiment(see [@sec-alt-exp]).
 
 ```{r}
-# Tranform data
+# Transform data
 tse <- transformAssay(tse, assay.type = "counts", method = "relabundance")
 # Agglomerate
 altExp(tse, "Family") <- agglomerateByRank(

diff --git a/inst/pages/alpha_diversity.qmd b/inst/pages/alpha_diversity.qmd
@@ -108,7 +108,7 @@ tse <- addAlpha(
     detection = 10)
 
 # Check some of the first values in colData
-head(tse$observed)
+tse$observed |> head()
 ```
 
 ::: {.callout-tip}
@@ -131,7 +131,7 @@ plotColData(
     "SampleType",
     colour_by = "Final_Barcode") +
     theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
-    labs(expression(Richness[Observed]))
+    labs(x = "Sample types", y = expression(Richness[Observed]))
 ```
 
 We can then analyze the statistical significance. We use the non-parametric
@@ -152,7 +152,7 @@ used function in _`picante`_ [@R_picante, @Kembel2010].
 
 ```{r phylo-div-1}
 tse <- addAlpha(tse, assay.type = "counts", index = "faith")
-head(tse$faith)
+tse$faith |> head()
 ```
 
 ::: {.callout-note}

diff --git a/inst/pages/beta_diversity.qmd b/inst/pages/beta_diversity.qmd
@@ -101,7 +101,7 @@ tse <- addDivergence(
     tse,
     assay.type = "counts",
     reference = "median",
-    FUN = vegan::vegdist)
+    FUN = getDissimilarity)
 ```
 
 ## Unsupervised ordination {#sec-unsupervised-ordination}
@@ -137,7 +137,7 @@ library(scater)
 # Run PCoA on relabundance assay with Bray-Curtis distances
 tse <- runMDS(
     tse,
-    FUN = vegan::vegdist,
+    FUN = getDissimilarity,
     method = "bray",
     assay.type = "relabundance",
     name = "MDS_bray")
@@ -202,7 +202,7 @@ tse <- runNMDS(
 ```
 
 Multiple ordination plots are combined into a multi-panel plot with the
-patchwork package, so that different methods can be compared to find
+`patchwork` package, so that different methods can be compared to find
 similarities between them or select the most suitable one to visualize
 beta diversity in the light of the research question.
 
@@ -292,11 +292,19 @@ distance calculations later on.
 
 # Let us see what happens when we operate with ntop highest variance features
 ntop <- 5
-tse_sub <- tse[head(rev(order(rowSds(assay(tse, "counts")))), ntop), ]
+
+# Calculate the standard deviations for each row
+row_sds <- rowSds(assay(tse, "counts"))
+
+# Get the indices of the top 5 rows with the highest standard deviations
+top_indices <- order(row_sds, decreasing = TRUE)[1:ntop]
+
+# Subset the tse object based on the top indices
+tse_sub <- tse[top_indices, ]
 ```
 
 Let us first convert the count assay to centered log ratio (clr) assay and
-calculate Aitchison distance without rarefaction:
+calculate MDS with Aitchison distance without rarefaction:
 
 ```{r}
 #| label: Aitchison distance without rarefaction
@@ -313,14 +321,14 @@ tse_sub <- transformAssay(
 # Run MDS on clr assay with Aitchison distance
 tse_sub <- runMDS(
   tse_sub,
-  FUN = vegan::vegdist,
+  FUN = getDissimilarity,
   method = "euclidean",
   assay.type = "clr",
   name = "MDS_aitchison"
   )
 ```
 
-The function `vegan::avgdist()` can use rarefaction to compute beta diversity:
+Then let's do the same with rarefaction:
 
 
 ```{r}
@@ -333,20 +341,17 @@ clr <- function (x) {
 
 # Run transformation after rarefactions before calculating the beta diversity..
 tse_sub <- runMDS(tse_sub,
-  assay.type="counts",
+  assay.type = "counts",
   ntop = nrow(tse_sub),
-  FUN = vegan::avgdist, # Custom beta diversity function that includes rarefaction
-  sample = min(colSums(assay(tse_sub, "counts"))), # rarefaction depth
-  distfun = vegan::vegdist, # Use vegdist beta diversity function
-  dmethod = "euclidean", # euclidean beta diversity
-  iterations = 10,
-  transf=clr,
-  replace=TRUE,
+  FUN = getDissimilarity,
+  method = "euclidean",
+  niter = 10, # Number of iterations
+  sample = min(colSums(assay(tse_sub, "counts"))), # Rarefaction depth
+  transf = clr, # Applied transformation
+  replace = TRUE,
   name = "MDS_aitchison_rarefied"
   )
 
-plotReducedDim(tse_sub, "MDS_aitchison_rarefied")
-
 # Generate plots for non-rarefied and rarefied Bray-Curtis distances scaled by
 # MDS
 plots <- lapply(
@@ -359,57 +364,50 @@ wrap_plots(plots) +
   plot_layout(guides = "collect")
 ```
 
-```{r}
-#| label = rarefaction3
-
-# Check correlation between non-rarefied and rarefied Aichitson distances
-print(all.equal(
-  current = cor(
-    as.vector(reducedDim(tse_sub, "MDS_aitchison")),
-    as.vector(reducedDim(tse_sub, "MDS_aitchison_rarefied"))
-    ),
-    target = 1,
-    tolerance = 0.05))
-```
-
 `ntop` is a `runMDS()` option. Here it is set to total number of features in 
 `GlobalPatterns` dataset so that they are not filtered. If `ntop` was not set to 
 the total number of features, only the `ntop` features with highest variance would
 be used for dimensionality reduction and the other features would be filtered.
 
+When rarefaction is not done, the `getDissimilarity()` utilizes
+`vegan::vegdist()` function while in rafection `vegan::avgdist()` is applied.
 The argument `sample` is set to the smallest metagenomic reads count across
 all samples. This ensures that no sample will be removed during the rarefaction
 process.
 
-The argument `iterations` is by default set to 100 but 10 iterations is often
+The argument `niter` is by default set to 100 but 10 iterations is often
 sufficient for beta diversity calculations.
 
 To use transformations after the rarefaction of the samples and before the beta 
-diversity calculation, `vegan::avgdist()` has the argument `transf`.
+diversity calculation,  we can use the argument `transf`.
 
 We can also plot the correlation between principal coordinates PCx and PCy for
 both the rarefied and non-rarefied distance calculations:
 
 ```{r}
 #| label: PC_correlation
-par(mfrow=c(1, 2))
-for (k in 1:2) {
 
+library(ggpubr)
+
+p <- lapply(1:2, function(i){
   # Principal axes are sign invariant;
   # align the signs; if the correlation is negative then swap the other axis
-  pcx <- reducedDim(tse_sub, "MDS_aitchison")[,k]
-  pcy <- reducedDim(tse_sub, "MDS_aitchison_rarefied")[,k]
-  pcy <- sign(cor(pcx, pcy)) * pcy
-  r <- cor(pcx, pcy)
-
-  plot(pcx, pcy, main=paste0("PC", k, "; r=",round(r, 3)));
-  abline(0, 1)
-}
+  original <- reducedDim(tse_sub, "MDS_aitchison")[, i]
+  rarefied <- reducedDim(tse_sub, "MDS_aitchison_rarefied")[, i]
+
+  temp  <- ggplot(data = data.frame(original, rarefied), aes(x = original, y = rarefied)) +
+    geom_point() + geom_smooth(method = "lm") +
+    stat_cor(method = "pearson") +
+    labs(title = paste0("Principal coordinate ", i))
+  return(temp)
+})
+wrap_plots(p)
 ```
 
-The mia package includes the rarefaction function `rarefyAssay()` that randomly
-subsamples a given assay of a `TreeSummarizedExperiment` dataset. However, the 
-iterations and beta diversity calculations are not included.
+The `mia` package's `addAlpha()` and `getDissimilarity()` functions support
+rarefaction in alpha and beta diversity calculations. Additionally, the
+`rarefyAssay()` function allows random subsampling of a given assay within a
+`TreeSummarizedExperiment` dataset.
 
 ### Other ordination methods {#sec-other-ord-methods}
 
@@ -468,14 +466,15 @@ better if smaller. This can be calculated as shown below.
 
 ```{r relstress}
 # Quantify dissimilarities in the original feature space
-d0 <- as.matrix(vegan::vegdist(t(assay(tse, "relabundance")), "bray"))
+d0 <- as.matrix(getDissimilarity(t(assay(tse, "relabundance")), "bray"))
 
 # PCoA Ordination
-tse <- runMDS(tse,
-                FUN = vegan::vegdist,
-                name = "PCoA",
-                method = "bray",
-                assay.type = "relabundance")
+tse <- runMDS(
+  tse,
+  FUN = getDissimilarity,
+  name = "PCoA",
+  method = "bray",
+  assay.type = "relabundance")
 
 # Quantify dissimilarities in the ordination space
 dp <- as.matrix(dist(reducedDim(tse, "PCoA")))
@@ -573,7 +572,7 @@ the community are equivalent between the compared groups. A p-value
 smaller than the significance threshold indicates that the groups have
 a different community composition. This method is implemented with the
 [`adonis2`](https://www.rdocumentation.org/packages/vegan/versions/2.4-2/topics/adonis)
-function from the vegan package. You can find more on PERMANOVA from
+function from the `vegan` package. You can find more on PERMANOVA from
 [here](https://microbiome.github.io/OMA/docs/devel/pages/97_extra_materials.html#compare-permanova).
 
 We see that both clinical status and age explain more than 10% of the
@@ -696,8 +695,9 @@ Next, we perform PCoA with Bray-Curtis dissimilarity.
 ```{r}
 tse_genus <- runMDS(
     tse_genus,
-    FUN = vegan::vegdist,
+    FUN = getDissimilarity,
     name = "PCoA_BC",
+    method = "bray",
     assay.type = "relabundance")
 ```
 

diff --git a/inst/pages/clustering.qmd b/inst/pages/clustering.qmd
@@ -39,7 +39,6 @@ as hierarchical clustering, DBSCAN, and K-means.
 ```{r load_bluster}
 # Load dependencies
 library(bluster)
-library(kableExtra)
 ```
 
 In the examples of this chapter we use *peerj13075* data for
@@ -91,7 +90,7 @@ altExp(tse, "prevalent") <- addCluster(
     altExp(tse, "prevalent"),
     assay.type = "counts",
     by = "cols",
-    HclustParam(method = "ward.D2", dist.fun = vegdist, metric = "bray"),
+    HclustParam(method = "ward.D2", dist.fun = getDissimilarity, metric = "bray"),
     full = TRUE,
     clust.col = "hclust")
 
@@ -128,18 +127,19 @@ altExp(tse, "prevalent") <- addDominant(altExp(tse, "prevalent"))
 labels(dend) <- altExp(tse, "prevalent")$dominant_taxa
 dend <- color_labels(dend, k = k, col = unlist(cols))
 
-# Plot dendrogram
+# Adjust frame for the figure
 par(mar=c(9, 3, 0.5, 0.5))
-dend %>% set("labels_cex", 0.8) %>% plot()
+# Plot dendrogram
+dend |> set("labels_cex", 0.8) |> plot()
 ```
 
 We can also visualize the clusters by projecting the data onto
 two-dimensional
 surface that captures the most variability in the data. In this example, we use
-multi-dimensional scaling (MDS).
+multi-dimensional scaling (MDS) (see [@sec-unsupervised-ordinantion]).
 
 ```{r hclust3}
-library (scater)
+library(scater)
 
 # Add the MDS dimensions for plotting
 altExp(tse, "prevalent") <- runMDS(
@@ -266,26 +266,8 @@ we can plot which taxa define the key cluster features.
 best_model <- metadata(altExp(tse, "prevalent"))$DMM$dmm[2]
 drivers <- as.data.frame(best_model[[1]]@fit$Estimate)
 
-# Clean names
-drivers$class <- rownames(drivers)
-
-for (i in 1:2) {
-  # Transfer to relative to optimize plotting
-  drivers[i] <- drivers[i]/sum(drivers[i])
-
-  drivers <- drivers[order(drivers[[i]], decreasing = TRUE),]
-  p <- ggplot(head(drivers, 5), aes(
-      x = reorder(head(class, 5), + head(drivers[[i]], 5)),
-      y = head(drivers[[i]], 5))) +
-
-      geom_bar(stat = "identity", fill = scales::hue_pal()(1), alpha = 0.6) +
-      coord_flip() +
-      labs(title = paste("Top phyla in group", i)) +
-      theme_minimal(base_size = 11) +
-      labs(x="", y="") +
-      scale_y_continuous(limits=c(0,1))
-  print(p)
-}
+# Plot by utilizng miaViz::plotLoadings
+plotLoadings(as.matrix(drivers), ncomponents = 2)
 ```
 
 As well as in hierarchical clustering, we can also visualize the clusters by