Rock_weathering_analysis4paper.RMD

---
title: "Role of BRC in arid rock weathering"
subtitle: "Data analysis and plotting for publication"
author: "Roey Angel (<roey.angel@bc.cas.cz>)"
date: "`r Sys.Date()`"
bibliography: references.bib
link-citations: yes
always_allow_html: yes
output:
  rmarkdown::html_document:
    toc: true
    toc_float: true
    keep_md: true
    keep_tex: true
    number_sections: false
    highlight: "pygments"
    theme: "flatly"
    dev: "png"
    df_print: "kable"
    fig_caption: true
    code_folding: "show"
---

```{r libraries, include=F}
# Load libraries
#.libPaths(c('~/R/library', .libPaths())) # Uncomment if you have no write access to R path

repo <- "http://cran.wu.ac.at"

.cran_libs <- c(
  "devtools", # Tools to Make Developing R Packages Easier
  "knitr", # A General-Purpose Package for Dynamic Report Generation in R
  "kableExtra", # Construct Complex Table with 'kable' and Pipe Syntax
  "rmarkdown", # Dynamic Documents for R
  "extrafont", # for extra figure fonts
  "MASS", # Support Functions and Datasets for Venables and Ripley's MASS
  "rcompanion", #Functions to Support Extension Education Program Evaluation
  "tidyverse", # for dplyr forcats ggplot2 readr tibble
  "cowplot", # wrappers for ggplot
  "magrittr", # pipes
  "scales", # Generic plot scaling methods
  "svglite", # for svg files
  "car", # Companion to Applied Regression
  "userfriendlyscience", # Quantitative Analysis Made Accessible
  "agricolae", # Statistical Procedures for Agricultural Research
  "vegan", # community ecology methods
  "doParallel", # parallel backend for the foreach/%dopar% function
  "BiodiversityR", # Package for Community Ecology and Suitability Analysis
  "hexbin", # Hexagonal Binning Routines
  "ggtern", # An Extension to 'ggplot2', for the Creation of Ternary Diagrams
  "nlme", # Linear and Nonlinear Mixed Effects Models 
  "MuMIn" # Multi-Model Inference
) 

# .inst <- .cran_libs %in% installed.packages()
# if (any(!.inst)) {
#   install.packages(.cran_libs[!.inst],
#                    repos = repo)
# }

.bioc_libs <- c(
  "phyloseq", # Handling and analysis of high-throughput phylogenetic sequence data
  "ALDEx2", # Analysis Of Differential Abundance Taking Sample Variation Into Account
  "vsn" # Variance stabilization and calibration for microarray data
)

# if (!"phyloseq" %in% installed.packages()) {
#   source(
#     "https://raw.githubusercontent.com/joey711/phyloseq/master/inst/scripts/installer.R",
#     local = TRUE
#  )
# }

# .inst <- .bioc_libs %in% installed.packages()
# if (any(!.inst)) {
#   source("http://bioconductor.org/biocLite.R")
#   biocLite(ask = F, lib = Sys.getenv("R_LIBS_USER")) # upgrade bioC packages
#   biocLite(.bioc_libs[!.inst], ask = F)
# }

.github_libs <- c(
  "gadenbuie/ggpomological", # Pomological plot themes for ggplot2
  "datarootsio/artyfarty" # Themes for ggplot2
)

.github_lib_names <- stringr::str_replace(.github_libs, ".*/(.*)$", "\\1")

# .github_inst <- .github_lib_names %in% installed.packages()
# if (any(!.github_inst)) {
#   devtools::install_github(.github_libs[!.github_inst],
#                            dependencies = TRUE)
# }

# Load packages into session, and print package version
(loaded.libs <- sapply(c(.cran_libs, .bioc_libs, .github_lib_names), require, character.only = TRUE))
if (!all(loaded.libs)) {stop(paste("Package(s):", names(loaded.libs[loaded.libs == FALSE]), "could not be loaded"))}
# sapply(c(.cran_libs, .bioc_libs, .github_lib_names), packageVersion)
```

```{r style settings, include=F}
graphic_device <- "svglite"
options(width = 90, knitr.table.format = "html") 
opts_chunk$set(
  warning = FALSE,
  message = FALSE,
  cache = TRUE,
  dev = graphic_device,
  fig.ext = "svg",
  #  fig.width=12,
  #  fig.height=8,
  cache.path = "Rock_weathering_cache/",
  fig.path = "Rock_weathering_figures/"
)

f_name <- "DejaVu Sans" #sub("\\s//", "", f_name)
f_size <- 12
font_import(pattern = "DejaVuSans", prompt = FALSE)
loadfonts() # registers fonts
theme_set(theme_bw(base_size = f_size, base_family = f_name))
pom4 <- ggpomological:::pomological_palette[c(2, 1, 9, 3)]
pom2 <- c(ggpomological:::pomological_base[[7]], ggpomological:::pomological_palette[[1]])
```

```{r functions, include=F}
PlotLibDist <- function(physeq) {
  ggplot(sample_data(physeq),
         aes(x = Replicate, y = Lib.size, fill = Source)) +
    geom_bar(stat = "identity",
             position = "dodge",
             color = "black") +
    scale_y_log10(
      breaks = trans_breaks("log10", function(x)
        10 ^ x),
      labels = trans_format("log10", math_format(10 ^ .x))
    ) +
    background_grid(major = "xy", minor = "none") +
    # scale_fill_locuszoom() +
    facet_grid(Climate ~ .)
}

PlotReadHist <- function(OTUmat, b.width = 10) {
  OTUmat %>%
    t() %>%
    as.tibble() %>%
    gather(key = sample, value = abundance) %>%
    ggplot(aes(abundance)) +
    # geom_histogram(binwidth = 1000) +
    geom_freqpoly(binwidth = b.width) +
    scale_y_log10()
}

GMPR <- function(comm,
                  intersect.no = 4,
                  ct.min = 4) {
  require(matrixStats)
  # Taken from: https://github.com/jchen1981/GMPR
  # 
  # Computes the GMPR size factor
  #
  # Args:
  #   comm: a matrix of counts, row - features (OTUs, genes, etc) , column - sample
  #   intersect.no: the minimum number of shared features between sample pair, where the ratio is calculated
  #   ct.min: the minimum number of counts required to calculate ratios （Empirical study found ct.min=4 is suitable)
  
  #
  # Returns:
  #   a list that contains:
  #      gmpr： the GMPR size factors for all samples; Samples with distinct sets of features will be output as NA.
  #      nss:   number of samples with significant sharing (> intersect.no) including itself
  
  # mask counts < ct.min
  comm[comm < ct.min] <- 0
  
  if (is.null(colnames(comm))) {
    colnames(comm) <- paste0('S', 1:ncol(comm))
  }
  
  cat('Begin GMPR size factor calculation ...\n')
  
  comm.no <- numeric(ncol(comm))
  gmpr <- sapply(1:ncol(comm),  function(i) {
    if (i %% 50 == 0) {
      cat(i, '\n')
    }
    x <- comm[, i]
    # Compute the pairwise ratio
    pr <- x / comm
    # Handling of the NA, NaN, Inf
    pr[is.nan(pr) | !is.finite(pr) | pr == 0] <- NA
    # Counting the number of non-NA, NaN, Inf
    incl.no <- colSums(!is.na(pr))
    # Calculate the median of PR
    pr.median <- colMedians(pr, na.rm = TRUE)
    # Record the number of samples used for calculating the GMPR
    comm.no[i] <<- sum(incl.no >= intersect.no)
    # Geometric mean of PR median
    if (comm.no[i] > 1) {
      return(exp(mean(log(pr.median[incl.no >= intersect.no]))))
    } else {
      return(NA)
    }
  })
  
  if (sum(is.na(gmpr))) {
    warning(
      paste0(
        'The following samples\n ',
        paste(colnames(comm)[is.na(gmpr)], collapse = '\n'),
        '\ndo not share at least ',
        intersect.no,
        ' common taxa with the rest samples! ',
        'For these samples, their size factors are set to be NA! \n',
        'You may consider removing these samples since they are potentially outliers or negative controls!\n',
        'You may also consider decreasing the minimum number of intersecting taxa and rerun the procedure!\n'
      )
    )
  }
  
  cat('Completed!\n')
  cat(
    'Please watch for the samples with limited sharing with other samples based on NSS! They may be outliers! \n'
  )
  names(gmpr) <- names(comm.no) <- colnames(comm)
  return(list(gmpr = gmpr, nss = comm.no))
}

PlotLmResid <- function(lm.df, which = c(1:6), mfrow = c(3, 2)){
  require(grid)

  if (length(levels(as.factor(lm.df$.fitted))) < 10) {# if number of unique x values is <10 just draw a line through the means
    smoother <- stat_summary(fun.y = mean, colour = "red", geom = "line")
  } else smoother <- stat_smooth(method = "loess", geom = "smooth", se = FALSE, colour = "firebrick", size = 1)
  
  # residuals vs fitted
  g1 <- ggplot(lm.df, aes(.fitted, .resid)) +
    geom_point()  +
    smoother + 
    geom_hline(yintercept = 0, linetype = 2, size = .2) +
    scale_x_continuous("Fitted Values") +
    scale_y_continuous("Residual") +
    labs(title = "Residuals vs Fitted")
  
  # normal qq
  a <- quantile(lm.df$.stdresid, c(0.25, 0.75), na.rm = TRUE)
  b <- qnorm(c(0.25, 0.75))
  slope <- diff(a)/diff(b)
  int <- a[1] - slope * b[1]
  g2 <- ggplot(lm.df, aes(sample = .stdresid)) +
    stat_qq() +
    geom_abline(slope = slope, intercept = int, colour = "firebrick", size = 1) +
      scale_x_continuous("Theoretical Quantiles") +
      scale_y_continuous("Standardized Quantiles") +
      labs(title = "Normal Q-Q")
 
  # scale-location
  g3 <- ggplot(lm.df, aes(.fitted, sqrt(abs(.stdresid)))) +
    geom_point() +
    smoother +
    scale_x_continuous("Fitted Values") +
    scale_y_continuous("Root of Standardized Residuals") +
    labs(title = "Scale-Location")
 
  # residuals vs leverage
  g4 <- ggplot(lm.df, aes(factors, .stdresid)) +
    geom_point() +
    smoother +
    geom_hline(yintercept = 0, linetype = 2, size = .2) +
    scale_x_continuous("Factor Level Combinations") +
    scale_y_continuous("Standardized Residuals") +
    labs(title = "Residuals vs Factor Levels")
 
#   # cook's distance
#   g5 <-  ggplot(lm.df, aes(rows, .cooksd, ymin=0, ymax=.cooksd)) +
#     geom_point() + geom_linerange() +
#     scale_x_continuous("Observation Number") +
#     scale_y_continuous("Cook's distance") +
#     labs(title="Cook's Distance")  
  
  # cooksd vs leverage
  g5 <- ggplot(lm.df, aes(factors, .cooksd)) +
    geom_point() +
    smoother +
    scale_x_continuous("Factor Level Combinations") +
    scale_y_continuous("Cook's distance") +
    labs(title = "Cook's dist vs Leverage")
  
  # g6 <- PlotACF(lm.df)
  bw <- diff(range(lm.df$.resid)) / (2 * IQR(lm.df$.resid) / length(lm.df$.resid) ^ (1/3))
  sshist <- function(x){ # optimise bins
  # 2006 Author Hideaki Shimazaki
  # Department of Physics, Kyoto University
  # shimazaki at ton.scphys.kyoto-u.ac.jp
	N <- 2 : 100
	C <- numeric(length(N))
	D <- C

	for (i in 1:length(N)) {
		D[i] <- diff(range(x)) / N[i]

		edges = seq(min(x), max(x), length=N[i])
		hp <- hist(x, breaks = edges, plot=FALSE)
		ki <- hp$counts

		k <- mean(ki)
		v <- sum((ki-k) ^ 2) / N[i]

		C[i] <- (2 * k-v) / D[i] ^ 2	#Cost Function
	}

	idx <- which.min(C)
	optD <- D[idx]

	bins <- seq(min(x), max(x), length=N[idx])
	# h = hist(x, breaks = bins)
	# rug(x)

	return(bins)
  }
  
  bins <- sshist(lm.df$.resid)
  g6 <- ggplot(lm.df, aes(.resid)) + 
    geom_histogram(breaks = bins)
 
  plots <- list(g1, g2, g3, g4, g5, g6)
 
  # making the plots
  grid.newpage()
 
  if (prod(mfrow) > 1) {
    mypos <- expand.grid(1:mfrow[1], 1:mfrow[2])
    mypos <- mypos[with(mypos, order(Var1)), ]
    pushViewport(viewport(layout = grid.layout(mfrow[1], mfrow[2])))
    formatter <- function(.){}
  } else {
    mypos <- data.frame(matrix(1, length(which), 2))
    pushViewport(viewport(layout = grid.layout(1, 1)))
    formatter <- function(.) {
      .dontcare <- readline("Hit <Return> to see next plot: ")
      grid.newpage()
    }
  }
 
  j <- 1
  for (i in which) {
    formatter()
    print(plots[[i]], vp = viewport(layout.pos.row = mypos[j, ][1], layout.pos.col = mypos[j, ][2]))
    j <- j + 1
  }
}

PlotACF <- function(lm.df){
  ## generate ACF plot for lm and lme
  # compute acf without plotting
  acz <- acf(lm.df$.resid, plot = F)
  # convert to data frame
  acd <- data.frame(lag = acz$lag, acf = acz$acf)
  # use data frame for ggplot
  ggplot(acd, aes(lag, acf)) + 
    geom_bar(colour = "black", fill = "black", stat = "identity", position = "dodge", width = 0.01) +
    geom_point(colour = "black") +
    geom_hline(yintercept = c(0.05, -0.05), linetype = "dashed") +
    geom_hline(yintercept = 0)
}

TestAlpha <-
  function(data2test = Richness_Diversity_long,
           response_name = "Estimate",
           factor_names = c("Climate", "Uni.Source"),
           boxcox.trans = FALSE) {
    print(leveneTest(as.formula(paste(response_name, paste(factor_names[1], factor_names[2], sep = " * "), sep = " ~ ")), data2test)) # test for homogeneity
    mod_data <-
        aov(as.formula(
          paste(response_name, paste(factor_names[1], factor_names[2], sep = " * "), sep = " ~ ")
        ), data2test)
    
    if (boxcox.trans) { # employ boxcox transformation then recalculate model
      print("Performing Box-Cox transformation of the data")
      lambdas <- boxcox(as.formula(
          paste(response_name, paste(factor_names[1], factor_names[2], sep = " * "), sep = " ~ ")
        ), data=data2test, lambda = seq(0, 1.0, 0.01))
      print(range(lambdas$x[lambdas$y > max(lambdas$y) - qchisq(0.95,1)/2]))
      print(l.max <- lambdas$x[which.max(lambdas$y)])
      if (l.max == 0) l.max <- 1
      data2test$Estimate.box <- (data2test$Estimate ^ l.max - 1)/l.max
      mod_data <-
        aov(as.formula(
          paste("Estimate.box", paste(factor_names[1], factor_names[2], sep = " * "), sep = " ~ ")
        ), data2test)
    }
    
    print(mod_data)
    mod_data_df <- fortify(mod_data)
    factor.combinations <-
      as.numeric(factor(
        paste(mod_data_df[, factor_names[1]], mod_data_df[, factor_names[2]]),
        levels = unique(as.character(paste(
          mod_data_df[, factor_names[1]], mod_data_df[, factor_names[2]]
        )))
      )) # needed for "residuals vs leverage
    mod_data_df <-
      cbind(mod_data_df,
            rows = 1:nrow(mod_data_df),
            factors = factor.combinations)
    PlotLmResid(mod_data_df)
    print(summary(mod_data)) # display Type I ANOVA table
    # drop1(mod_amp.2,~.,test="F") # type III SS and F Tests
    print(model.tables(mod_data,"means"), digits = 3) # Show the means
    return(mod_data)
  }

PairwiseAdonis <- function(x, factors, sim.function = "vegdist", sim.method = "bray", 
    p.adjust.m = "BH", reduce = NULL) 
{
  # Taken from: https://github.com/pmartinezarbizu/pairwiseAdonis
    co <- combn(unique(as.character(factors)), 2)
    pairs <- c()
    total.DF <- c()
    F.Model <- c()
    R2 <- c()
    p.value <- c()
    for (elem in 1:ncol(co)) {
        if (sim.function == "daisy") {
            x1 = daisy(x[factors %in% c(co[1, elem], co[2, elem]), 
                ], metric = sim.method)
        }
        else {
            x1 = vegdist(x[factors %in% c(co[1, elem], co[2, 
                elem]), ], method = sim.method)
        }
        ad <- adonis(x1 ~ factors[factors %in% c(co[1, elem], 
            co[2, elem])], permutations = 9999)
        pairs <- c(pairs, paste(co[1, elem], "vs", co[2, elem]))
        total.DF <- c(total.DF, ad$aov.tab["Total", 1])
        F.Model <- c(F.Model, ad$aov.tab[1, 4])
        R2 <- c(R2, ad$aov.tab[1, 5])
        p.value <- c(p.value, ad$aov.tab[1, 6])
    }
    p.adjusted <- p.adjust(p.value, method = p.adjust.m)
    sig = c(rep("", length(p.adjusted)))
    sig[p.adjusted <= 0.05] <- "."
    sig[p.adjusted <= 0.01] <- "*"
    sig[p.adjusted <= 0.001] <- "**"
    sig[p.adjusted <= 1e-04] <- "***"
    pairw.res <- data.frame(pairs, total.DF, F.Model, R2, p.value, 
        p.adjusted, sig)
    if (!is.null(reduce)) {
        pairw.res <- subset(pairw.res, grepl(reduce, pairs))
        pairw.res$p.adjusted <- p.adjust(pairw.res$p.value, method = p.adjust.m)
        sig = c(rep("", length(pairw.res$p.adjusted)))
        sig[pairw.res$p.adjusted <= 0.05] <- "."
        sig[pairw.res$p.adjusted <= 0.01] <- "*"
        sig[pairw.res$p.adjusted <= 0.001] <- "**"
        sig[pairw.res$p.adjusted <= 1e-04] <- "***"
        pairw.res <- data.frame(pairw.res[, 1:5], sig)
    }
    class(pairw.res) <- c("pwadonis", "data.frame")
    return(pairw.res)
}

STAMPR <- function(physeq_obj, rank = "Phylum", sig_pairs, threshold = 0.005) {
  # run a STAMP-like analysis: compare relative abundance differences using two-way analysis, then run a post-hoc test and correct for multiple comparisons
  
  physeq_glom <- tax_glom(physeq_obj,
                          rank,
                          NArm = TRUE)
  physeq_glom_rel <-
    transform_sample_counts(physeq_glom, function(x)
      x / sum(x))
  
  # group dataframe by rank, calculate median rel. abundance and keep taxa above threshold
  physeq_glom_rel %>% 
    psmelt %>% 
    group_by_(rank) %>%
    summarise(median = median(Abundance)) %>% 
    filter(median >= threshold) %>% 
    pull(1) %>% 
    as.character() ->
    Taxa2test
  
  physeq_glom_rel_abund <- prune_taxa(tax_table(physeq_glom_rel)[, rank] %in% Taxa2test, physeq_glom_rel)
  
  taxa_test_results <-
    bind_cols(Phylum = tax_table(physeq_glom_rel_abund)[, rank],
              as.data.frame(matrix(
                NA,
                nrow = ntaxa(physeq_glom_rel_abund),
                ncol = length(sig_pairs) + 6 # sig_pairs is taken from pairwise adonis
              )))
  colnames(taxa_test_results) <-
    c(
      rank,
      "Climate - P",
      "Climate - Eta",
      "Source - P",
      "Source - Eta",
      "ClimateXSource - P",
      "ClimateXSource - Eta",
      sig_pairs
    )
  
  taxa_test_stats <-
    bind_cols(Phylum = tax_table(physeq_glom_rel_abund)[, rank],
              as.data.frame(matrix(
                NA,
                nrow = ntaxa(physeq_glom_rel_abund),
                ncol = (length(sig_pairs) * 5) # sig_pairs is taken from pairwise adonis
              )))
  colnames(taxa_test_stats) <-
    c(rank, c(rbind(
      t(str_split_fixed(sig_pairs, " vs ", n = 2)), matrix(rep(
        c("Estimate diff.", "low CI", "high CI"), length(sig_pairs)
      ), ncol = length(sig_pairs))
    ))) # this is ugly but it works well, basically c() flattens a matrix
  
  for (phy_id in seq(ntaxa(physeq_glom_rel_abund))) {
    data2test <-
      bind_cols(Abundance = otu_table(physeq_glom_rel_abund)[, phy_id] * 100,
                as(sample_data(physeq_glom_rel_abund), "data.frame"))
    # kruskal.test(Abundance ~ Climate.Source, data = data2test)
    print(tax_table(physeq_glom_rel_abund)[phy_id, rank])
    print(SRH_mod <-
            scheirerRayHare(Abundance ~ Climate + Source, data = data2test))
    taxa_test_results[phy_id, c(2, 4, 6)] <-
      SRH_mod$p.value[1:3] # p values
    taxa_test_results[phy_id, c(3, 5, 7)] <-
      SRH_mod$`Sum Sq`[1:3] / sum(SRH_mod$`Sum Sq`) # Eta (effect size)
    for (pair in seq(length(sig_pairs))) {
      pair2test <- unlist(str_split(sig_pairs[pair], " vs "))
      possibleError <- tryCatch(
        wilcox_mod <-
          wilcox.test(
            Abundance ~ Climate.Source,
            data = data2test,
            subset = Climate.Source %in% pair2test,
            conf.int = TRUE,
            exact = FALSE
          ),
        error = function(e) e
      ) # AKA Mann Whitney
      if (inherits(possibleError, 'error')) {
        print(possibleError)
        taxa_test_results[phy_id, pair + 7] <- NA
      } else {
        print(wilcox_mod)
        taxa_test_results[phy_id, pair + 7] <- wilcox_mod$p.value
        taxa_test_stats[phy_id, (pair - 1) * 5 + 2] <-
          mean(data2test[data2test$Climate.Source %in% pair2test[1],]$Abundance)
        taxa_test_stats[phy_id, (pair - 1) * 5 + 3] <-
          mean(data2test[data2test$Climate.Source %in% pair2test[2],]$Abundance)
        taxa_test_stats[phy_id, (pair - 1) * 5 + 4] <-
          wilcox_mod$estimate
        taxa_test_stats[phy_id, (pair - 1) * 5 + c(5, 6)] <-
          wilcox_mod$conf.int[c(1, 2)]
      }
    }
  }
  
  # Correct for FDR for each comparison pair
  for (pair in seq(2, ncol(taxa_test_results))) {
    # print(pair)
    taxa_test_results[, pair] <-
      p.adjust(pull(taxa_test_results[, pair]) , method = "BH") # Benjamini, Y., and Yekutieli, D. (2001). The control of the false discovery rate in multiple testing under dependency. Annals of Statistics 29, 1165–1188.
    # qvalue(p = pull(taxa_test_results[, pair]))
  }
  
  write.csv(taxa_test_results, file = paste0("Results/", deparse(substitute(physeq_obj)), "_", rank, "_Pvals.csv"))
  write.csv(taxa_test_stats, file = paste0("Results/", deparse(substitute(physeq_obj)), "_", rank, "_CI.csv"))
  Taxa_tests <- list(taxa_test_results, taxa_test_stats)
  return(Taxa_tests)
}

CalcALDEx <- function(physeq_obj = Rock_weathering_filt3_Rocks, sig_level = 0.05, LFC = 0.322) {
  physeq_obj <- filter_taxa(physeq_obj, function(x) sum(x) > 0, TRUE)
  # physeq_obj <- prune_taxa(sig_taxa, physeq_obj) # remove taxa not significant under the full model
  data2test <- t(otu_table(physeq_obj))
  comparison <- as.character(unlist(sample_data(physeq_obj)[, "Climate.Source"]))
  ALDEx <- aldex.clr(
    data2test,
    comparison,
    mc.samples = 128,
    denom = "iqlr", # iqlr for slight assymetry in composition
    verbose = TRUE,
    useMC = TRUE
  ) 
  ALDEx_tt <- aldex.ttest(ALDEx, comparison, paired.test = FALSE) # for two conditions
  ALDEx_effect <- aldex.effect(
    ALDEx,
    comparison,
    include.sample.summary = TRUE,
    verbose = TRUE,
    useMC = TRUE
  ) # estimate effect sizes
  ALDEx2plot <- PrepAlDExData(ALDEx_tt, ALDEx_effect, physeq_obj, sig_level, LFC, Taxa_rank)
  return(ALDEx2plot)
}

PrepAlDExData <- function(ALDEx_tt, ALDEx_effect, physeq_obj = Rock_weathering_filt3_Rocks, sig_level, LFC, Taxa_rank) {
  ALDEx2plot <- data.frame(ALDEx_tt, ALDEx_effect) # merge results
  # group dataframe by OTU, calculate median rel. abundance
  physeq_obj %>%
    transform_sample_counts(., function(x) x / sum(x) * 100) %>% 
    psmelt() %>%
    group_by(OTU) %>%
    # filter(OTU %in% sig_taxa) %>%
    summarise(baseMean = mean(Abundance)) ->
    baseMean
  
  ALDEx2plot$OTU <- rownames(ALDEx2plot)
  ALDEx2plot %<>% left_join(., baseMean, by = "OTU") # add mean abundance to results table
  ALDEx2plot$Phylum <-
    tax_table(physeq_obj)[taxa_names(physeq_obj) %in% ALDEx2plot$OTU, "Phylum"] # add phylum data
  # change their name to "Rare"
  ALDEx2plot[ALDEx2plot$Phylum %in% Rare_phyla,]$Phylum <- 'Rare' # Rare_phyla is calcuted for the taxa box plots
  ALDEx2plot$Significance <- factor("Fail", levels = c("Fail", "Pass")) # define significance factor
  ALDEx2plot$Significance[ALDEx2plot$wi.eBH < sig_level &
                            !is.na(ALDEx2plot$wi.eBH) &
                            abs(ALDEx2plot$effect) > LFC] <- "Pass"
  # ALDEx2plot$Significance <- as.factor(sapply(ALDEx2plot$wi.eBH, function(x) if (is.na(x) | x > 0.05) {x <- "Fail"} else {x <- "Pass"}))
  # Rank by taxa abundance
  ALDEx2plot$Phylum %<>%
    factor(., levels = Taxa_rank$Phylum) %>%  # Taxa_rank is calcuted for the taxa box plots
    fct_relevel(., "Rare", after = Inf)
  return(ALDEx2plot)
}

GGPlotALDExTax <- function(ALDEx2plot, OTU_labels = FALSE, Taxa = "Phylum", Y_val = "effect", sig_level = 0.05) {
  p <-
    ggplot(ALDEx2plot) +
    geom_jitter(aes_string(
             x = Taxa,
             y = Y_val,
             colour = "Significance",
             size = "baseMean"),
             alpha = 2 / 3, 
             width = 0.3,
             stroke = 0) +
    xlab("") +
    ylab(expression(paste("Effect size (lo", g[2], " fold change)"))) +
    # ylab("Fold change") +
    labs(colour = paste("Significance at \n p <", sig_level), size = "Mean count (%)") +
    theme_grey(base_size = 18,  base_family = f_name) +
    theme(axis.text.x = element_text(angle = 45.0, vjust = 1, hjust = 1)) +
    guides(colour = guide_legend(override.aes = list(size = 5))) +
    scale_colour_manual(values = pom2) +
    scale_size_continuous(range = c(1, 5), breaks = c(1, 2.5, 5, 10))
  
  if (OTU_labels) {
    p <- p + geom_text_repel(
      aes_string(x = Taxa, y = Y_val),
      size = 8,
      label = sub("OTU_([0-9]+)", "\\1", rownames(ALDEx2plot[ALDEx2plot$Significance == "Pass", ])),
      data = ALDEx2plot[ALDEx2plot$Significance == "Pass", ],
      nudge_x = 0.4,
      colour = "#707070"
    )
  }
  return(p)
}

gz <- function(in_path, out_path = tempfile()) {
  # Compress a file using gz and delete the uncompressed file
  out <- gzfile(out_path, "w")
  writeLines(readLines(in_path), out)
  close(out)

  file.remove(in_path)
  invisible(out_path)
}

```
## New insights into the role of epilithic biological crust in arid rock weathering
This script reproduces all sequence analysis steps and plots included in the paper plus some additional exploratory analyses.
The analysis is heavily based on the phyloseq package [@mcmurdie_phyloseq_2013], but also on many other R packages.

```{r general parameters}
set.seed(123456789)
bootstraps <- 1000
min_lib_size <- 1000
```

**Load data**
```{r load data, cache=T}
read.csv("Data/Rock_weathering_new2_otuTab.txt", header = TRUE, row.names = 1, sep = "\t") %>%
  t() %>% 
  as.data.frame() ->
  Rock_weathering_OTUmat

sort_order <- as.numeric(gsub("OTU([0-9]+)", "\\1", colnames(Rock_weathering_OTUmat)))
Rock_weathering_OTUmat <- Rock_weathering_OTUmat[, order(sort_order)]
row.names(Rock_weathering_OTUmat) <- gsub("(.*)Nimrod[0-9]+|Osnat[0-9]+", "\\1", row.names(Rock_weathering_OTUmat))

Metadata <- read.csv("Data/Rock_weathering_metadata_RA.csv", row.names = 1, header = TRUE)
# Order abundance_mat samples according to the metadata
sample_order <- match(row.names(Rock_weathering_OTUmat), row.names(Metadata))
Rock_weathering_OTUmat %<>% arrange(., sample_order)
Metadata$sample_names <- row.names(Metadata)
Metadata$Uni.Source <- fct_collapse(Metadata$Source, Rock = c("Dolomite", "Limestone"))
Metadata$Climate.Source <-
  factor(
    paste(
      Metadata$Climate,
      Metadata$Source
    ),
    levels = c(
      "Arid Limestone",
      "Arid Dust",
      "Arid Loess soil",
      "Hyperarid Dolomite",
      "Hyperarid Dust",
      "Hyperarid Loess soil"
    ),
    labels = c(
      "Arid limestone",
      "Arid dust",
      "Arid loess soil",
      "Hyperarid dolomite",
      "Hyperarid dust",
      "Hyperarid loess soil"
    )
  )

Metadata$Climate.UniSource <-
  factor(
    paste(
      Metadata$Climate,
      Metadata$Uni.Source
    ),
    levels = c(
      "Arid Rock",
      "Arid Dust",
      "Arid Loess soil",
      "Hyperarid Rock",
      "Hyperarid Dust",
      "Hyperarid Loess soil"
    ),
    labels = c(
      "Arid rock",
      "Arid dust",
      "Arid loess soil",
      "Hyperarid rock",
      "Hyperarid dust",
      "Hyperarid loess soil"
    )
  )
# calculate sample size
Metadata$Lib.size = rowSums(Rock_weathering_OTUmat)
row.names(Rock_weathering_OTUmat) <- row.names(Metadata)

# Load taxonomy data
tax.file <- "Data/Rock_weathering_new2_silva.nrv119.taxonomy"
Taxonomy <- read.table(tax.file,  stringsAsFactors = FALSE) # read taxonomy file

# count how many ';' in each cell and add up to 6
for (i in 1:nrow(Taxonomy)) {
  semicolons <- length(gregexpr(";", Taxonomy$V2[i])[[1]])
  if (semicolons < 6) {
    x <- paste0(rep("Unclassified;", 6 - semicolons), collapse = "")
    Taxonomy$V2[i] <- paste0(Taxonomy$V2[i], x, sep = "")
  }
}

do.call( "rbind", strsplit( Taxonomy$V1, ";", fixed = TRUE)) %>% 
  gsub( "size=([0-9]+)", "\\1", .) %>%
  data.frame( ., do.call( "rbind", strsplit( Taxonomy$V2, ";", fixed = TRUE)), stringsAsFactors = F) %>% 
  apply(., 2, function(x) gsub( "\\(.*\\)", "", x)) %>% 
  replace(., . == "unclassified", "Unclassified") -> 
  Taxonomy

colnames( Taxonomy ) <- c( "OTU", "Frequency", "Domain", "Phylum", "Class", "Order", "Family", "Genus" )
# rownames(Taxonomy) <- colnames(Rock_weathering_OTUmat)
rownames(Taxonomy) <- Taxonomy[, 1]

# generate phyloseq object
Rock_dust <- phyloseq(otu_table(Rock_weathering_OTUmat, taxa_are_rows = FALSE),
                        tax_table(Taxonomy[, -c(1, 2)]),
                        sample_data(Metadata)
                        )

# Reorder factors for plotting
sample_data(Rock_dust)$Source %<>% fct_relevel("Limestone", "Dolomite", "Dust", "Loess soil")
```

Remove samples not for analysis
```{r remove samples, cache=T}
samples2remove <- c(2, 3, 4, 5, 6, 7, 8, 10, 12) 
Rock_dust <- subset_samples(Rock_dust, !grepl(paste(c(sample_names(Rock_dust)[samples2remove]), collapse = "|"), sample_names(Rock_dust)))
Rock_dust <- filter_taxa(Rock_dust, function(x) sum(x) > 0, TRUE)

domains2remove <- c("", "Eukaryota", "Unclassified")
classes2remove <- c("Chloroplast")
families2remove <- c("Mitochondria")

Rock_weathering_filt <- subset_taxa(Rock_dust, !is.na(Phylum) &
                        !Domain %in% domains2remove &
                      !Class %in% classes2remove &
                      !Family %in% families2remove)
```

First let's explore the prevalence of different taxa in the database.
```{r explore prevalence, cache=T}
prevdf <- apply(X = otu_table(Rock_weathering_filt),
                 MARGIN = ifelse(taxa_are_rows(Rock_weathering_filt), yes = 1, no = 2),
                 FUN = function(x){sum(x > 0)})
# Add taxonomy and total read counts to this data.frame
prevdf <- data.frame(Prevalence = prevdf,
                      TotalAbundance = taxa_sums(Rock_weathering_filt),
                      tax_table(Rock_weathering_filt))

prevdf %>%
  group_by(Phylum) %>%
  summarise(`Mean prevalence` = mean(Prevalence),
            `Sum prevalence` = sum(Prevalence)) ->
  Prevalence_phylum_summary

Prevalence_phylum_summary %>% 
  kable(., digits = c(0, 1, 0)) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F)

prevdf %>%
  group_by(Order) %>%
  summarise(`Mean prevalence` = mean(Prevalence),
            `Sum prevalence` = sum(Prevalence)) ->
  Prevalence_Order_summary

Prevalence_Order_summary %>% 
  kable(., digits = c(0, 1, 0)) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F)
```

Based on that we'll remove all phyla with a prevalence of under 7
```{r remove rare taxa, cache=T}
Prevalence_phylum_summary %>% 
  filter(`Sum prevalence` < 7) %>% 
  select(Phylum) %>% 
  map(as.character) %>% 
  unlist() ->
  filterPhyla

Rock_weathering_filt2 <- subset_taxa(Rock_weathering_filt, !Phylum %in% filterPhyla)
sample_data(Rock_weathering_filt2)$Lib.size <- rowSums(otu_table(Rock_weathering_filt2))
print(Rock_weathering_filt)
print(Rock_weathering_filt2)
```

Plot general prevalence features of the phyla
```{r prevalence phylum, cahce=T, fig.height=12, fig.width=10}
# Subset to the remaining phyla
prevdf_phylum_filt <- subset(prevdf, Phylum %in% get_taxa_unique(Rock_weathering_filt2, "Phylum"))
ggplot(prevdf_phylum_filt,
       aes(TotalAbundance, Prevalence / nsamples(Rock_weathering_filt2), color = Phylum)) +
  # Include a guess for parameter
  geom_hline(yintercept = 0.05,
             alpha = 0.5,
             linetype = 2) + geom_point(size = 2, alpha = 0.7) +
  scale_x_log10() +  xlab("Total Abundance") + ylab("Prevalence [Frac. Samples]") +
  facet_wrap( ~ Phylum) + theme(legend.position = "none")
```

Plot general prevalence features of the top 20 orders
```{r prevalence order, cache=T, fig.height=12, fig.width=10}
# Subset to the remaining phyla
prevdf_order_filt <- subset(prevdf, Order %in% get_taxa_unique(Rock_weathering_filt2, "Order"))

# grab the top 30 most abundant orders
prevdf_order_filt %>% 
  group_by(Order) %>%
  summarise(Combined.abundance = sum(TotalAbundance)) %>% 
  arrange(desc(Combined.abundance)) %>% 
  .[1:30, "Order"]  ->
  Orders2plot

prevdf_order_filt2 <- subset(prevdf, Order %in% Orders2plot$Order)

ggplot(prevdf_order_filt2,
       aes(TotalAbundance, Prevalence / nsamples(Rock_weathering_filt2), color = Order)) +
  # Include a guess for parameter
  geom_hline(yintercept = 0.05,
             alpha = 0.5,
             linetype = 2) + geom_point(size = 2, alpha = 0.7) +
  scale_x_log10() +  xlab("Total Abundance") + ylab("Prevalence [Frac. Samples]") +
  facet_wrap( ~ Order) + theme(legend.position = "none")
```

#### Unsupervised filtering by prevalence
We'll remove all sequences which appear in less than 10% of the samples
```{r}
# Define prevalence threshold as 10% of total samples
prevalenceThreshold <- 0.1 * nsamples(Rock_weathering_filt)
prevalenceThreshold

# Execute prevalence filter, using `prune_taxa()` function
keepTaxa <-
  row.names(prevdf_phylum_filt)[(prevdf_phylum_filt$Prevalence >= prevalenceThreshold)]
Rock_weathering_filt3 <- prune_taxa(keepTaxa, Rock_weathering_filt2)
sample_data(Rock_weathering_filt3)$Lib.size <- rowSums(otu_table(Rock_weathering_filt3))
print(Rock_weathering_filt2)
print(Rock_weathering_filt3)
```
This removed `r ntaxa(Rock_weathering_filt2) - ntaxa(Rock_weathering_filt3)` or `r percent(1 - (ntaxa(Rock_weathering_filt3) /  ntaxa(Rock_weathering_filt2)))` of the sequences.

### Exploring Rock_dust dataset features
First let's look at the count data distribution
```{r plot abundance, cache=T}
PlotLibDist(Rock_weathering_filt3)
sample_data(Rock_weathering_filt3) %>% 
  remove_rownames %>% 
  select(sample_title, Lib.size) %>% 
  as(., "data.frame") %>% 
  kable(.) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F)
```
The figure and table indicate only a small deviation in the number of reads per samples.

```{r mod abundance, cache=T, fig.asp=.8}
(mod1 <- adonis(
  otu_table(Rock_weathering_filt3) ~ Lib.size,
  data = as(sample_data(Rock_weathering_filt3), "data.frame"), 
  method = "bray",
  permutations = 9999
))

PlotReadHist(as(otu_table(Rock_weathering_filt3), "matrix"))
notAllZero <- (rowSums(t(otu_table(Rock_weathering_filt3))) > 0)
meanSdPlot(as.matrix(log2(t(otu_table(Rock_weathering_filt3))[notAllZero, ] + 1)))
```

### Account for variation in library read-depth
We'll use the GMPR method [@chen_gmpr:_2017]
```{r GMPR, cache=T}
Rock_weathering_filt3_GMPR <- Rock_weathering_filt3
Rock_weathering_filt3 %>%
  otu_table(.) %>%
  t() %>%
  as(., "matrix") %>%
  GMPR() ->
  GMPR_factors

Rock_weathering_filt3 %>%
  otu_table(.) %>%
  t() %*% diag(1 / GMPR_factors$gmpr) %>%
  t() %>%
  as.data.frame(., row.names = sample_names(Rock_weathering_filt3)) %>%
  otu_table(., taxa_are_rows = FALSE) ->
  otu_table(Rock_weathering_filt3_GMPR)
sample_data(Rock_weathering_filt3_GMPR)$Lib.size <- sample_sums(Rock_weathering_filt3_GMPR)

adonis(
  otu_table(Rock_weathering_filt3_GMPR) ~ Lib.size,
  data = as(sample_data(Rock_weathering_filt3_GMPR), "data.frame"),
  method = "bray",
  permutations = 9999
)
PlotLibDist(Rock_weathering_filt3_GMPR)
```
```{r GMPR diag plots, cache=T, fig.asp=.8}
PlotReadHist(as(otu_table(Rock_weathering_filt3_GMPR), "matrix"))
notAllZero <- (rowSums(t(otu_table(Rock_weathering_filt3_GMPR))) > 0)
meanSdPlot(as.matrix(log2(t(otu_table(Rock_weathering_filt3_GMPR))[notAllZero, ] + 1)))
```

### Alpha diversity 
Calculate and plot alpha diversity mertrics.
```{r alpha-div, cache=T, fig.width=20, fig.height=10}
# non-parametric richness estimates
rarefaction.mat <- matrix(0, nrow = nsamples(Rock_weathering_filt3), ncol = bootstraps)
rownames(rarefaction.mat) <- sample_names(Rock_weathering_filt3)
rich.ests <- list(S.obs = rarefaction.mat, S.chao1 = rarefaction.mat, se.chao1 = rarefaction.mat,
                   S.ACE = rarefaction.mat, se.ACE = rarefaction.mat)

for (i in seq(bootstraps)) {
  sub.OTUmat <- rrarefy(otu_table(Rock_weathering_filt3), min(rowSums(otu_table(Rock_weathering_filt3))))
  for (j in seq(length(rich.ests))) {
    rich.ests[[j]][, i] <- t(estimateR(sub.OTUmat))[, j]
  }
}

Richness <- data.frame(row.names = row.names(rich.ests[[1]]))
for (i in c(1, seq(2, length(rich.ests), 2))) {
  S <- apply(rich.ests[[i]], 1, mean)
  if (i == 1) { 
    se <- apply(rich.ests[[i]], 1, function(x) (mean(x)/sqrt(length(x))))
    } else se <- apply(rich.ests[[i + 1]], 1, mean)
  Richness <- cbind(Richness, S, se)
}
colnames(Richness) <- c("S.obs", "S.obs.se", "S.chao1", "S.chao1.se", "S.ACE", "S.ACE.se")


saveRDS(Richness, file = "Results/Rock_weathering_Richness.Rds")
write.csv(Richness, file = "Results/Rock_weathering_Richness.csv")

ses <- grep("\\.se", colnames(Richness))
Richness[, ses] %>% 
  gather(key = "est.se") -> se.dat
Richness[, -unique(ses)] %>% 
  gather(key = "est") -> mean.dat

n <- length(unique(mean.dat$est))

# diversity indices
diversity.inds <- list(Shannon = rarefaction.mat, inv.simpson = rarefaction.mat, BP = rarefaction.mat)
for (i in seq(bootstraps)) {
  sub.OTUmat <- rrarefy(otu_table(Rock_weathering_filt3), min(rowSums(otu_table(Rock_weathering_filt3))))
  diversity.inds$Shannon[, i] <- diversityresult(sub.OTUmat, index = 'Shannon', method = 'each site', digits = 3)[, 1]
  diversity.inds$inv.simpson[, i] <- diversityresult(sub.OTUmat, index = 'inverseSimpson', method = 'each site', digits = 3)[, 1]
  diversity.inds$BP[, i] <- diversityresult(sub.OTUmat, index = 'Berger', method = 'each site', digits = 3)[, 1]
}

Diversity <- data.frame(row.names = row.names(diversity.inds[[1]]))
for (i in seq(length(diversity.inds))) {
  S <- apply(diversity.inds[[i]], 1, mean)
  se <- apply(diversity.inds[[i]], 1, function(x) (mean(x)/sqrt(length(x))))
  Diversity <- cbind(Diversity, S, se)
}
colnames(Diversity) <- c("Shannon", "Shannon.se", "Inv.simpson", "Inv.simpson.se", "BP", "BP.se")

ses <- grep("\\.se", colnames(Diversity))
Diversity[, ses] %>% gather(key = "est.se") -> se.dat
Diversity[, -unique(ses)] %>% gather(key = "est") -> mean.dat

saveRDS(Diversity, file = "Results/Rock_weathering_Diversity.Rds")
write.csv(Diversity, file = "Results/Rock_weathering_Diversity.csv")
```

Test the differences in alpha diversity.
```{r test alpha, cache=T, include=F}
# make combined richness diversity
Richness_Diversity <- cbind(Richness, Diversity)
ses <- grep("\\.se", colnames(Richness_Diversity))
Richness_Diversity[, ses] %>% 
  gather(key = "est.se") -> 
  se.dat
Richness_Diversity[, -unique(ses)] %>% 
  gather(key = "Metric", 
         value = "Estimate") -> 
  mean.dat

Richness_Diversity_long <-
  cbind(
    Sample = rep(rownames(Richness_Diversity), times = length(unique(mean.dat$Metric))),
    mean.dat,
    lerr = mean.dat$Estimate - se.dat$value,
    herr = mean.dat$Estimate + se.dat$value
  )

Richness_Diversity_long$Metric <-
  factor(
    Richness_Diversity_long$Metric,
    levels = c("S.obs", "S.chao1", "S.ACE", "Shannon", "Inv.simpson", "BP"),
    labels = c("S obs.", "Chao1", "ACE", "Shannon", "Inv. Simpson" , "Berger Parker")
  )

Richness_Diversity_long %<>%
  cbind(., 
        sample_data(Rock_weathering_filt3))

data2test <- Richness_Diversity_long[Richness_Diversity_long$Metric == "S obs.", ] 
mod_data <- TestAlpha(data = data2test, boxcox.trans = TRUE)
TukeyHSD(mod_data)
factors2test <- c("Climate", "Uni.Source")
(ph_Sobs <- HSD.test(mod_data, factors2test, group = TRUE, console = TRUE))
# Richness_Diversity_long$groups[Richness_Diversity_long$Metric == "S obs."] <- ph_Sobs$groups$groups

#But the problem is that Tukey assumes varience homogeneity. If we doubt that we should use something like the Games-Howell method. 
posthocTGH(y = data2test$Estimate, x = data2test$Climate.Source)

data2test <- Richness_Diversity_long[Richness_Diversity_long$Metric == "Shannon", ] 
mod_data <- TestAlpha(data = data2test, boxcox.trans = TRUE)
TukeyHSD(mod_data)
(ph_Shannon <- HSD.test(mod_data, factors2test, group = TRUE, console = TRUE))
posthocTGH(y = data2test$Estimate, x = data2test$Climate.Source)

data2test <- Richness_Diversity_long[Richness_Diversity_long$Metric == "ACE", ] 
mod_data <- TestAlpha(data = data2test, boxcox.trans = FALSE)
TukeyHSD(mod_data)
(ph_ACE <- HSD.test(mod_data, factors2test, group = TRUE, console = TRUE))
posthocTGH(y = data2test$Estimate, x = data2test$Climate.Source)

data2test <- Richness_Diversity_long[Richness_Diversity_long$Metric == "Berger Parker", ] 
mod_data <- TestAlpha(data = data2test, boxcox.trans = TRUE)
TukeyHSD(mod_data)
(ph_BP <- HSD.test(mod_data, factors2test, group = TRUE, console = TRUE))
posthocTGH(y = data2test$Estimate, x = data2test$Climate.Source)
```

#### Plot all alpha diversity metrics together
```{r plot alpha, cache=T, fig.width=10, fig.height=6, fig.cap=""}
Richness_Diversity_long[Richness_Diversity_long$Metric != "Chao1" &
                          Richness_Diversity_long$Metric != "Inv. Simpson" &
                          Richness_Diversity_long$Metric != "Berger Parker", ] %>% 
  droplevels() ->
  Richness_Diversity_long2plot

p_alpha <- ggplot(Richness_Diversity_long2plot, aes(
  x = Source,
  y = Estimate
)) +
  geom_violin(aes(colour = Climate, fill = Climate), alpha = 1/3) +
  geom_jitter(aes(colour = Climate, fill = Climate), shape = 16, size = 2, width = 0.2, alpha = 2/3) +
  scale_colour_manual(values = pom4, name = "") +
  scale_fill_manual(values = pom4, name = "") +
  theme_cowplot(font_size = 11, font_family = f_name) +
  # geom_errorbar(alpha = 1 / 2, width = 0.3) +
  xlab("") +
  ylab("") +
  theme(axis.text.x = element_text(
    angle = 45,
    vjust = 0.9,
    hjust = 0.9
  )) +
  facet_grid(Metric ~ Climate, shrink = FALSE, scale = "free") +
  background_grid(major = "y",
                  minor = "none") +
  theme(panel.spacing = unit(2, "lines"))

dat_text <- data.frame(
  label = as.character(fct_c(ph_Sobs$groups$groups, ph_ACE$groups$groups, ph_Shannon$groups$groups)),
  Metric   = rep(levels(Richness_Diversity_long2plot$Metric), each = 6),
  Climate = str_split(rownames(ph_Sobs$groups), ":", simplify = TRUE)[, 1], 
  x = c("Loess soil", "Loess soil", "Limestone", "Dust", "Dolomite", "Dust"),
  # x     = as.factor(levels(Richness_Diversity_long2plot$Climate.Source)),
  y = rep(c(460, 850, 6.5), each = 6)
  # y = rep(c(40, 140, 0.5), each = 6)
)


p_alpha <- p_alpha + geom_text(
  data    = dat_text,
  mapping = aes(x = x, y = y, label = label),
  nudge_x = -0.2,
  nudge_y = -0.1
)

print(p_alpha)

Richness_Diversity_long2plot %>%
  group_by(Metric, Climate.Source) %>%   # the grouping variable
  summarise(mean_PL = mean(Estimate),  # calculates the mean of each group
            sd_PL = sd(Estimate), # calculates the standard deviation of each group
            n_PL = n(),  # calculates the sample size per group
            SE_PL = sd(Estimate)/sqrt(n())) %>% # calculates the standard error of each group
  kable(.) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F)
```

### Beta diversity
Calculate and plot beta diversity mertrics. 

#### 1. Possible changes in biofilm arid vs hyper arid
Is there a difference between the two sites. However, since we know that that samples are of different nature we'll have to control for rock type, source and location:

```{r ADONIS full, cache=T}
(mod1 <-  adonis(
  otu_table(Rock_weathering_filt3_GMPR) ~ Climate * Source * Location,
  data = as(sample_data(Rock_weathering_filt3_GMPR), "data.frame"),
  method = "horn",
  permutations = 9999
))

Rock_weathering_filt3_GMPR_Arid <- subset_samples(Rock_weathering_filt3_GMPR, Climate == "Arid")
Rock_weathering_filt3_GMPR_Arid <- filter_taxa(Rock_weathering_filt3_GMPR_Arid, function(x) sum(x) > 0, TRUE)
(mod2 <-  adonis(
  otu_table(Rock_weathering_filt3_GMPR_Arid) ~ Source * Location,
  data = as(sample_data(Rock_weathering_filt3_GMPR_Arid), "data.frame"),
  method = "horn",
  permutations = 9999
))

Rock_weathering_filt3_GMPR_Hyperarid <- subset_samples(Rock_weathering_filt3_GMPR, Climate == "Hyperarid")
Rock_weathering_filt3_GMPR_Hyperarid <- filter_taxa(Rock_weathering_filt3_GMPR_Hyperarid, function(x) sum(x) > 0, TRUE)
(mod3 <-  adonis(
  otu_table(Rock_weathering_filt3_GMPR_Hyperarid) ~ Source * Location,
  data = as(sample_data(Rock_weathering_filt3_GMPR_Hyperarid), "data.frame"),
  method = "horn",
  permutations = 9999
))
```
According to this model we see that indeed there's an effect of site on the community (p = 0.001), and that effect accounts for about 17% of the variance. Also, considering that Location is only borderline significant and explains very little of the data, we could probably take it out of the model to make a minimal adequate model.

```{r ADONIS reduced, cache=T}
(mod4 <-  adonis(
  otu_table(Rock_weathering_filt3_GMPR) ~ Climate * Source,
  data = as(sample_data(Rock_weathering_filt3_GMPR), "data.frame"),
  method = "horn",
  permutations = 9999
))

(mod5 <-  adonis(
  otu_table(Rock_weathering_filt3_GMPR_Arid) ~ Source,
  data = as(sample_data(Rock_weathering_filt3_GMPR_Arid), "data.frame"),
  method = "horn",
  permutations = 9999
))

(mod6 <-  adonis(
  otu_table(Rock_weathering_filt3_GMPR_Hyperarid) ~ Source,
  data = as(sample_data(Rock_weathering_filt3_GMPR_Hyperarid), "data.frame"),
  method = "horn",
  permutations = 9999
))
```

Final model
```{r ADONIS final, cache=T, fig.width=8, fig.aspect=.5}
print(mod4)

mod4_pairwise <- PairwiseAdonis(
  otu_table(Rock_weathering_filt3_GMPR),
  sample_data(Rock_weathering_filt3_GMPR)$Climate.Source,
  sim.function = "vegdist",
  sim.method = "horn",
  p.adjust.m = "BH"
)
print(mod4_pairwise)
sig_pairs <- as.character(mod4_pairwise$pairs[mod4_pairwise$p.adjusted < 0.05])
  
simper(otu_table(Rock_weathering_filt3_GMPR), sample_data(Rock_weathering_filt3_GMPR)$Climate.Source, parallel = 4)
```
```{r ordination final, cache=T, fig.width=8, fig.aspect=.5}
GMPR_ord <- ordinate(Rock_weathering_filt3_GMPR, "CAP", "horn", formula = Rock_weathering_filt3_GMPR ~ Climate * Source)
explained <- eigenvals(GMPR_ord)/sum( eigenvals(GMPR_ord)) * 100
explained <- as.numeric(format(round(explained, 1), nsmall = 1))
data2plot <- cbind(scores(GMPR_ord, display = "sites"), sample_data(Rock_weathering_filt3_GMPR))
p_ord <- ggplot(data2plot) +
  geom_point(aes(x = CAP1, y = CAP2, colour = Source, shape = Climate), size = 3, alpha = 2/3 ) +
  scale_colour_manual(values = pom4) +
  stat_ellipse(aes(x = CAP1, y = CAP2, group = Climate), color = "black", alpha = 0.5, type = "norm", level = 0.95, linetype = 2) +
  xlab(label = paste0("CAP1", " (", explained[1],"%)")) + 
  ylab(label = paste0("CAP2", " (", explained[2],"%)")) +
  coord_fixed(sqrt(explained[2] / explained[1])) 
  
print(p_ord)
```

### Taxonomic features
Explore and plot the taxonomic distribution of the sequences

```{r seqs heatmaps, cache=T}
Rock_weathering_filt3_GMPR_rel <- transform_sample_counts(Rock_weathering_filt3_GMPR, function(x) x / sum(x) )

Rock_weathering_filt3_GMPR_rel %>% 
  sample_data() %>% 
  arrange(Climate, Source) %>% 
  .$sample_names ->
  Sample_order

Rock_weathering_filt3_100 <-
  prune_taxa(names(sort(taxa_sums(Rock_weathering_filt3_GMPR_rel), TRUE)[1:100]), Rock_weathering_filt3_GMPR_rel)
plot_heatmap(
  Rock_weathering_filt3_100,
  method = NULL,
  distance = NULL,
  sample.label = "sample_title",
  taxa.label = "Order",
  sample_order = Sample_order,
  low = "#000033",
  high = "#FF3300"
) #+ theme_bw(base_size = 20) + theme(axis.text.x = element_text(hjust = 0, angle = -90.0))
```

Let's look at the agglomerated taxa

```{r agglomerated taxa box plot, fig.width=15, fig.height=10, cache=T}
Rock_weathering_filt3_glom <- tax_glom(Rock_weathering_filt3_GMPR, 
                             "Phylum", 
                             NArm = TRUE)
Rock_weathering_filt3_glom_rel <- transform_sample_counts(Rock_weathering_filt3_glom, function(x) x / sum(x)) 
Rock_weathering_filt3_glom_rel_DF <- psmelt(Rock_weathering_filt3_glom_rel)
Rock_weathering_filt3_glom_rel_DF$Phylum %<>% as.character()

# group dataframe by Phylum, calculate median rel. abundance
Rock_weathering_filt3_glom_rel_DF %>%
  group_by(Phylum) %>%
  summarise(median = median(Abundance)) ->
  medians

# find Phyla whose rel. abund. is less than 0.5%
Rare_phyla <- medians[medians$median <= 0.005, ]$Phylum

# change their name to "Rare"
Rock_weathering_filt3_glom_rel_DF[Rock_weathering_filt3_glom_rel_DF$Phylum %in% Rare_phyla, ]$Phylum <- 'Rare'
# re-group
Rock_weathering_filt3_glom_rel_DF %>%
  group_by(Sample, Climate, Phylum, Rock.type, Source) %>%
  summarise(Abundance = sum(Abundance)) ->
  Rock_weathering_filt3_glom_rel_DF_2plot

# ab.taxonomy$Freq <- sqrt(ab.taxonomy$Freq)
Rock_weathering_filt3_glom_rel_DF_2plot$Phylum %<>% sub("unclassified", "Unclassified", .)
Rock_weathering_filt3_glom_rel_DF_2plot$Phylum %<>% sub("uncultured", "Unclassified", .)

Rock_weathering_filt3_glom_rel_DF_2plot %>% 
  group_by(Sample) %>% 
  filter(Phylum == "Rare") %>% 
  summarise(`Rares (%)` = sum(Abundance * 100)) -> 
  Rares
# Percentage of reads classified as rare 
Rares %>% 
  kable(., digits = 2, caption = "Percentage of reads per sample type classified as rare:") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F)

sample_order <- match(Rares$Sample, row.names(sample_data(Rock_weathering_filt3_glom)))
Rares %<>% arrange(., sample_order)

Rares %>% 
  cbind(., sample_data(Rock_weathering_filt3_glom)) %>% 
  group_by(Climate.Source) %>% 
  summarise(`Rares (%)` = mean(`Rares (%)`)) -> 
  Rares_merged

# Percentage of reads classified as rare 
Rares %>% 
  kable(., digits = 2, caption = "Percentage of reads per sample type classified as rare:") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = F)

Rock_weathering_filt3_glom_rel_DF_2plot %>% 
  group_by(Phylum) %>% 
  summarise(sum.Taxa = sum(Abundance)) %>% 
  arrange(desc(sum.Taxa)) -> Taxa_rank
Rock_weathering_filt3_glom_rel_DF_2plot$Phylum %<>% 
  factor(., levels = Taxa_rank$Phylum) %>% 
  fct_relevel(., "Rare", after = Inf)
  
p_taxa_box <-
  ggplot(Rock_weathering_filt3_glom_rel_DF_2plot, aes(x = Phylum, y = (Abundance * 100))) +
  geom_boxplot(aes(group = interaction(Phylum, Source)), position = position_dodge(width = 0.9), fatten = 1) +
  geom_point(
    aes(colour = Source),
    position = position_jitterdodge(dodge.width = 1),
    alpha = 1 / 2,
    stroke = 0,
    size = 2
  ) +
  scale_colour_manual(values = pom4, name = "") +
  theme_cowplot(font_size = 11, font_family = f_name) +
  labs(x = NULL, y = "Relative abundance (%)") +
  guides(colour = guide_legend(override.aes = list(size = 5))) +
  facet_grid(Climate ~ .) +
  background_grid(major = "xy",
                  minor = "none") +
  theme(axis.text.x = element_text(
    angle = 45,
    vjust = 0.9,
    hjust = 0.9
  ))
print(p_taxa_box)
```

#### Test differences between samples on the phylum level
```{r test phylum, fig.width=15, fig.height=10, cache=T}
Taxa_tests_phylum <- STAMPR(Rock_weathering_filt3_GMPR, "Phylum", sig_pairs)
Taxa_tests_order <- STAMPR(Rock_weathering_filt3_GMPR, "Order", sig_pairs)
```

#### Ternary plots
For the arid samples
```{r arid ternary, fig.width=11.69, fig.height=8.27}
Rock_weathering_filt3_GMPR_Arid_rel <- transform_sample_counts(Rock_weathering_filt3_GMPR_Arid, function(x) x / sum(x) ) # rel abundance
Rock_weathering_filt3_GMPR_Arid_merged <- merge_samples(Rock_weathering_filt3_GMPR_Arid_rel, "Source", fun = mean) # merge by source
Rock_weathering_filt3_GMPR_Arid_merged_rel <- transform_sample_counts(Rock_weathering_filt3_GMPR_Arid_merged, function(x) x / sum(x) ) # rel abundance per source
meandf <- as(otu_table(Rock_weathering_filt3_GMPR_Arid_merged_rel), "matrix")
if (!taxa_are_rows(Rock_weathering_filt3_GMPR_Arid_merged_rel)) { meandf <- t(meandf) }
abundance <- rowSums(meandf) / sum(meandf) * 100

Arid4Ternary <- data.frame(
  meandf,
  Abundance = abundance,
  Phylum = tax_table(Rock_weathering_filt3_GMPR_Arid_merged_rel)[, "Phylum"]
)
# Arid4Ternary <- dplyr::rename(Arid4Ternary, Loess_soil = Loess.soil)

Arid4Ternary$Phylum <-
  factor(Arid4Ternary$Phylum, levels = c(levels(Arid4Ternary$Phylum), 'Rare'))
Arid4Ternary$Phylum[Arid4Ternary$Phylum %in% Rare_phyla]  <- "Rare"
Arid4Ternary$Phylum %<>% 
  factor(., levels = Taxa_rank$Phylum) %>% 
  fct_relevel(., "Rare", after = Inf)

p_ternary_arid <-
  ggtern(data = Arid4Ternary,
         aes(
           x = Loess.soil,
           y = Dust,
           z = Limestone,
           size = Abundance,
           colour = Phylum
         )) +
  geom_point(alpha = 1 / 2) +
  scale_size(
    range = c(1, 5),
    name = "Abundance (%)"
  ) +
  theme_arrownormal() +
    scale_color_manual(values = pal("d3js")) +
  guides(colour = guide_legend(override.aes = list(size = 3))) +
  labs(x = "Loess soil") + 
  theme(axis.title = element_blank())
print(p_ternary_arid)
```

For the hyperarid samples
```{r hyperarid ternary, fig.width=11.69, fig.height=8.27}
Rock_weathering_filt3_GMPR_Hyperarid_rel <- transform_sample_counts(Rock_weathering_filt3_GMPR_Hyperarid, function(x) x / sum(x) ) # rel abundance
Rock_weathering_filt3_GMPR_Hyperarid_merged <- merge_samples(Rock_weathering_filt3_GMPR_Hyperarid_rel, "Source", fun = mean) # merge by source
Rock_weathering_filt3_GMPR_Hyperarid_merged_rel <- transform_sample_counts(Rock_weathering_filt3_GMPR_Hyperarid_merged, function(x) x / sum(x) ) # rel abundance per source
meandf <- as(otu_table(Rock_weathering_filt3_GMPR_Hyperarid_merged_rel), "matrix")
if (!taxa_are_rows(Rock_weathering_filt3_GMPR_Hyperarid_merged_rel)) { meandf <- t(meandf) }
abundance <- rowSums(meandf) / sum(meandf) * 100

Hyperarid4Ternary <- data.frame(
  meandf,
  Abundance = abundance,
  Phylum = tax_table(Rock_weathering_filt3_GMPR_Hyperarid_merged_rel)[, "Phylum"]
)

Hyperarid4Ternary$Phylum <-
  factor(Hyperarid4Ternary$Phylum, levels = c(levels(Hyperarid4Ternary$Phylum), 'Rare'))
Hyperarid4Ternary$Phylum[Hyperarid4Ternary$Phylum %in% Rare_phyla]  <- "Rare"
Hyperarid4Ternary$Phylum %<>% 
  factor(., levels = Taxa_rank$Phylum) %>% 
  fct_relevel(., "Rare", after = Inf)

p_ternary_hyperarid <-
  ggtern(data = Hyperarid4Ternary,
         aes(
           x = Loess.soil,
           y = Dust,
           z = Dolomite,
           size = Abundance,
           colour = Phylum
         )) +
  geom_point(alpha = 1 / 2) +
  scale_size(
    range = c(1, 5),
    name = "Abundance (%)"
  ) +
  theme_arrownormal() +
  scale_color_manual(values = pal("d3js")) +
  guides(colour = guide_legend(override.aes = list(size = 3))) +
  labs(x = "Loess soil") + 
  theme(axis.title = element_blank())
print(p_ternary_hyperarid)
```

#### Combine plots
Combine all sequence analysis plots to make Fig. 3
```{r combined plots, fig.width=11.69, fig.height=8.27}
ternary_legend <-
  get_legend(p_ternary_arid)# + theme(legend.direction = "horizontal"))
ord_legend <- get_legend(p_ord)
top_row <-
  plot_grid(
    p_alpha + theme(
      legend.position = "none",
      panel.spacing = unit(0.5, "lines")
    ),
    p_ord + theme(axis.title.y = element_text(vjust = -3)) ,
    labels = c('A', 'B'),
    label_size = 12,
    align = 'v',
    axis = "tl",
    nrow = 1,
    ncol = 2
  )
bottom_l <-
  plot_grid(
    p_taxa_box + theme(legend.position = "none"),
    labels = c('C'),
    label_size = 12,
    ncol = 1
  )
bottom_r <-
  plot_grid(
    p_ternary_arid + 
      theme(legend.position = "none", 
            plot.margin = unit(c(-0.1, -0.1, -0.1, -0.1), "cm"),
            axis.title = element_blank()),
    p_ternary_hyperarid + 
      theme(legend.position = "none", 
            plot.margin = unit(c(-0.1, -0.1, -0.1, -0.1), "cm"),
            axis.title = element_blank()),
    labels = c('D'),
    label_size = 12,
    align = 'hv',
    axis = "t",
    # rel_widths = c(1, 1, 0.1),
    scale = c(1.2, 1.2),
    nrow = 2,
    ncol = 1
  )

bottom_rows <- plot_grid(bottom_l, 
                         bottom_r,
                         ternary_legend,
                         align = 'h',
                         axis = "l",
                         scale = c(1, 1, 0.08),
                         rel_widths = c(0.5, 0.35, 0.15),
                         nrow = 1, 
                         ncol = 3)
p_all <- plot_grid(top_row, bottom_rows, align = 'v', axis = 'l', nrow = 2, rel_heights = c(0.43, 0.6)) # aligning vertically along the left axis
print(p_all)
```

### Differential abundance models
Detect differentially abundant OTUs using ALDEx2 [@fernandes_anova-like_2013]
```{r ALDEx2, cache=T}
# Rock_weathering_filt3_s <- prune_taxa(names(sort(taxa_sums(Rock_weathering_filt3), TRUE))[1:100], Rock_weathering_filt3)

# run full model 
data2test <- t(otu_table(Rock_weathering_filt3))
comparison <- as.character(unlist(sample_data(Rock_weathering_filt3)[, "Climate.Source"]))
ALDEx_full <- aldex.clr(data2test, comparison, mc.samples = 128, denom = "iqlr", verbose = TRUE, useMC = TRUE) # iqlr for slight assymetry in composition
ALDEx_full_glm <- aldex.glm(ALDEx_full, comparison, useMC = TRUE) # for more than two conditions
sig_taxa <- rownames(ALDEx_full_glm)[ALDEx_full_glm$glm.eBH < 0.05] # save names of taxa that are significant under the full model

# Pairwise comparisons
# 
# dolomite - limestone
Rock_weathering_filt3_Rocks <- subset_samples(Rock_weathering_filt3, Uni.Source == "Rock")
ALDEx2plot_Rocks <- CalcALDEx(Rock_weathering_filt3_Rocks, sig_level = 0.1, LFC = 0)
GGPlotALDExTax(ALDEx2plot_Rocks) + 
  ggtitle("Hyperarid dolomite vs. Arid limestone")
# dolomite - soil
Rock_weathering_filt3_DolSoil <- subset_samples(Rock_weathering_filt3, Climate == "Hyperarid" & Source != "Dust")
ALDEx2plot_DolSoil <- CalcALDEx(Rock_weathering_filt3_DolSoil, sig_level = 0.1, LFC = 0)
GGPlotALDExTax(ALDEx2plot_DolSoil) + 
  ggtitle("Hyperarid dolomite vs. Hyperarid soil")
# dolomite - dust
Rock_weathering_filt3_DolDust <- subset_samples(Rock_weathering_filt3, Climate == "Hyperarid" & Source != "Loess soil")
ALDEx2plot_DolDust <- CalcALDEx(Rock_weathering_filt3_DolDust, sig_level = 0.3, LFC = 0)
GGPlotALDExTax(ALDEx2plot_DolDust) + 
  ggtitle("Hyperarid dolomite vs. Hyperarid dust")
# limestone - soil
Rock_weathering_filt3_LimeSoil <- subset_samples(Rock_weathering_filt3, Climate == "Arid" & Source != "Dust")
ALDEx2plot_LimeSoil <- CalcALDEx(Rock_weathering_filt3_LimeSoil, sig_level = 0.1, LFC = 0)
GGPlotALDExTax(ALDEx2plot_LimeSoil) + 
  ggtitle("Arid limestone vs. Arid soil")
# limestone - dust
Rock_weathering_filt3_LimeDust <- subset_samples(Rock_weathering_filt3, Climate == "Arid" & Source != "Loess soil")
ALDEx2plot_LimeDust <- CalcALDEx(Rock_weathering_filt3_LimeDust, sig_level = 0.3, LFC = 0)
GGPlotALDExTax(ALDEx2plot_LimeDust) + 
  ggtitle("Arid limestone vs. Arid dust")

ALDEx2plot_Rocks %<>% cbind(., Var1 = "Dolomite", Var2 = "Limestone")
ALDEx2plot_DolSoil %<>% cbind(., Var1 = "Dolomite", Var2 = "Loess soil")
ALDEx2plot_DolDust %<>% cbind(., Var1 = "Dolomite", Var2 = "Dust")
ALDEx2plot_LimeSoil %<>% cbind(., Var1 = "Limestone", Var2 = "Loess soil")
ALDEx2plot_LimeDust %<>% cbind(., Var1 = "Limestone", Var2 = "Dust")

ALDEx2plot_all <- bind_rows(ALDEx2plot_Rocks, ALDEx2plot_DolSoil, ALDEx2plot_DolDust, ALDEx2plot_LimeSoil, ALDEx2plot_LimeDust)
ALDEx2plot_all$Var2 %<>%
    factor() %>%  # Taxa_rank is calcuted for the taxa box plots
    fct_relevel(., "Limestone")

# paste0(percent(sum(ALDEx2plot_Rocks$effect > 0 & ALDEx2plot_Rocks$Significance == "Pass")/nrow(ALDEx2plot_Rocks)), "/", percent(sum(ALDEx2plot_Rocks$effect < 0 & ALDEx2plot_Rocks$Significance == "Pass")/nrow(ALDEx2plot_Rocks)))

Labels <- c(
  paste0("⬆", sum(ALDEx2plot_Rocks$effect > 0 & ALDEx2plot_Rocks$Significance == "Pass"), " ⬇", sum(ALDEx2plot_Rocks$effect < 0 & ALDEx2plot_Rocks$Significance == "Pass"), " (", nrow(ALDEx2plot_Rocks), ")"),
  paste0("⬆", sum(ALDEx2plot_DolSoil$effect > 0 & ALDEx2plot_DolSoil$Significance == "Pass"), " ⬇", sum(ALDEx2plot_DolSoil$effect < 0 & ALDEx2plot_DolSoil$Significance == "Pass"), " (", nrow(ALDEx2plot_DolSoil), ")"),
  paste0("⬆", sum(ALDEx2plot_DolDust$effect > 0 & ALDEx2plot_DolDust$Significance == "Pass"), " ⬇", sum(ALDEx2plot_DolDust$effect < 0 & ALDEx2plot_DolDust$Significance == "Pass"), " (", nrow(ALDEx2plot_DolDust), ")"),
  paste0("⬆", sum(ALDEx2plot_LimeSoil$effect > 0 & ALDEx2plot_LimeSoil$Significance == "Pass"), " ⬇", sum(ALDEx2plot_LimeSoil$effect < 0 & ALDEx2plot_LimeSoil$Significance == "Pass"), " (", nrow(ALDEx2plot_LimeSoil), ")"),
  paste0("⬆", sum(ALDEx2plot_LimeDust$effect > 0 & ALDEx2plot_LimeDust$Significance == "Pass"), " ⬇", sum(ALDEx2plot_LimeDust$effect < 0 & ALDEx2plot_LimeDust$Significance == "Pass"), " (", nrow(ALDEx2plot_LimeDust), ")")
)
Label_text <- bind_cols(
  unique(ALDEx2plot_all[c("Var1", "Var2")]),
  Label = Labels
  )
```
```{r ALDEx2_combined_plot, cache=T,  fig.width=12, fig.height=10}
p_aldex2_all <- GGPlotALDExTax(ALDEx2plot_all) +
  facet_grid(Var2 ~ Var1, scales = "free_y") +
  # theme(strip.background = element_blank(), strip.placement = "outside") +
  geom_text(
    data    = Label_text,
    mapping = aes(x = Inf, y = Inf, label = Label),
    hjust   = 1.1,
    vjust   = 1.6
  ) 
print(p_aldex2_all)
```

## Other plots
Other plots in the paper which are not based on sequence data
### Isotopes profile
```{r isotopes, cache=T}
Isotopes <-
  read_csv(
    "Data/Isotopes_data.csv"
  )

Isotopes %<>% 
  mutate(Mean.Arid = (`Limestone Shivta Fm. NWSH1` + `Limestone Shivta Fm. NWSH2`) / 2)
Isotopes %<>% 
  mutate(Mean.Hyperarid = (`Dolomite Gerofit Fm.UVSL5` + `Dolomite Gerofit Fm.UVSL6` ) / 2)

Isotopes2plot <- data.frame(
  Rock = factor(c(rep("Limestone", 10), rep("Dolomite", 10)), 
                levels = c("Limestone", "Dolomite")),
  Depth = rep(Isotopes$`Depth (mm)`, 2),
  Isotope = rep(Isotopes$Isotope, 2),
  min = c(
    pmin(
      Isotopes$`Limestone Shivta Fm. NWSH1`,
      Isotopes$`Limestone Shivta Fm. NWSH2`
    ),
    pmin(
      Isotopes$`Dolomite Gerofit Fm.UVSL5`,
      Isotopes$`Dolomite Gerofit Fm.UVSL6`
    )
  ),
  max = c(
    pmax(
      Isotopes$`Limestone Shivta Fm. NWSH1`,
      Isotopes$`Limestone Shivta Fm. NWSH2`
    ),
    pmax(
      Isotopes$`Dolomite Gerofit Fm.UVSL5`,
      Isotopes$`Dolomite Gerofit Fm.UVSL6`
    )
  ),
  mean = c(Isotopes$Mean.Arid, Isotopes$Mean.Hyperarid)
)

p_isotopes <-
  ggplot(Isotopes2plot, aes(y = mean, x = Depth, colour = Isotope)) +
  geom_point(size = 4, alpha = 1 / 2) +
  geom_errorbar(aes(ymin = min, ymax = max), alpha = 1/2, width = 0.2) +
  geom_line(alpha = 1 / 2) +
  coord_flip() +
  theme_cowplot(font_size = 18, font_family = f_name) +
  background_grid(major = "xy",
                  minor = "none") +
  scale_x_reverse(limits = c(4.1, -0.1), expand = c(0.01, 0.01)) +
  # scale_x_continuous(limits = c(0, 50), expand = c(0.01, 0.01)) +
  facet_grid(Rock ~ . , scales = "free_x", labeller = label_parsed) +
  scale_color_manual(values =  pom4[c(2,1)],
                     labels = c(expression(paste(delta ^ {13}, "C")),
                                expression(paste(delta ^ {18}, "O")))) +
  ylab(expression(paste(delta ^ {13}, "C / ",
                        delta ^ {18}, "O", " (", "\u2030", "VPDB",")"
  )))

p_isotopes <- plot_grid(p_isotopes, labels = "b", label_size = 20)
print(p_isotopes)
```

### Drying experiment
```{r drying, cache=T}
read_csv("Data/Drying_data_full.csv") ->
  Drying_long
Drying_long$Rock %<>% fct_relevel(., "Limestone")
Drying_long$BRC %<>% 
  fct_relevel(., "Present")
Drying_long$Sample <- with(Drying_long, paste(Rock, BRC))

Drying_mods <- tibble(Sample = character(), Intercept = numeric(), b = numeric(), a = numeric(), P = numeric(), R2 = numeric())
mods <- list()
j <- 1
for (i in unique(Drying_long$Sample)) {
  data2model <- Drying_long[Drying_long$Sample == i, ]
  colnames(data2model) <- c("Time", "Replicate", "BRC", "Rock", "RWC", "Sample")
  (mod <- lme(RWC ~ poly(Time, 2, raw = TRUE), random = ~0 + Time|Replicate, data = data2model))
  intervals(mod)
  # mod <- lm(`Residual water content (%)` ~ sqrt(1/(`Time (h)` + 1)), data = data2model)
  mods[[j]] <- mod
  Drying_mods[j, "Sample"] <- i
  Drying_mods[j, "Intercept"] <- mod$coefficients$fixed[1]
  Drying_mods[j, "b"] <- mod$coefficients$fixed[2]
  Drying_mods[j, "a"] <- mod$coefficients$fixed[3]
  Drying_mods[j, "P"] <- anova(mod)$`p-value`[2]
  Drying_mods[j, "R2"] <- r.squaredGLMM(mod)[, "R2c"]
  j <- j + 1
}

Drying_mods %>% 
  kable(., digits = c(1, 1, 2, 2, 3, 2)) %>%
  kable_styling(bootstrap_options = c("hover", "condensed", "responsive"), full_width = F)

# comapre with and without crust
data2model <- Drying_long
colnames(data2model) <- c("Time", "Replicate", "BRC", "Rock", "RWC", "Sample")
mod_all <- lme(RWC ~ poly(Time, 2, raw = TRUE), random = ~0 + Time|Replicate, data = data2model)
mod_treatment <- lme(RWC ~ poly(Time, 2, raw = TRUE) * BRC, random = ~0 + Time|Replicate, data = data2model)
anova(mod_all, mod_treatment)

# comapre limestone vs dolomite - with crust
mod_all <- lme(RWC ~ poly(Time, 2, raw = TRUE), random = ~0 + Time|Replicate, data = data2model[data2model$BRC == "Present", ])
mod_treatment <- lme(RWC ~ poly(Time, 2, raw = TRUE) * Rock, random = ~0 + Time|Replicate, data = data2model[data2model$BRC == "Present", ])
anova(mod_all, mod_treatment)

# comapre limestone vs dolomite - without crust
mod_all <- lme(RWC ~ poly(Time, 2, raw = TRUE), random = ~0 + Time|Replicate, data = data2model[data2model$BRC == "Removed", ])
mod_treatment <- lme(RWC ~ poly(Time, 2, raw = TRUE) * Rock, random = ~0 + Time|Replicate, data = data2model[data2model$BRC == "Removed", ])
anova(mod_all, mod_treatment)

# comapre with and without crust - limestone
mod_all <- lme(RWC ~ poly(Time, 2, raw = TRUE), random = ~0 + Time|Replicate, data = data2model[data2model$Rock == "Limestone", ])
mod_treatment <- lme(RWC ~ poly(Time, 2, raw = TRUE) * BRC, random = ~0 + Time|Replicate, data = data2model[data2model$Rock == "Limestone", ])
anova(mod_all, mod_treatment)

mod_all <- lme(RWC ~ poly(Time, 2, raw = TRUE), random = ~0 + Time|Replicate, data = data2model[data2model$Rock == "Dolomite", ])
mod_treatment <- lme(RWC ~ poly(Time, 2, raw = TRUE) * BRC, random = ~0 + Time|Replicate, data = data2model[data2model$Rock == "Dolomite", ])
anova(mod_all, mod_treatment)


p_drying <-
  ggplot(
    Drying_long,
    aes(
      x = `Time (h)`,
      y = `Residual water content (%)`,
      colour = Rock,
      # fill = Rock,
      shape = BRC,
      linetype = BRC
    )
  ) +
  geom_point(size = 2, alpha = 2/3) +
  # geom_smooth(method = "lm", se = FALSE, alpha = 1/2, formula = (y ~ sqrt(1/(x+1)))) +
  geom_smooth(method = "lm", se = TRUE, alpha = 1/3, formula = (y ~ poly(x, 2)), size = 1) +
  # geom_line(alpha = 1/2) +
  scale_y_continuous(limits = c(0, 100), expand = c(0.01, 0.01)) +
  scale_x_continuous(limits = c(0, 50), expand = c(0.01, 0.01)) +
  # scale_fill_manual(values = pom4) +
  scale_color_manual(values = pom4)
print(p_drying)
```

```{r session_info, echo=T, results="markdown"}
devtools::session_info()
```

## References