Merge pull request #105 from adrientaudiere/dev

adrientaudiere · Nov 27, 2024 · 2c4553b · 2c4553b
2 parents d54233e + 6abb797
commit 2c4553b
Showing 193 changed files with 1,333 additions and 531 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: MiscMetabar
 Type: Package
 Title: Miscellaneous Functions for Metabarcoding Analysis
-Version: 0.10.2
+Version: 0.10.3
 Authors@R: person("Adrien", "Taudière", email = "[email protected]",
   role = c("aut", "cre", "cph"), comment = c(ORCID = "0000-0003-1088-1182"))
 Description: Facilitate the description, transformation, exploration, and reproducibility of metabarcoding analyses. 'MiscMetabar' is mainly built on top of the 'phyloseq', 'dada2' and 'targets' R packages. It helps to build reproducible and robust bioinformatics pipelines in R. 'MiscMetabar' makes ecological analysis of alpha and beta-diversity easier, more reproducible and more powerful by integrating a large number of tools. Important features are described in Taudière A. (2023) <doi:10.21105/joss.06038>.

diff --git a/NAMESPACE b/NAMESPACE
@@ -107,6 +107,7 @@ export(rename_samples_otu_table)
 export(reorder_taxa_pq)
 export(ridges_pq)
 export(rotl_pq)
+export(sam_data_matching_names)
 export(sample_data_with_new_names)
 export(sankey_phyloseq)
 export(sankey_pq)
@@ -166,5 +167,6 @@ importFrom(stats,runif)
 importFrom(stats,sd)
 importFrom(stats,terms)
 importFrom(utils,object.size)
+importFrom(utils,read.csv)
 importFrom(utils,setTxtProgressBar)
 importFrom(utils,txtProgressBar)
diff --git a/NEWS.md b/NEWS.md
@@ -1,7 +1,16 @@
+# MiscMetabar 0.10.3 (in development)
+
+- Add params `type`, `na_remove` and `verbose` to `ggvenn_pq()`. The type = "nb_seq" allow to plot Venn diagram with the number of shared sequences instead of shared ASV. 
+- Add automatic report in json for the function `cutadapt_remove_primers()`.
+- Add param `verbose` to `track_wkflow()` and improve examples for `track_wkflow()` and `list_fastq_files`
 
 # MiscMetabar 0.10.2 (in development)
 
 - Improve code thanks to {lintr} package
+- Add option `return_file_path` to `cutadapt_remove_primers()` in order to facilitate targets pipeline
+- Add function `sam_data_matching_names()` to match and verify congruence between fastq files names and sample metadata (sam_data)
+
+
 # MiscMetabar 0.10.1
 
 > CRAN 2024-09-10
@@ -12,7 +21,7 @@
 # MiscMetabar 0.9.4 
 
 - Set a seed in the example of `build_tree_pq` to resubmit to CRAN
-   Add a param `return_a_vector` in function `filter_trim()` to make possible to return a vector of path as it is usefull when used with `targets::tar_targets(..., format="file")`)
+   Add a param `return_a_vector` in function `filter_trim()` to make possible to return a vector of path as it is useful when used with `targets::tar_targets(..., format="file")`)
 - Make some storage amelioration by replacing `list()` by `vector(list, ...)` 
 
 # MiscMetabar 0.9.3

diff --git a/R/beta_div_test.R b/R/beta_div_test.R
@@ -127,15 +127,25 @@ graph_test_pq <- function(physeq,
 #'   effect.
 #' @param verbose (logical, default TRUE) If TRUE, prompt some messages.
 #' @param ... Other arguments passed on to [vegan::adonis2()] function.
+#'   Note that the parameter `by` is important. If by is set to NULL
+#'   (default) the p-value is computed for the entire model.  
+#' 	 by = NULL will assess the overall significance of all terms together, 
+#'   by = "terms" will assess significance for each term (sequentially from first to last), 
+#'   setting by = "margin" will assess the marginal effects of the terms (each marginal term analysed in a model with all other variables), 
+#'   by = "onedf" will analyse one-degree-of-freedom contrasts sequentially. The argument is passed on to anova.cca.
 #' @return The function returns an anova.cca result object with a
 #'   new column for partial R^2. See help of [vegan::adonis2()] for
 #'   more information.
 #' @examples
 #' data(enterotype)
 #' \donttest{
 #' adonis_pq(enterotype, "SeqTech*Enterotype", na_remove = TRUE)
-#' adonis_pq(enterotype, "SeqTech", dist_method = "jaccard")
-#' adonis_pq(enterotype, "SeqTech", dist_method = "robust.aitchison")
+#' adonis_pq(enterotype, "SeqTech*Enterotype", na_remove = TRUE, by = "terms")
+#' adonis_pq(enterotype, "SeqTech*Enterotype", na_remove = TRUE, by = "onedf")
+#' adonis_pq(enterotype, "SeqTech*Enterotype", na_remove = TRUE, by = "margin")
+#' 
+#' adonis_pq(enterotype, "SeqTech", dist_method = "jaccard", by = "terms")
+#' adonis_pq(enterotype, "SeqTech", dist_method = "robust.aitchison", by = "terms")
 #' }
 #' @export
 #' @author Adrien Taudière
@@ -1038,11 +1048,20 @@ plot_ancombc_pq <-
 #'
 #' @author Adrien Taudière
 #' @examples
-#' data_fungi_mini_woNA4height <- subset_samples(
-#'   data_fungi_mini,
-#'   !is.na(data_fungi_mini@sam_data$Height)
-#' )
-#' taxa_only_in_one_level(data_fungi_mini_woNA4height, "Height", "High")
+#'  data_fungi_mini_woNA4height <- subset_samples(
+#'     data_fungi_mini,
+#'     !is.na(data_fungi_mini@sam_data$Height)
+#'   )
+#'   taxa_only_in_one_level(data_fungi_mini_woNA4height, "Height", "High")
+#'   #' # Taxa present only in low height samples
+#'   suppressMessages(suppressWarnings(
+#'     taxa_only_in_one_level(data_fungi, "Height", "Low")
+#'   ))
+#'   # Number of taxa present only in sample of time equal to 15
+#'   suppressMessages(suppressWarnings(
+#'     length(taxa_only_in_one_level(data_fungi, "Time", "15"))
+#'   ))
+
 taxa_only_in_one_level <- function(physeq,
                                    modality,
                                    level,

diff --git a/R/dada_phyloseq.R b/R/dada_phyloseq.R
@@ -236,13 +236,20 @@ clean_pq <- function(physeq,
 #' data(enterotype)
 #' if (requireNamespace("pbapply")) {
 #'   track_wkflow(list(data_fungi, enterotype), taxonomy_rank = c(3, 5))
+#'   track_wkflow(list("data FUNGI"=data_fungi,
+#'     "fastq files forward" =
+#'     unlist(list_fastq_files(system.file("extdata", package = "MiscMetabar"),
+#'     paired_end = FALSE))))
 #' }
 track_wkflow <- function(list_of_objects,
                          obj_names = NULL,
                          clean_pq = FALSE,
                          taxonomy_rank = NULL,
+                         verbose=TRUE,
                          ...) {
-  message("Compute the number of sequences")
+  if(verbose) {
+    message("Compute the number of sequences")
+  }
   if (!is.null(obj_names)) {
     names(list_of_objects) <- obj_names
   }
@@ -257,7 +264,9 @@ track_wkflow <- function(list_of_objects,
 
   track_nb_seq_per_obj <-
     pbapply::pblapply(list_of_objects, function(object) {
+      if(verbose) {
       message(paste("Start object of class:", class(object), sep = " "))
+      }
       if (inherits(object, "phyloseq")) {
         sum(object@otu_table)
       } else if (inherits(object, "matrix")) {
@@ -292,7 +301,9 @@ track_wkflow <- function(list_of_objects,
   message("Compute the number of clusters")
   track_nb_cluster_per_obj <-
     pbapply::pblapply(list_of_objects, function(object) {
+      if(verbose) {
       message(paste("Start object of class:", class(object), sep = " "))
+      }
       if (inherits(object, "phyloseq")) {
         ntaxa(object)
       } else if (inherits(object, "matrix")) {
@@ -318,7 +329,9 @@ track_wkflow <- function(list_of_objects,
   message("Compute the number of samples")
   track_nb_sam_per_obj <-
     pbapply::pblapply(list_of_objects, function(object) {
+      if(verbose) {
       message(paste("Start object of class:", class(object), sep = " "))
+      }
       if (inherits(object, "phyloseq")) {
         nsamples(object)
       } else if (inherits(object, "matrix")) {
@@ -347,7 +360,9 @@ track_wkflow <- function(list_of_objects,
     message("Compute the number of values in taxonomic rank")
     track_nb_tax_value_per_obj <-
       pbapply::pblapply(list_of_objects, function(object) {
+        if(verbose) {
         message(paste("Start object of class:", class(object), sep = " "))
+        }
         if (inherits(object, "phyloseq")) {
           if (taxa_are_rows(object)) {
             apply(object@tax_table[taxonomy_rank, ], 1, function(x) {
@@ -365,7 +380,9 @@ track_wkflow <- function(list_of_objects,
 
     names_taxonomic_rank <-
       pbapply::pblapply(list_of_objects, function(object) {
+        if(verbose) {
         message(paste("Start object of class:", class(object), sep = " "))
+        }
         if (inherits(object, "phyloseq")) {
           if (taxa_are_rows(object)) {
             rownames(object@tax_table)[taxonomy_rank]
@@ -1384,7 +1401,7 @@ verify_pq <- function(physeq,
     if (sum(is.na(physeq@sam_data)) > 0) {
       warning("At least one of your samples metadata columns contains NA.")
     }
-    if (sum(grepl("^[0-9]", sample_names(physeq)) > 0) && !silent) {
+    if (sum(grepl("^[0-9]", sample_names(physeq)) > 0)) {
       message(
         "At least one sample name start with a number.
       It may introduce bug in some function such
@@ -2346,10 +2363,14 @@ physeq_or_string_to_dna <- function(physeq = NULL, dna_seq = NULL) {
 #' @param cmd_is_run (logical, default TRUE) Do the cutadapt command is run.
 #'   If set to FALSE, the only effect of the function is to return a list of
 #'   command to manually run in a terminal.
+#' @param return_file_path (logical, default FALSE) If true, the function
+#'   return the path of the output folder (param `folder_output`). Useful
+#'   in targets workflow
 #' @param args_before_cutadapt (String) A one line bash command to run before
 #' to run cutadapt. For examples, "source ~/miniconda3/etc/profile.d/conda.sh && conda activate cutadaptenv &&" allow to bypass the conda init which asks to restart the shell
 #'
-#' @return a list of command
+#' @return a list of command or if `return_file_path` is TRUE, the path to
+#'   the output folder
 #' @export
 #' @author Adrien Taudière
 #'
@@ -2397,12 +2418,15 @@ cutadapt_remove_primers <- function(path_to_fastq,
                                     pattern_R2 = "_R2",
                                     nb_files = Inf,
                                     cmd_is_run = TRUE,
-                                    args_before_cutadapt = "source ~/miniconda3/etc/profile.d/conda.sh && conda activate cutadaptenv && ") {
+                                    return_file_path = FALSE,
+                                    args_before_cutadapt = "source ~/miniconda3/etc/profile.d/conda.sh && conda activate cutadaptenv && "
+                                    ) {
   cmd <- list()
   if (!dir.exists(folder_output)) {
     dir.create(folder_output)
   }
 
+
   if (is.null(primer_rev)) {
     lff <- list_fastq_files(
       path_to_fastq,
@@ -2418,6 +2442,11 @@ cutadapt_remove_primers <- function(path_to_fastq,
           args_before_cutadapt,
           "cutadapt --cores=",
           nproc,
+          " --json=",
+          folder_output,
+          "/",
+          gsub(".fastq", "", gsub(".fastq.gz", "", basename(f))),
+          ".cutadapt.json",
           " --discard-untrimmed -g '",
           primer_fw,
           "' -o ",
@@ -2446,6 +2475,11 @@ cutadapt_remove_primers <- function(path_to_fastq,
           args_before_cutadapt,
           "cutadapt -n 2 --cores=",
           nproc,
+          " --json=",
+          folder_output,
+          "/",
+          gsub(".fastq", "", gsub(".fastq.gz", "", basename(f))),
+          ".cutadapt.json",
           " --discard-untrimmed -g '",
           primer_fw,
           "' -G '",
@@ -2478,7 +2512,11 @@ cutadapt_remove_primers <- function(path_to_fastq,
     ))
     unlink(paste0(tempdir(), "/script_cutadapt.sh"))
   }
-  return(cmd)
+  if(return_file_path){
+    return(normalizePath(folder_output))
+  } else {
+    return(cmd)
+  }
 }
 ################################################################################
 
@@ -2503,14 +2541,21 @@ cutadapt_remove_primers <- function(path_to_fastq,
 #' @return A vector of taxa names
 #' @export
 #'
-#' @examples
-#' # Taxa present only in low height samples
-#' suppressMessages(suppressWarnings(taxa_only_in_one_level(data_fungi, "Height", "Low")))
-#' # Number of taxa present only in sample of time equal to 15
-#' suppressMessages(suppressWarnings(length(taxa_only_in_one_level(data_fungi, "Time", "15"))))
-#' @seealso [ggvenn_pq()] and [upset_pq()]
-#' @export
 #' @author Adrien Taudière
+#' @examples
+#'  data_fungi_mini_woNA4height <- subset_samples(
+#'     data_fungi_mini,
+#'     !is.na(data_fungi_mini@sam_data$Height)
+#'   )
+#'   taxa_only_in_one_level(data_fungi_mini_woNA4height, "Height", "High")
+#'   #' # Taxa present only in low height samples
+#'   suppressMessages(suppressWarnings(
+#'     taxa_only_in_one_level(data_fungi, "Height", "Low")
+#'   ))
+#'   # Number of taxa present only in sample of time equal to 15
+#'   suppressMessages(suppressWarnings(
+#'     length(taxa_only_in_one_level(data_fungi, "Time", "15"))
+#'   ))
 
 taxa_only_in_one_level <- function(physeq,
                                    modality,
@@ -2544,7 +2589,6 @@ taxa_only_in_one_level <- function(physeq,
 ################################################################################
 
 
-
 ################################################################################
 #' Normalize OTU table using samples depth
 #' @description

diff --git a/R/plot_functions.R b/R/plot_functions.R
@@ -1199,6 +1199,13 @@ venn_pq <-
 #' @param return_data_for_venn (logical, default FALSE) If TRUE, the plot is
 #'   not returned, but the resulting dataframe to plot with ggVennDiagram package
 #'   is returned.
+#' @param verbose (logical, default TRUE) If TRUE, prompt some messages.
+#' @param type If "nb_taxa" (default), the number of taxa (ASV, OTU or
+#'   taxonomic_rank if `taxonomic_rank` is not NULL) is
+#'   used in plot. If "nb_seq", the number of sequences is plotted.
+#'   `taxonomic_rank` is never used if type = "nb_seq".
+#' @param na_remove (logical, default TRUE) If set to TRUE, remove samples with
+#'   NA in the variables set in `fact` param
 #' @param ... Other arguments for the `ggVennDiagram::ggVennDiagram` function
 #'   for ex. `category.names`.
 #' @return A \code{\link[ggplot2]{ggplot}}2 plot representing Venn diagram of
@@ -1225,6 +1232,8 @@ venn_pq <-
 #'     data_fungi@sam_data$Height %in% c("Low", "High"))
 #'   ggvenn_pq(data_fungi2, fact = "Height")
 #'
+#'   ggvenn_pq(data_fungi2, fact = "Height", type = "nb_seq")
+#'
 #'   ggvenn_pq(data_fungi, fact = "Height", add_nb_seq = TRUE, set_size = 4)
 #'   ggvenn_pq(data_fungi, fact = "Height", rarefy_before_merging = TRUE)
 #'   ggvenn_pq(data_fungi, fact = "Height", rarefy_after_merging = TRUE) +
@@ -1240,11 +1249,13 @@ venn_pq <-
 #'     geom_polygon(aes(X, Y, group = id, fill = name),
 #'       data = ggVennDiagram::venn_regionedge(res_venn)
 #'     ) +
+#'     scale_fill_manual(values = funky_color(7)) +
 #'     # 2. set edge layer
 #'     geom_path(aes(X, Y, color = id, group = id),
 #'       data = ggVennDiagram::venn_setedge(res_venn),
-#'       show.legend = FALSE, linewidth = 3
+#'       show.legend = FALSE, linewidth = 2
 #'     ) +
+#'      scale_color_manual(values = c("red", "red","blue")) +
 #'     # 3. set label layer
 #'     geom_text(aes(X, Y, label = name),
 #'       data = ggVennDiagram::venn_setlabel(res_venn)
@@ -1274,11 +1285,25 @@ ggvenn_pq <- function(physeq = NULL,
                       rarefy_before_merging = FALSE,
                       rarefy_after_merging = FALSE,
                       return_data_for_venn = FALSE,
+                      verbose = TRUE,
+                      type = "nb_taxa",
+                      na_remove = TRUE,
                       ...) {
   if (!is.factor(physeq@sam_data[[fact]])) {
     physeq@sam_data[[fact]] <- as.factor(physeq@sam_data[[fact]])
   }
 
+  if (na_remove) {
+    new_physeq <- subset_samples_pq(physeq, !is.na(physeq@sam_data[[fact]]))
+    if (nsamples(physeq) - nsamples(new_physeq) > 0 && verbose) {
+      message(paste0(
+        nsamples(physeq) - nsamples(new_physeq),
+        " were discarded due to NA in variable fact"
+      ))
+    }
+    physeq <- new_physeq
+  }
+
   physeq <- taxa_as_columns(physeq)
 
   if (rarefy_before_merging) {
@@ -1302,7 +1327,8 @@ ggvenn_pq <- function(physeq = NULL,
     newphyseq <- physeq
     new_DF <- newphyseq@sam_data[newphyseq@sam_data[[fact]] == f, ]
     sample_data(newphyseq) <- sample_data(new_DF)
-    if (is.null(taxonomic_rank)) {
+    newphyseq <- clean_pq(newphyseq)
+    if (is.null(taxonomic_rank) || type == "nb_seq") {
       res[[f]] <- colnames(newphyseq@otu_table[
         ,
         colSums(newphyseq@otu_table) > min_nb_seq
@@ -1316,9 +1342,15 @@ ggvenn_pq <- function(physeq = NULL,
     }
     nb_seq <-
       c(nb_seq, sum(physeq@otu_table[physeq@sam_data[[fact]] == f, ], na.rm = TRUE))
+
+    if(type == "nb_seq") {
+      res[[f]] <- unlist(sapply(res[[f]], function(x) {
+        paste0(x, "_", seq(1, taxa_sums(physeq)[[x]]))
+      }))
+    }
   }
 
-  if (max(nb_seq) / min(nb_seq) > 2) {
+  if (max(nb_seq) / min(nb_seq) > 2 && verbose) {
     message(
       paste0(
         "Two modalities differ greatly (more than x2) in their number of sequences (",
@@ -4010,7 +4042,7 @@ plot_var_part_pq <-
 #'
 #' @inheritParams clean_pq
 #' @param num_modality (required) Name of the numeric column in
-#'   `physeq@sam_data` to plot and test against hill numberk
+#'   `physeq@sam_data` to plot and test against hill number
 #' @param hill_scales (a vector of integer) The list of q values to compute
 #'   the hill number H^q. If Null, no hill number are computed. Default value
 #'   compute the Hill number 0 (Species richness), the Hill number 1