diff --git a/DESCRIPTION b/DESCRIPTION index 93f8f4f9..8d736667 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: MiscMetabar Type: Package Title: Miscellaneous Functions for Metabarcoding Analysis -Version: 0.10.2 +Version: 0.10.3 Authors@R: person("Adrien", "Taudière", email = "adrien.taudiere@zaclys.net", role = c("aut", "cre", "cph"), comment = c(ORCID = "0000-0003-1088-1182")) Description: Facilitate the description, transformation, exploration, and reproducibility of metabarcoding analyses. 'MiscMetabar' is mainly built on top of the 'phyloseq', 'dada2' and 'targets' R packages. It helps to build reproducible and robust bioinformatics pipelines in R. 'MiscMetabar' makes ecological analysis of alpha and beta-diversity easier, more reproducible and more powerful by integrating a large number of tools. Important features are described in Taudière A. (2023) . diff --git a/NAMESPACE b/NAMESPACE index cdd709a3..f9e6345d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -107,6 +107,7 @@ export(rename_samples_otu_table) export(reorder_taxa_pq) export(ridges_pq) export(rotl_pq) +export(sam_data_matching_names) export(sample_data_with_new_names) export(sankey_phyloseq) export(sankey_pq) @@ -166,5 +167,6 @@ importFrom(stats,runif) importFrom(stats,sd) importFrom(stats,terms) importFrom(utils,object.size) +importFrom(utils,read.csv) importFrom(utils,setTxtProgressBar) importFrom(utils,txtProgressBar) diff --git a/NEWS.md b/NEWS.md index 12620449..7df9df5f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,7 +1,16 @@ +# MiscMetabar 0.10.3 (in development) + +- Add params `type`, `na_remove` and `verbose` to `ggvenn_pq()`. The type = "nb_seq" allow to plot Venn diagram with the number of shared sequences instead of shared ASV. +- Add automatic report in json for the function `cutadapt_remove_primers()`. 
+- Add param `verbose` to `track_wkflow()` and improve examples for `track_wkflow()` and `list_fastq_files` # MiscMetabar 0.10.2 (in development) - Improve code thanks to {lintr} package +- Add option `return_file_path` to `cutadapt_remove_primers()` in order to facilitate targets pipeline +- Add function `sam_data_matching_names()` to match and verify congruence between fastq files names and sample metadata (sam_data) + + # MiscMetabar 0.10.1 > CRAN 2024-09-10 @@ -12,7 +21,7 @@ # MiscMetabar 0.9.4 - Set a seed in the example of `build_tree_pq` to resubmit to CRAN - Add a param `return_a_vector` in function `filter_trim()` to make possible to return a vector of path as it is usefull when used with `targets::tar_targets(..., format="file")`) + Add a param `return_a_vector` in function `filter_trim()` to make possible to return a vector of path as it is useful when used with `targets::tar_targets(..., format="file")`) - Make some storage amelioration by replacing `list()` by `vector(list, ...)` # MiscMetabar 0.9.3 diff --git a/R/beta_div_test.R b/R/beta_div_test.R index c49de7bf..11553491 100644 --- a/R/beta_div_test.R +++ b/R/beta_div_test.R @@ -127,6 +127,12 @@ graph_test_pq <- function(physeq, #' effect. #' @param verbose (logical, default TRUE) If TRUE, prompt some messages. #' @param ... Other arguments passed on to [vegan::adonis2()] function. +#' Note that the parameter `by` is important. If by is set to NULL +#' (default) the p-value is computed for the entire model. +#' by = NULL will assess the overall significance of all terms together, +#' by = "terms" will assess significance for each term (sequentially from first to last), +#' setting by = "margin" will assess the marginal effects of the terms (each marginal term analysed in a model with all other variables), +#' by = "onedf" will analyse one-degree-of-freedom contrasts sequentially. The argument is passed on to anova.cca. 
#' @return The function returns an anova.cca result object with a #' new column for partial R^2. See help of [vegan::adonis2()] for #' more information. @@ -134,8 +140,12 @@ graph_test_pq <- function(physeq, #' data(enterotype) #' \donttest{ #' adonis_pq(enterotype, "SeqTech*Enterotype", na_remove = TRUE) -#' adonis_pq(enterotype, "SeqTech", dist_method = "jaccard") -#' adonis_pq(enterotype, "SeqTech", dist_method = "robust.aitchison") +#' adonis_pq(enterotype, "SeqTech*Enterotype", na_remove = TRUE, by = "terms") +#' adonis_pq(enterotype, "SeqTech*Enterotype", na_remove = TRUE, by = "onedf") +#' adonis_pq(enterotype, "SeqTech*Enterotype", na_remove = TRUE, by = "margin") +#' +#' adonis_pq(enterotype, "SeqTech", dist_method = "jaccard", by = "terms") +#' adonis_pq(enterotype, "SeqTech", dist_method = "robust.aitchison", by = "terms") #' } #' @export #' @author Adrien Taudière @@ -1038,11 +1048,20 @@ plot_ancombc_pq <- #' #' @author Adrien Taudière #' @examples -#' data_fungi_mini_woNA4height <- subset_samples( -#' data_fungi_mini, -#' !is.na(data_fungi_mini@sam_data$Height) -#' ) -#' taxa_only_in_one_level(data_fungi_mini_woNA4height, "Height", "High") +#' data_fungi_mini_woNA4height <- subset_samples( +#' data_fungi_mini, +#' !is.na(data_fungi_mini@sam_data$Height) +#' ) +#' taxa_only_in_one_level(data_fungi_mini_woNA4height, "Height", "High") +#' #' # Taxa present only in low height samples +#' suppressMessages(suppressWarnings( +#' taxa_only_in_one_level(data_fungi, "Height", "Low") +#' )) +#' # Number of taxa present only in sample of time equal to 15 +#' suppressMessages(suppressWarnings( +#' length(taxa_only_in_one_level(data_fungi, "Time", "15")) +#' )) + taxa_only_in_one_level <- function(physeq, modality, level, diff --git a/R/dada_phyloseq.R b/R/dada_phyloseq.R index e1d81974..263b2cf4 100644 --- a/R/dada_phyloseq.R +++ b/R/dada_phyloseq.R @@ -236,13 +236,20 @@ clean_pq <- function(physeq, #' data(enterotype) #' if (requireNamespace("pbapply")) { #' 
track_wkflow(list(data_fungi, enterotype), taxonomy_rank = c(3, 5)) +#' track_wkflow(list("data FUNGI"=data_fungi, +#' "fastq files forward" = +#' unlist(list_fastq_files(system.file("extdata", package = "MiscMetabar"), +#' paired_end = FALSE)))) #' } track_wkflow <- function(list_of_objects, obj_names = NULL, clean_pq = FALSE, taxonomy_rank = NULL, + verbose=TRUE, ...) { - message("Compute the number of sequences") + if(verbose) { + message("Compute the number of sequences") + } if (!is.null(obj_names)) { names(list_of_objects) <- obj_names } @@ -257,7 +264,9 @@ track_wkflow <- function(list_of_objects, track_nb_seq_per_obj <- pbapply::pblapply(list_of_objects, function(object) { + if(verbose) { message(paste("Start object of class:", class(object), sep = " ")) + } if (inherits(object, "phyloseq")) { sum(object@otu_table) } else if (inherits(object, "matrix")) { @@ -292,7 +301,9 @@ track_wkflow <- function(list_of_objects, message("Compute the number of clusters") track_nb_cluster_per_obj <- pbapply::pblapply(list_of_objects, function(object) { + if(verbose) { message(paste("Start object of class:", class(object), sep = " ")) + } if (inherits(object, "phyloseq")) { ntaxa(object) } else if (inherits(object, "matrix")) { @@ -318,7 +329,9 @@ track_wkflow <- function(list_of_objects, message("Compute the number of samples") track_nb_sam_per_obj <- pbapply::pblapply(list_of_objects, function(object) { + if(verbose) { message(paste("Start object of class:", class(object), sep = " ")) + } if (inherits(object, "phyloseq")) { nsamples(object) } else if (inherits(object, "matrix")) { @@ -347,7 +360,9 @@ track_wkflow <- function(list_of_objects, message("Compute the number of values in taxonomic rank") track_nb_tax_value_per_obj <- pbapply::pblapply(list_of_objects, function(object) { + if(verbose) { message(paste("Start object of class:", class(object), sep = " ")) + } if (inherits(object, "phyloseq")) { if (taxa_are_rows(object)) { apply(object@tax_table[taxonomy_rank, ], 
1, function(x) { @@ -365,7 +380,9 @@ track_wkflow <- function(list_of_objects, names_taxonomic_rank <- pbapply::pblapply(list_of_objects, function(object) { + if(verbose) { message(paste("Start object of class:", class(object), sep = " ")) + } if (inherits(object, "phyloseq")) { if (taxa_are_rows(object)) { rownames(object@tax_table)[taxonomy_rank] @@ -1384,7 +1401,7 @@ verify_pq <- function(physeq, if (sum(is.na(physeq@sam_data)) > 0) { warning("At least one of your samples metadata columns contains NA.") } - if (sum(grepl("^[0-9]", sample_names(physeq)) > 0) && !silent) { + if (sum(grepl("^[0-9]", sample_names(physeq)) > 0)) { message( "At least one sample name start with a number. It may introduce bug in some function such @@ -2346,10 +2363,14 @@ physeq_or_string_to_dna <- function(physeq = NULL, dna_seq = NULL) { #' @param cmd_is_run (logical, default TRUE) Do the cutadapt command is run. #' If set to FALSE, the only effect of the function is to return a list of #' command to manually run in a terminal. +#' @param return_file_path (logical, default FALSE) If true, the function +#' return the path of the output folder (param `folder_output`). Useful +#' in targets workflow #' @param args_before_cutadapt (String) A one line bash command to run before #' to run cutadapt. 
For examples, "source ~/miniconda3/etc/profile.d/conda.sh && conda activate cutadaptenv &&" allow to bypass the conda init which asks to restart the shell #' -#' @return a list of command +#' @return a list of command or if `return_file_path` is TRUE, the path to +#' the output folder #' @export #' @author Adrien Taudière #' @@ -2397,12 +2418,15 @@ cutadapt_remove_primers <- function(path_to_fastq, pattern_R2 = "_R2", nb_files = Inf, cmd_is_run = TRUE, - args_before_cutadapt = "source ~/miniconda3/etc/profile.d/conda.sh && conda activate cutadaptenv && ") { + return_file_path = FALSE, + args_before_cutadapt = "source ~/miniconda3/etc/profile.d/conda.sh && conda activate cutadaptenv && " + ) { cmd <- list() if (!dir.exists(folder_output)) { dir.create(folder_output) } + if (is.null(primer_rev)) { lff <- list_fastq_files( path_to_fastq, @@ -2418,6 +2442,11 @@ cutadapt_remove_primers <- function(path_to_fastq, args_before_cutadapt, "cutadapt --cores=", nproc, + " --json=", + folder_output, + "/", + gsub(".fastq", "", gsub(".fastq.gz", "", basename(f))), + ".cutadapt.json", " --discard-untrimmed -g '", primer_fw, "' -o ", @@ -2446,6 +2475,11 @@ cutadapt_remove_primers <- function(path_to_fastq, args_before_cutadapt, "cutadapt -n 2 --cores=", nproc, + " --json=", + folder_output, + "/", + gsub(".fastq", "", gsub(".fastq.gz", "", basename(f))), + ".cutadapt.json", " --discard-untrimmed -g '", primer_fw, "' -G '", @@ -2478,7 +2512,11 @@ cutadapt_remove_primers <- function(path_to_fastq, )) unlink(paste0(tempdir(), "/script_cutadapt.sh")) } - return(cmd) + if(return_file_path){ + return(normalizePath(folder_output)) + } else { + return(cmd) + } } ################################################################################ @@ -2503,14 +2541,21 @@ cutadapt_remove_primers <- function(path_to_fastq, #' @return A vector of taxa names #' @export #' -#' @examples -#' # Taxa present only in low height samples -#' 
suppressMessages(suppressWarnings(taxa_only_in_one_level(data_fungi, "Height", "Low"))) -#' # Number of taxa present only in sample of time equal to 15 -#' suppressMessages(suppressWarnings(length(taxa_only_in_one_level(data_fungi, "Time", "15")))) -#' @seealso [ggvenn_pq()] and [upset_pq()] -#' @export #' @author Adrien Taudière +#' @examples +#' data_fungi_mini_woNA4height <- subset_samples( +#' data_fungi_mini, +#' !is.na(data_fungi_mini@sam_data$Height) +#' ) +#' taxa_only_in_one_level(data_fungi_mini_woNA4height, "Height", "High") +#' #' # Taxa present only in low height samples +#' suppressMessages(suppressWarnings( +#' taxa_only_in_one_level(data_fungi, "Height", "Low") +#' )) +#' # Number of taxa present only in sample of time equal to 15 +#' suppressMessages(suppressWarnings( +#' length(taxa_only_in_one_level(data_fungi, "Time", "15")) +#' )) taxa_only_in_one_level <- function(physeq, modality, @@ -2544,7 +2589,6 @@ taxa_only_in_one_level <- function(physeq, ################################################################################ - ################################################################################ #' Normalize OTU table using samples depth #' @description diff --git a/R/plot_functions.R b/R/plot_functions.R index 6b5a2356..a8524cda 100644 --- a/R/plot_functions.R +++ b/R/plot_functions.R @@ -1199,6 +1199,13 @@ venn_pq <- #' @param return_data_for_venn (logical, default FALSE) If TRUE, the plot is #' not returned, but the resulting dataframe to plot with ggVennDiagram package #' is returned. +#' @param verbose (logical, default TRUE) If TRUE, prompt some messages. +#' @param type If "nb_taxa" (default), the number of taxa (ASV, OTU or +#' taxonomic_rank if `taxonomic_rank` is not NULL) is +#' used in plot. If "nb_seq", the number of sequences is plotted. +#' `taxonomic_rank` is never used if type = "nb_seq". 
+#' @param na_remove (logical, default TRUE) If set to TRUE, remove samples with +#' NA in the variables set in `fact` param #' @param ... Other arguments for the `ggVennDiagram::ggVennDiagram` function #' for ex. `category.names`. #' @return A \code{\link[ggplot2]{ggplot}}2 plot representing Venn diagram of @@ -1225,6 +1232,8 @@ venn_pq <- #' data_fungi@sam_data$Height %in% c("Low", "High")) #' ggvenn_pq(data_fungi2, fact = "Height") #' +#' ggvenn_pq(data_fungi2, fact = "Height", type = "nb_seq") +#' #' ggvenn_pq(data_fungi, fact = "Height", add_nb_seq = TRUE, set_size = 4) #' ggvenn_pq(data_fungi, fact = "Height", rarefy_before_merging = TRUE) #' ggvenn_pq(data_fungi, fact = "Height", rarefy_after_merging = TRUE) + @@ -1240,11 +1249,13 @@ venn_pq <- #' geom_polygon(aes(X, Y, group = id, fill = name), #' data = ggVennDiagram::venn_regionedge(res_venn) #' ) + +#' scale_fill_manual(values = funky_color(7)) + #' # 2. set edge layer #' geom_path(aes(X, Y, color = id, group = id), #' data = ggVennDiagram::venn_setedge(res_venn), -#' show.legend = FALSE, linewidth = 3 +#' show.legend = FALSE, linewidth = 2 #' ) + +#' scale_color_manual(values = c("red", "red","blue")) + #' # 3. set label layer #' geom_text(aes(X, Y, label = name), #' data = ggVennDiagram::venn_setlabel(res_venn) @@ -1274,11 +1285,25 @@ ggvenn_pq <- function(physeq = NULL, rarefy_before_merging = FALSE, rarefy_after_merging = FALSE, return_data_for_venn = FALSE, + verbose = TRUE, + type = "nb_taxa", + na_remove = TRUE, ...) 
{ if (!is.factor(physeq@sam_data[[fact]])) { physeq@sam_data[[fact]] <- as.factor(physeq@sam_data[[fact]]) } + if (na_remove) { + new_physeq <- subset_samples_pq(physeq, !is.na(physeq@sam_data[[fact]])) + if (nsamples(physeq) - nsamples(new_physeq) > 0 && verbose) { + message(paste0( + nsamples(physeq) - nsamples(new_physeq), + " were discarded due to NA in variable fact" + )) + } + physeq <- new_physeq + } + physeq <- taxa_as_columns(physeq) if (rarefy_before_merging) { @@ -1302,7 +1327,8 @@ ggvenn_pq <- function(physeq = NULL, newphyseq <- physeq new_DF <- newphyseq@sam_data[newphyseq@sam_data[[fact]] == f, ] sample_data(newphyseq) <- sample_data(new_DF) - if (is.null(taxonomic_rank)) { + newphyseq <- clean_pq(newphyseq) + if (is.null(taxonomic_rank) || type == "nb_seq") { res[[f]] <- colnames(newphyseq@otu_table[ , colSums(newphyseq@otu_table) > min_nb_seq @@ -1316,9 +1342,15 @@ ggvenn_pq <- function(physeq = NULL, } nb_seq <- c(nb_seq, sum(physeq@otu_table[physeq@sam_data[[fact]] == f, ], na.rm = TRUE)) + + if(type == "nb_seq") { + res[[f]] <- unlist(sapply(res[[f]], function(x) { + paste0(x, "_", seq(1, taxa_sums(physeq)[[x]])) + })) + } } - if (max(nb_seq) / min(nb_seq) > 2) { + if (max(nb_seq) / min(nb_seq) > 2 && verbose) { message( paste0( "Two modalities differ greatly (more than x2) in their number of sequences (", @@ -4010,7 +4042,7 @@ plot_var_part_pq <- #' #' @inheritParams clean_pq #' @param num_modality (required) Name of the numeric column in -#' `physeq@sam_data` to plot and test against hill numberk +#' `physeq@sam_data` to plot and test against hill number #' @param hill_scales (a vector of integer) The list of q values to compute #' the hill number H^q. If Null, no hill number are computed. 
Default value #' compute the Hill number 0 (Species richness), the Hill number 1 diff --git a/R/targets_misc.R b/R/targets_misc.R index 558e1d54..1fdca990 100644 --- a/R/targets_misc.R +++ b/R/targets_misc.R @@ -19,8 +19,9 @@ #' @export #' #' @examples -#' list_fastq_files("extdata") -#' list_fastq_files("extdata", paired_end = FALSE, pattern_R1 = "") +#' list_fastq_files(system.file("extdata", package = "MiscMetabar")) +#' list_fastq_files(system.file("extdata", package = "MiscMetabar"), +#' paired_end = FALSE, pattern_R1 = "") #' #' @author Adrien Taudière @@ -173,36 +174,25 @@ filter_trim <- ) dir.create(output_rev) dir.create(output_fw) - file.rename( - paste0(output_rev, "interm"), - paste0(output_rev, "/", basename(rev)) - ) - file.rename( - paste0(output_fw, "interm"), - paste0(output_fw, "/", basename(fw)) - ) + file.rename(paste0(output_rev, "interm"), + paste0(output_rev, "/", basename(rev))) + file.rename(paste0(output_fw, "interm"), + paste0(output_fw, "/", basename(fw))) return(list("fw" = output_fw, "rv" = output_rev)) } else { - dada2::filterAndTrim( - filt = paste0(output_fw, "interm"), - fwd = fw, - ... - ) + dada2::filterAndTrim(filt = paste0(output_fw, "interm"), + fwd = fw, + ...) dir.create(output_fw) - file.rename( - paste0(output_fw, "interm"), - paste0(output_fw, "/", basename(fw)) - ) + file.rename(paste0(output_fw, "interm"), + paste0(output_fw, "/", basename(fw))) return(output_fw) } } else { if (is.null(rev)) { - dada2::filterAndTrim( - filt = output_fw, - fwd = fw, ... - ) + dada2::filterAndTrim(filt = output_fw, fwd = fw, ...) 
return(output_fw) } else { dada2::filterAndTrim( @@ -300,7 +290,8 @@ sample_data_with_new_names <- function(file_path, rename_samples <- function(phyloseq_component, names_of_samples, taxa_are_rows = FALSE) { - if (is.null(sample_names(phyloseq_component)) && inherits(phyloseq_component, "matrix")) { + if (is.null(sample_names(phyloseq_component)) && + inherits(phyloseq_component, "matrix")) { phyloseq_component <- otu_table(phyloseq_component, taxa_are_rows = taxa_are_rows) } if (length(names_of_samples) != length(sample_names(phyloseq_component))) { @@ -312,3 +303,127 @@ rename_samples <- function(phyloseq_component, return(new_pq_component) } ################################################################################ + + +################################################################################ +#' Match sample names from sam_data and fastq files +#' +#' @description +#' +#' +#' lifecycle-experimental +#' +#' Useful for targets bioinformatic pipeline. +#' +#' @param path_sam_data (Required) Path to sample data file. +#' @param sample_col_name (Required) The name of the column defining sample names in the sample data file. +#' @param path_raw_seq (Required) Path to the folder containing fastq files +#' @param pattern_remove_sam_data If not null, describe the pattern that will be deleted from +#' sam_data samples names. +#' @param pattern_remove_fastq_files If not null, describe the pattern that will be deleted from +#' fastq files names. +#' @param verbose (logical, default TRUE) If TRUE, print some additional messages. +#' @param remove_undocumented_fastq_files (logical, default FALSE) If set to TRUE +#' fastq files not present in sam_data are removed from your folder. +#' Keep a copy of those files somewhere before. +#' @param prefix Add a prefix to new samples names (ex. prefix = "samp") +#' @param ... Other parameters passed on to [utils::read.csv()] function. 
+#' @return A list of two objects : +#' - $sam_names_matching is a tibble of corresponding samples names +#' - $sam_data is a sample data files including only matching sample names +#' @importFrom utils read.csv +#' @export +#' @author Adrien Taudière +sam_data_matching_names <- function(path_sam_data, + sample_col_name, + path_raw_seq, + pattern_remove_sam_data = NULL, + pattern_remove_fastq_files = NULL, + verbose = TRUE, + remove_undocumented_fastq_files = FALSE, + prefix = NULL, + ...) { + sam_d <- read.csv(path_sam_data, ...) + names_sam_data <- sam_d[[sample_col_name]] + names_fastq_files_fullpath <- list.files(path_raw_seq) + names_fastq_files <- basename(names_fastq_files_fullpath) + + if (!is.null(pattern_remove_sam_data)) { + names_sam_data_clean <- gsub(pattern_remove_sam_data, "", names_sam_data) + if (sum(duplicated(names_sam_data_clean)) > 0) { + stop( + "Their is duplicates in samples names from sam_data when removing pattern_remove_sam_data", + names_sam_data_clean[duplicated(names_sam_data_clean)] + ) + } + } else { + names_sam_data_clean <- names_sam_data + } + + if (!is.null(pattern_remove_fastq_files)) { + names_fastq_files_clean <- gsub(pattern_remove_fastq_files, "", names_fastq_files) + } else { + names_fastq_files_clean <- names_fastq_files + } + + tib_sam_data <- tibble( + clean_sam = as.character(names_sam_data_clean), + raw_sam = as.character(names_sam_data) + ) + + tib_fastq <- tibble( + clean_fastq = as.character(names_fastq_files_clean), + raw_fastq = as.character(names_fastq_files), + raw_fastq_full_path = names_fastq_files_fullpath + ) + + tib_j <- full_join(tib_fastq, tib_sam_data, by = join_by(clean_fastq == clean_sam)) |> + rename(common_names = clean_fastq) + + if (sum(is.na(tib_j$raw_fastq)) > 0) { + message(sum(is.na(tib_j$raw_fastq)), + " samples in sam_data files are not present in fastq_files") + if (verbose) { + warning(tib_j$raw_sam[is.na(tib_j$raw_fastq)], + "not_matching_names_from_sam_data.txt") + } + } + + if 
(sum(is.na(tib_j$raw_sam)) > 0) { + message(sum(is.na(tib_j$raw_sam)), + " samples in fastq files are not present in sam_data") + if (verbose) { + warning(tib_j$raw_fastq[is.na(tib_j$raw_sam)], + "not_matching_names_from_fastq_files.txt") + } + if (remove_undocumented_fastq_files) { + if (verbose) { + warning( + "Files ", + tib_j$raw_fastq_full_path[is.na(tib_j$raw_sam)], + "will be removed from folder", + path_raw_seq + ) + } + unlink(tib_j$raw_fastq_full_path[is.na(tib_j$raw_sam)]) + } + } + + if (!is.null(prefix)) { + tib_j$common_names <- paste0(prefix, tib_j$common_names) + } + + if (is.null(pattern_remove_sam_data)) { + sam_d_new <- sam_d |> + dplyr::filter(.data[[sample_col_name]] %in% tib_j$raw_sam[!is.na(tib_j$raw_fastq)]) |> + dplyr::mutate("samples_names_common" = paste0(prefix, .data[[sample_col_name]])) |> + relocate(samples_names_common) + } else { + sam_d_new <- sam_d |> + dplyr::filter(.data[[sample_col_name]] %in% tib_j$raw_sam[!is.na(tib_j$raw_fastq)]) |> + dplyr::mutate("samples_names_common" = paste0(prefix, gsub(pattern_remove_sam_data, "", .data[[sample_col_name]]))) |> + relocate(samples_names_common) + } + return(list("sam_names_matching" = tib_j, "sam_data" = sam_d_new)) +} +################################################################################ diff --git a/README.Rmd b/README.Rmd index ca950b87..fd68ee05 100644 --- a/README.Rmd +++ b/README.Rmd @@ -86,6 +86,11 @@ For developers, I also wrote an article describing some [rules of codes](https:/ ### Summarize a physeq object ```{r example} +#| fig.alt: > +#| Four rectangles represent the four component of an example phyloseq +#| dataset. In each rectangle, some informations about the component are +#| shown. 
+ library("MiscMetabar") library("phyloseq") library("magrittr") @@ -95,20 +100,29 @@ summary_plot_pq(data_fungi) ### Alpha-diversity analysis -```{r, fig.cap="Hill number 1"} +```{r, fig.cap="Hill number 0"} +#| fig.alt: > +#| Hill number 0, aka richness are plot in function of +#| the height modality p <- MiscMetabar::hill_pq(data_fungi, fact = "Height") p$plot_Hill_0 ``` ```{r, fig.cap="Result of the Tuckey post-hoc test"} +#| fig.alt: > +#| The result of the tuckey HSD test of hill number by the +#| height modality. p$plot_tuckey ``` ### Beta-diversity analysis ```{r} +#| fig.alt: > +#| A venn diagram showing the number of shared ASV and the percentage +#| of shared ASV between the three modality of Height (low, middle and high). if (!require("ggVennDiagram", quietly = TRUE)) { - install.packages("ggVennDiagramà") + install.packages("ggVennDiagram") } ggvenn_pq(data_fungi, fact = "Height") + ggplot2::scale_fill_distiller(palette = "BuPu", direction = 1) + @@ -117,7 +131,7 @@ ggvenn_pq(data_fungi, fact = "Height") + ### Note for non-Linux users -Some functions may not work on Windows (*e.g.* `track_wflow()`, `cutadapt_remove_primers()`, `krona()`, `vsearch_clustering()`, ...). A solution is to exploit docker container, for example the using the great [rocker project](https://rocker-project.org/). +Some functions may not work on Windows (*e.g.* `track_wkflow()`, `cutadapt_remove_primers()`, `krona()`, `vsearch_clustering()`, ...). A solution is to exploit docker container, for example the using the great [rocker project](https://rocker-project.org/). Here is a list of functions with some limitations or not working at all on Windows OS: diff --git a/README.md b/README.md index 86114e03..e41a085f 100644 --- a/README.md +++ b/README.md @@ -123,7 +123,7 @@ data("data_fungi") summary_plot_pq(data_fungi) ``` - +Four rectangles represent the four component of an example phyloseq dataset. In each rectangle, some informations about the component are shown. 
### Alpha-diversity analysis @@ -142,18 +142,18 @@ p$plot_tuckey ``` r if (!require("ggVennDiagram", quietly = TRUE)) { - install.packages("ggVennDiagramà") + install.packages("ggVennDiagram") } ggvenn_pq(data_fungi, fact = "Height") + ggplot2::scale_fill_distiller(palette = "BuPu", direction = 1) + labs(title = "Share number of ASV among Height in tree") ``` - +A venn diagram showing the number of shared ASV and the percentage of shared ASV between the three modality of Height (low, middle and high). ### Note for non-Linux users -Some functions may not work on Windows (*e.g.* `track_wflow()`, +Some functions may not work on Windows (*e.g.* `track_wkflow()`, `cutadapt_remove_primers()`, `krona()`, `vsearch_clustering()`, …). A solution is to exploit docker container, for example the using the great [rocker project](https://rocker-project.org/). diff --git a/_pkgdown.yml b/_pkgdown.yml index 0099ff01..69b483c1 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -108,6 +108,7 @@ reference: - subtitle: Sample data contents: - add_info_to_sam_data + - sam_data_matching_names - subtitle: Phylogenetic tree contents: diff --git a/docs/404.html b/docs/404.html index 1345ae21..be9a96d5 100644 --- a/docs/404.html +++ b/docs/404.html @@ -27,7 +27,7 @@ MiscMetabar - 0.10.2 + 0.10.3 + + + + + +
+
+
+ +
+

+lifecycle-experimental

+ +

Useful for bioinformatic pipelines built with the targets package.

+
+ +
+

Usage

+
sam_data_matching_names(
+  path_sam_data,
+  sample_col_name,
+  path_raw_seq,
+  pattern_remove_sam_data = NULL,
+  pattern_remove_fastq_files = NULL,
+  verbose = TRUE,
+  remove_undocumented_fastq_files = FALSE,
+  prefix = NULL,
+  ...
+)
+
+ +
+

Arguments

+ + +
path_sam_data
+

(Required) Path to sample data file.

+ + +
sample_col_name
+

(Required) The name of the column defining sample names in the sample data file.

+ + +
path_raw_seq
+

(Required) Path to the folder containing fastq files

+ + +
pattern_remove_sam_data
+

If not NULL, a pattern that will be removed from +sam_data sample names.

+ + +
pattern_remove_fastq_files
+

If not NULL, a pattern that will be removed from +fastq file names.

+ + +
verbose
+

(logical, default TRUE) If TRUE, print some additional messages.

+ + +
remove_undocumented_fastq_files
+

(logical, default FALSE) If set to TRUE, +fastq files not present in sam_data are removed from your folder. +Keep a copy of those files somewhere else beforehand.

+ + +
prefix
+

Add a prefix to the new sample names (e.g. prefix = "samp")

+ + +
...
+

Other parameters passed on to the utils::read.csv() function.

+ +
+
+

Value

+

A list of two objects:

  • $sam_names_matching is a tibble of corresponding sample names

  • +
  • $sam_data is the sample data, including only the matching sample names

  • +
+
+

Author

+

Adrien Taudière

+
+ +
+ + +
+ + + +
+ + + + + + + diff --git a/docs/reference/sample_data_with_new_names.html b/docs/reference/sample_data_with_new_names.html index cc203d99..db55bcd0 100644 --- a/docs/reference/sample_data_with_new_names.html +++ b/docs/reference/sample_data_with_new_names.html @@ -13,7 +13,7 @@ MiscMetabar - 0.10.2 + 0.10.3