Skip to content

Commit

Permalink
Merge pull request #105 from adrientaudiere/dev
Browse files Browse the repository at this point in the history
adrientaudiere authored Nov 27, 2024
2 parents d54233e + 6abb797 commit 2c4553b
Showing 193 changed files with 1,333 additions and 531 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: MiscMetabar
Type: Package
Title: Miscellaneous Functions for Metabarcoding Analysis
Version: 0.10.2
Version: 0.10.3
Authors@R: person("Adrien", "Taudière", email = "[email protected]",
role = c("aut", "cre", "cph"), comment = c(ORCID = "0000-0003-1088-1182"))
Description: Facilitate the description, transformation, exploration, and reproducibility of metabarcoding analyses. 'MiscMetabar' is mainly built on top of the 'phyloseq', 'dada2' and 'targets' R packages. It helps to build reproducible and robust bioinformatics pipelines in R. 'MiscMetabar' makes ecological analysis of alpha and beta-diversity easier, more reproducible and more powerful by integrating a large number of tools. Important features are described in Taudière A. (2023) <doi:10.21105/joss.06038>.
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -107,6 +107,7 @@ export(rename_samples_otu_table)
export(reorder_taxa_pq)
export(ridges_pq)
export(rotl_pq)
export(sam_data_matching_names)
export(sample_data_with_new_names)
export(sankey_phyloseq)
export(sankey_pq)
@@ -166,5 +167,6 @@ importFrom(stats,runif)
importFrom(stats,sd)
importFrom(stats,terms)
importFrom(utils,object.size)
importFrom(utils,read.csv)
importFrom(utils,setTxtProgressBar)
importFrom(utils,txtProgressBar)
11 changes: 10 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,16 @@
# MiscMetabar 0.10.3 (in development)

- Add params `type`, `na_remove` and `verbose` to `ggvenn_pq()`. The type = "nb_seq" allow to plot Venn diagram with the number of shared sequences instead of shared ASV.
- Add automatic report in json for the function `cutadapt_remove_primers()`.
- Add param `verbose` to `track_wkflow()` and improve examples for `track_wkflow()` and `list_fastq_files`

# MiscMetabar 0.10.2 (in development)

- Improve code thanks to {lintr} package
- Add option `return_file_path` to `cutadapt_remove_primers()` in order to facilitate targets pipeline
- Add function `sam_data_matching_names()` to match and verify congruence between fastq files names and sample metadata (sam_data)


# MiscMetabar 0.10.1

> CRAN 2024-09-10
@@ -12,7 +21,7 @@
# MiscMetabar 0.9.4

- Set a seed in the example of `build_tree_pq` to resubmit to CRAN
Add a param `return_a_vector` in function `filter_trim()` to make possible to return a vector of path as it is usefull when used with `targets::tar_targets(..., format="file")`)
Add a param `return_a_vector` in function `filter_trim()` to make possible to return a vector of path as it is useful when used with `targets::tar_targets(..., format="file")`)
- Make some storage amelioration by replacing `list()` by `vector(list, ...)`

# MiscMetabar 0.9.3
33 changes: 26 additions & 7 deletions R/beta_div_test.R
Original file line number Diff line number Diff line change
@@ -127,15 +127,25 @@ graph_test_pq <- function(physeq,
#' effect.
#' @param verbose (logical, default TRUE) If TRUE, prompt some messages.
#' @param ... Other arguments passed on to [vegan::adonis2()] function.
#' Note that the parameter `by` is important. If by is set to NULL
#' (default) the p-value is computed for the entire model.
#' by = NULL will assess the overall significance of all terms together,
#' by = "terms" will assess significance for each term (sequentially from first to last),
#' setting by = "margin" will assess the marginal effects of the terms (each marginal term analysed in a model with all other variables),
#' by = "onedf" will analyse one-degree-of-freedom contrasts sequentially. The argument is passed on to anova.cca.
#' @return The function returns an anova.cca result object with a
#' new column for partial R^2. See help of [vegan::adonis2()] for
#' more information.
#' @examples
#' data(enterotype)
#' \donttest{
#' adonis_pq(enterotype, "SeqTech*Enterotype", na_remove = TRUE)
#' adonis_pq(enterotype, "SeqTech", dist_method = "jaccard")
#' adonis_pq(enterotype, "SeqTech", dist_method = "robust.aitchison")
#' adonis_pq(enterotype, "SeqTech*Enterotype", na_remove = TRUE, by = "terms")
#' adonis_pq(enterotype, "SeqTech*Enterotype", na_remove = TRUE, by = "onedf")
#' adonis_pq(enterotype, "SeqTech*Enterotype", na_remove = TRUE, by = "margin")
#'
#' adonis_pq(enterotype, "SeqTech", dist_method = "jaccard", by = "terms")
#' adonis_pq(enterotype, "SeqTech", dist_method = "robust.aitchison", by = "terms")
#' }
#' @export
#' @author Adrien Taudière
@@ -1038,11 +1048,20 @@ plot_ancombc_pq <-
#'
#' @author Adrien Taudière
#' @examples
#' data_fungi_mini_woNA4height <- subset_samples(
#' data_fungi_mini,
#' !is.na(data_fungi_mini@sam_data$Height)
#' )
#' taxa_only_in_one_level(data_fungi_mini_woNA4height, "Height", "High")
#' data_fungi_mini_woNA4height <- subset_samples(
#' data_fungi_mini,
#' !is.na(data_fungi_mini@sam_data$Height)
#' )
#' taxa_only_in_one_level(data_fungi_mini_woNA4height, "Height", "High")
#' #' # Taxa present only in low height samples
#' suppressMessages(suppressWarnings(
#' taxa_only_in_one_level(data_fungi, "Height", "Low")
#' ))
#' # Number of taxa present only in sample of time equal to 15
#' suppressMessages(suppressWarnings(
#' length(taxa_only_in_one_level(data_fungi, "Time", "15"))
#' ))

taxa_only_in_one_level <- function(physeq,
modality,
level,
70 changes: 57 additions & 13 deletions R/dada_phyloseq.R
Original file line number Diff line number Diff line change
@@ -236,13 +236,20 @@ clean_pq <- function(physeq,
#' data(enterotype)
#' if (requireNamespace("pbapply")) {
#' track_wkflow(list(data_fungi, enterotype), taxonomy_rank = c(3, 5))
#' track_wkflow(list("data FUNGI"=data_fungi,
#' "fastq files forward" =
#' unlist(list_fastq_files(system.file("extdata", package = "MiscMetabar"),
#' paired_end = FALSE))))
#' }
track_wkflow <- function(list_of_objects,
obj_names = NULL,
clean_pq = FALSE,
taxonomy_rank = NULL,
verbose=TRUE,
...) {
message("Compute the number of sequences")
if(verbose) {
message("Compute the number of sequences")
}
if (!is.null(obj_names)) {
names(list_of_objects) <- obj_names
}
@@ -257,7 +264,9 @@ track_wkflow <- function(list_of_objects,

track_nb_seq_per_obj <-
pbapply::pblapply(list_of_objects, function(object) {
if(verbose) {
message(paste("Start object of class:", class(object), sep = " "))
}
if (inherits(object, "phyloseq")) {
sum(object@otu_table)
} else if (inherits(object, "matrix")) {
@@ -292,7 +301,9 @@ track_wkflow <- function(list_of_objects,
message("Compute the number of clusters")
track_nb_cluster_per_obj <-
pbapply::pblapply(list_of_objects, function(object) {
if(verbose) {
message(paste("Start object of class:", class(object), sep = " "))
}
if (inherits(object, "phyloseq")) {
ntaxa(object)
} else if (inherits(object, "matrix")) {
@@ -318,7 +329,9 @@ track_wkflow <- function(list_of_objects,
message("Compute the number of samples")
track_nb_sam_per_obj <-
pbapply::pblapply(list_of_objects, function(object) {
if(verbose) {
message(paste("Start object of class:", class(object), sep = " "))
}
if (inherits(object, "phyloseq")) {
nsamples(object)
} else if (inherits(object, "matrix")) {
@@ -347,7 +360,9 @@ track_wkflow <- function(list_of_objects,
message("Compute the number of values in taxonomic rank")
track_nb_tax_value_per_obj <-
pbapply::pblapply(list_of_objects, function(object) {
if(verbose) {
message(paste("Start object of class:", class(object), sep = " "))
}
if (inherits(object, "phyloseq")) {
if (taxa_are_rows(object)) {
apply(object@tax_table[taxonomy_rank, ], 1, function(x) {
@@ -365,7 +380,9 @@ track_wkflow <- function(list_of_objects,

names_taxonomic_rank <-
pbapply::pblapply(list_of_objects, function(object) {
if(verbose) {
message(paste("Start object of class:", class(object), sep = " "))
}
if (inherits(object, "phyloseq")) {
if (taxa_are_rows(object)) {
rownames(object@tax_table)[taxonomy_rank]
@@ -1384,7 +1401,7 @@ verify_pq <- function(physeq,
if (sum(is.na(physeq@sam_data)) > 0) {
warning("At least one of your samples metadata columns contains NA.")
}
if (sum(grepl("^[0-9]", sample_names(physeq)) > 0) && !silent) {
if (sum(grepl("^[0-9]", sample_names(physeq)) > 0)) {
message(
"At least one sample name start with a number.
It may introduce bug in some function such
@@ -2346,10 +2363,14 @@ physeq_or_string_to_dna <- function(physeq = NULL, dna_seq = NULL) {
#' @param cmd_is_run (logical, default TRUE) Do the cutadapt command is run.
#' If set to FALSE, the only effect of the function is to return a list of
#' command to manually run in a terminal.
#' @param return_file_path (logical, default FALSE) If true, the function
#' return the path of the output folder (param `folder_output`). Useful
#' in targets workflow
#' @param args_before_cutadapt (String) A one line bash command to run before
#' to run cutadapt. For examples, "source ~/miniconda3/etc/profile.d/conda.sh && conda activate cutadaptenv &&" allow to bypass the conda init which asks to restart the shell
#'
#' @return a list of command
#' @return a list of command or if `return_file_path` is TRUE, the path to
#' the output folder
#' @export
#' @author Adrien Taudière
#'
@@ -2397,12 +2418,15 @@ cutadapt_remove_primers <- function(path_to_fastq,
pattern_R2 = "_R2",
nb_files = Inf,
cmd_is_run = TRUE,
args_before_cutadapt = "source ~/miniconda3/etc/profile.d/conda.sh && conda activate cutadaptenv && ") {
return_file_path = FALSE,
args_before_cutadapt = "source ~/miniconda3/etc/profile.d/conda.sh && conda activate cutadaptenv && "
) {
cmd <- list()
if (!dir.exists(folder_output)) {
dir.create(folder_output)
}


if (is.null(primer_rev)) {
lff <- list_fastq_files(
path_to_fastq,
@@ -2418,6 +2442,11 @@ cutadapt_remove_primers <- function(path_to_fastq,
args_before_cutadapt,
"cutadapt --cores=",
nproc,
" --json=",
folder_output,
"/",
gsub(".fastq", "", gsub(".fastq.gz", "", basename(f))),
".cutadapt.json",
" --discard-untrimmed -g '",
primer_fw,
"' -o ",
@@ -2446,6 +2475,11 @@ cutadapt_remove_primers <- function(path_to_fastq,
args_before_cutadapt,
"cutadapt -n 2 --cores=",
nproc,
" --json=",
folder_output,
"/",
gsub(".fastq", "", gsub(".fastq.gz", "", basename(f))),
".cutadapt.json",
" --discard-untrimmed -g '",
primer_fw,
"' -G '",
@@ -2478,7 +2512,11 @@ cutadapt_remove_primers <- function(path_to_fastq,
))
unlink(paste0(tempdir(), "/script_cutadapt.sh"))
}
return(cmd)
if(return_file_path){
return(normalizePath(folder_output))
} else {
return(cmd)
}
}
################################################################################

@@ -2503,14 +2541,21 @@ cutadapt_remove_primers <- function(path_to_fastq,
#' @return A vector of taxa names
#' @export
#'
#' @examples
#' # Taxa present only in low height samples
#' suppressMessages(suppressWarnings(taxa_only_in_one_level(data_fungi, "Height", "Low")))
#' # Number of taxa present only in sample of time equal to 15
#' suppressMessages(suppressWarnings(length(taxa_only_in_one_level(data_fungi, "Time", "15"))))
#' @seealso [ggvenn_pq()] and [upset_pq()]
#' @export
#' @author Adrien Taudière
#' @examples
#' data_fungi_mini_woNA4height <- subset_samples(
#' data_fungi_mini,
#' !is.na(data_fungi_mini@sam_data$Height)
#' )
#' taxa_only_in_one_level(data_fungi_mini_woNA4height, "Height", "High")
#' #' # Taxa present only in low height samples
#' suppressMessages(suppressWarnings(
#' taxa_only_in_one_level(data_fungi, "Height", "Low")
#' ))
#' # Number of taxa present only in sample of time equal to 15
#' suppressMessages(suppressWarnings(
#' length(taxa_only_in_one_level(data_fungi, "Time", "15"))
#' ))

taxa_only_in_one_level <- function(physeq,
modality,
@@ -2544,7 +2589,6 @@ taxa_only_in_one_level <- function(physeq,
################################################################################



################################################################################
#' Normalize OTU table using samples depth
#' @description
40 changes: 36 additions & 4 deletions R/plot_functions.R
Original file line number Diff line number Diff line change
@@ -1199,6 +1199,13 @@ venn_pq <-
#' @param return_data_for_venn (logical, default FALSE) If TRUE, the plot is
#' not returned, but the resulting dataframe to plot with ggVennDiagram package
#' is returned.
#' @param verbose (logical, default TRUE) If TRUE, prompt some messages.
#' @param type If "nb_taxa" (default), the number of taxa (ASV, OTU or
#' taxonomic_rank if `taxonomic_rank` is not NULL) is
#' used in plot. If "nb_seq", the number of sequences is plotted.
#' `taxonomic_rank` is never used if type = "nb_seq".
#' @param na_remove (logical, default TRUE) If set to TRUE, remove samples with
#' NA in the variables set in `fact` param
#' @param ... Other arguments for the `ggVennDiagram::ggVennDiagram` function
#' for ex. `category.names`.
#' @return A \code{\link[ggplot2]{ggplot}}2 plot representing Venn diagram of
@@ -1225,6 +1232,8 @@ venn_pq <-
#' data_fungi@sam_data$Height %in% c("Low", "High"))
#' ggvenn_pq(data_fungi2, fact = "Height")
#'
#' ggvenn_pq(data_fungi2, fact = "Height", type = "nb_seq")
#'
#' ggvenn_pq(data_fungi, fact = "Height", add_nb_seq = TRUE, set_size = 4)
#' ggvenn_pq(data_fungi, fact = "Height", rarefy_before_merging = TRUE)
#' ggvenn_pq(data_fungi, fact = "Height", rarefy_after_merging = TRUE) +
@@ -1240,11 +1249,13 @@ venn_pq <-
#' geom_polygon(aes(X, Y, group = id, fill = name),
#' data = ggVennDiagram::venn_regionedge(res_venn)
#' ) +
#' scale_fill_manual(values = funky_color(7)) +
#' # 2. set edge layer
#' geom_path(aes(X, Y, color = id, group = id),
#' data = ggVennDiagram::venn_setedge(res_venn),
#' show.legend = FALSE, linewidth = 3
#' show.legend = FALSE, linewidth = 2
#' ) +
#' scale_color_manual(values = c("red", "red","blue")) +
#' # 3. set label layer
#' geom_text(aes(X, Y, label = name),
#' data = ggVennDiagram::venn_setlabel(res_venn)
@@ -1274,11 +1285,25 @@ ggvenn_pq <- function(physeq = NULL,
rarefy_before_merging = FALSE,
rarefy_after_merging = FALSE,
return_data_for_venn = FALSE,
verbose = TRUE,
type = "nb_taxa",
na_remove = TRUE,
...) {
if (!is.factor(physeq@sam_data[[fact]])) {
physeq@sam_data[[fact]] <- as.factor(physeq@sam_data[[fact]])
}

if (na_remove) {
new_physeq <- subset_samples_pq(physeq, !is.na(physeq@sam_data[[fact]]))
if (nsamples(physeq) - nsamples(new_physeq) > 0 && verbose) {
message(paste0(
nsamples(physeq) - nsamples(new_physeq),
" were discarded due to NA in variable fact"
))
}
physeq <- new_physeq
}

physeq <- taxa_as_columns(physeq)

if (rarefy_before_merging) {
@@ -1302,7 +1327,8 @@ ggvenn_pq <- function(physeq = NULL,
newphyseq <- physeq
new_DF <- newphyseq@sam_data[newphyseq@sam_data[[fact]] == f, ]
sample_data(newphyseq) <- sample_data(new_DF)
if (is.null(taxonomic_rank)) {
newphyseq <- clean_pq(newphyseq)
if (is.null(taxonomic_rank) || type == "nb_seq") {
res[[f]] <- colnames(newphyseq@otu_table[
,
colSums(newphyseq@otu_table) > min_nb_seq
@@ -1316,9 +1342,15 @@ ggvenn_pq <- function(physeq = NULL,
}
nb_seq <-
c(nb_seq, sum(physeq@otu_table[physeq@sam_data[[fact]] == f, ], na.rm = TRUE))

if(type == "nb_seq") {
res[[f]] <- unlist(sapply(res[[f]], function(x) {
paste0(x, "_", seq(1, taxa_sums(physeq)[[x]]))
}))
}
}

if (max(nb_seq) / min(nb_seq) > 2) {
if (max(nb_seq) / min(nb_seq) > 2 && verbose) {
message(
paste0(
"Two modalities differ greatly (more than x2) in their number of sequences (",
@@ -4010,7 +4042,7 @@ plot_var_part_pq <-
#'
#' @inheritParams clean_pq
#' @param num_modality (required) Name of the numeric column in
#' `physeq@sam_data` to plot and test against hill numberk
#' `physeq@sam_data` to plot and test against hill number
#' @param hill_scales (a vector of integer) The list of q values to compute
#' the hill number H^q. If Null, no hill number are computed. Default value
#' compute the Hill number 0 (Species richness), the Hill number 1
Loading

0 comments on commit 2c4553b

Please sign in to comment.