diff --git a/.github/workflows/draft-pdf.yaml b/.github/workflows/draft-pdf.yaml index 65635939..7e81bd7e 100644 --- a/.github/workflows/draft-pdf.yaml +++ b/.github/workflows/draft-pdf.yaml @@ -1,5 +1,6 @@ -on: [push] - +on: + push: + branches: [main, master] jobs: paper: runs-on: ubuntu-latest diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index e1e2e525..6251d2ed 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -2,11 +2,8 @@ # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help on: push: - branches: [main, master] - pull_request: - branches: [main, master] - release: - types: [published] + branches: + - master workflow_dispatch: name: pkgdown @@ -33,6 +30,12 @@ jobs: extra-packages: any::pkgdown, local::. needs: website + - name: Install vsearch + run: sudo apt-get install vsearch + + - name: Install blastn + run: sudo apt-get install ncbi-blast+ + - name: Build site run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) shell: Rscript {0} diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml index 74a141bd..1c4c396c 100644 --- a/.github/workflows/test-coverage.yaml +++ b/.github/workflows/test-coverage.yaml @@ -2,9 +2,8 @@ # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help on: push: - branches: [main, master] - pull_request: - branches: [main, master] + branches: + - master name: test-coverage diff --git a/DESCRIPTION b/DESCRIPTION index ea601364..2bd22cd6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -18,17 +18,20 @@ Depends: Suggests: Biostrings, circlize, + ComplexUpset, data.table, DECIPHER, DESeq2, DT, edgeR, + formattable, gghalves, ggVennDiagram, grDevices, grid, gridExtra, here, + iNEXT, knitr, lulu, metacoder, @@ -51,6 +54,7 @@ Suggests: stringr, testthat (>= 3.0.0), tibble, + tidyr, vegan, venneuler, viridis diff --git a/NAMESPACE b/NAMESPACE index a3b8f2a9..af4e025f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -20,6 +20,7 @@ export(clean_physeq) export(clean_pq) export(compare_pairs_pq) export(count_seq) +export(diff_fct_diff_class) export(dist_bycol) export(dist_pos_control) export(filter_asv_blast) @@ -33,6 +34,7 @@ export(hill_phyloseq) export(hill_pq) export(hill_tuckey_phyloseq) export(hill_tuckey_pq) +export(iNEXT_pq) export(krona) export(list_fastq_files) export(lulu_phyloseq) @@ -71,6 +73,7 @@ export(tax_datatable) export(track_wkflow) export(track_wkflow_samples) export(tsne_pq) +export(upset_pq) export(venn_phyloseq) export(venn_pq) export(verify_pq) diff --git a/NEWS.md b/NEWS.md index cd7f762a..e0298faf 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,11 @@ # MiscMetabar 0.41 (in development) - Add function `iNEXT_pq()` to calculate hill diversity using the [iNEXT](https://github.com/AnneChao/iNEXT) package. +- Add argument `paires` to `multi_biplot_pq()` in order to indicate all paires of samples we want to print. +- Improve `compare_pairs_pq()` with information about the number of shared sequences among paires +- Add function `upset_pq()` to plot upset of phyloseq object using the [ComplexUpset](https://krassowski.github.io/complex-upset/) package +- Add info (param `add_info`) in subtitle of the `hill_pq()` function + # MiscMetabar 0.40 diff --git a/R/MiscMetabar-package.R b/R/MiscMetabar-package.R index 1e413c2c..388188ba 100644 --- a/R/MiscMetabar-package.R +++ b/R/MiscMetabar-package.R @@ -9,7 +9,7 @@ NULL if (getRversion() >= "2.15.1") { utils::globalVariables(c( - ".id", "%>%", "Ab", "Abundance", "col_tax", "combn", "complement", "devtools", "e-value", "Family", "Genus", "grid.draw", "grid.layout", "group_by", "Hill_0", "Hill_1", "Hill_2", "install_github", "log2FoldChange", "logFC", "lwr", "max_Hill", "modality", "multcompLetters", "nb_values", "ott_id", "OTU", "Proportion", "pushViewport", "Query name", "rarefy", "reverse", "rgb", "reverseComplement", "rrarefy", "Species", "summarise", "tax", "tax_col", "teststat", "tnrs_match_names", "tol_induced_subtree", "upr", "upViewport", "vegdist", "viewport", "x", "x1", "X1", "x2", "y", "y1", "y2", "ymax", "ymin" + ".id", "%>%", "Ab", "Abundance", "character_method", "col_tax", "combn", "complement", "devtools", "e-value", "Family", "Genus", "grid.draw", "grid.layout", "group_by", "Hill_0", "Hill_1", "Hill_2", "install_github", "log2FoldChange", "logFC", "lwr", "max_Hill", "modality", "multcompLetters", "nb_values", "ott_id", "OTU", "Proportion", "pushViewport", "Query name", "rarefy", "reverse", "rgb", "reverseComplement", "rrarefy", "Sample", "Species", "summarise", "tax", "tax_col", "teststat", "tnrs_match_names", "tol_induced_subtree", "upr", "upViewport", "val", "vegdist", "viewport", "x", "x1", "X1", "x2", "y", "y1", "y2", "ymax", "ymin" )) } diff --git a/R/blast.R b/R/blast.R index 0ec14dd0..daec9042 100644 --- a/R/blast.R +++ b/R/blast.R @@ -20,10 +20,10 @@ #' @param e_value_cut (default: 1e-30) cut of in e-value (%) to keep result #' The BLAST E-value is the number of expected hits of similar quality (score) #' that could be found just by chance. -#' @param unique_per_seq (logical) if TRUE only return the first match for -#' each sequence in seq2search -#' @param score_filter (logical) does results are filter by score? If -#' FALSE, `id_cut`,`bit_score_cut` and `min_cover_cut` are ignored +#' @param unique_per_seq (logical, default FALSE) if TRUE only return the better match +#' (higher **bit score**) for each sequence +#' @param score_filter (logical, default TRUE) does results are filter by score? If +#' FALSE, `id_cut`,`bit_score_cut`, `e_value_cut` and `min_cover_cut` are ignored #' @param list_no_output_query (logical) does the result table include #' query sequences for which `blastn` does not find any correspondence? #' @param args_makedb Additional parameters parse to makeblastdb command @@ -34,9 +34,9 @@ #' @param keep_temporary_files (logical, default: FALSE) Do we keep temporary files #' - db.fasta (refseq transformed into a database) #' - dbase list of files (output of blastn) -#' - blast_result.txt the summary result of blastn using +#' - blast_result.txt the summary result of blastn using #' `-outfmt "6 qseqid qlen sseqid slen length pident evalue bitscore qcovs"` -#' +#' #' @seealso [MiscMetabar::blast_pq()] to use `refseq` slot as query sequences #' against un custom database. #' @@ -70,12 +70,13 @@ blast_to_phyloseq <- function(physeq, dna <- Biostrings::DNAStringSet(physeq@refseq) Biostrings::writeXStringSet(dna, paste0(tempdir(), "/", "db.fasta")) - system(paste0(blastpath, + system(paste0( + blastpath, "makeblastdb -dbtype nucl -in ", - paste0(tempdir(), "/", "db.fasta"), + paste0(tempdir(), "/", "db.fasta"), " -out ", paste0(tempdir(), "/", "dbase"), - " ", + " ", args_makedb )) @@ -97,7 +98,7 @@ blast_to_phyloseq <- function(physeq, ) if (file.info(paste0(tempdir(), "/", "blast_result.txt"))$size > 0) { blast_tab <- utils::read.table( - paste0(tempdir(), "/", "blast_result.txt"),, + paste0(tempdir(), "/", "blast_result.txt"), , sep = "\t", header = FALSE, stringsAsFactors = FALSE @@ -115,7 +116,7 @@ blast_to_phyloseq <- function(physeq, message(paste0("Temporary files are located at ", tempdir())) } - if(!blast_tab_OK){ + if (!blast_tab_OK) { message("None query sequences matched your phyloseq references sequences.") return(NULL) } @@ -132,7 +133,7 @@ blast_to_phyloseq <- function(physeq, "Query cover" ) - blast_tab <- blast_tab[order(blast_tab[, "% id. match"], decreasing = TRUE), ] + blast_tab <- blast_tab[order(blast_tab[, "bit score"], decreasing = TRUE), ] if (unique_per_seq) { blast_tab <- blast_tab[which(!duplicated(blast_tab[, 1])), ] @@ -184,9 +185,9 @@ blast_to_phyloseq <- function(physeq, #' @param keep_temporary_files (logical, default: FALSE) Do we keep temporary files #' - db.fasta (refseq transformed into a database) #' - dbase list of files (output of blastn) -#' - blast_result.txt the summary result of blastn using +#' - blast_result.txt the summary result of blastn using #' `-outfmt "6 qseqid qlen sseqid slen length pident evalue bitscore qcovs"` -#' +#' #' @seealso [MiscMetabar::blast_to_phyloseq()] to use `refseq` #' slot as a database #' @return a blast table @@ -254,7 +255,7 @@ blast_pq <- function(physeq, " -db ", database, " -out ", - paste0(tempdir(), "/", "blast_result.txt"), + paste0(tempdir(), "/", "blast_result.txt"), " -outfmt \"6 qseqid qlen sseqid slen", " length pident evalue bitscore qcovs\"", " -num_threads ", nproc, @@ -264,9 +265,9 @@ blast_pq <- function(physeq, ) } - if (file.info(paste0(tempdir(), "/", "blast_result.txt"))$size > 0) { + if (file.info(paste0(tempdir(), "/", "blast_result.txt"))$size > 0) { blast_tab <- utils::read.table( - paste0(tempdir(), "/", "blast_result.txt"),, + paste0(tempdir(), "/", "blast_result.txt"), , sep = "\t", header = FALSE, stringsAsFactors = FALSE @@ -284,7 +285,7 @@ blast_pq <- function(physeq, message(paste0("Temporary files are located at ", tempdir())) } - if(!blast_tab_OK){ + if (!blast_tab_OK) { message("None query sequences matched your phyloseq references sequences.") return(NULL) } @@ -301,7 +302,7 @@ blast_pq <- function(physeq, "Query cover" ) - blast_tab <- blast_tab[order(blast_tab[, "% id. match"], decreasing = TRUE), ] + blast_tab <- blast_tab[order(blast_tab[, "bit score"], decreasing = TRUE), ] if (unique_per_seq) { blast_tab <- blast_tab[which(!duplicated(blast_tab[, 1])), ] @@ -423,7 +424,7 @@ filter_asv_blast <- function(physeq, #' @param keep_temporary_files (logical, default: FALSE) Do we keep temporary files #' - db.fasta (refseq transformed into a database) #' - dbase list of files (output of blastn) -#' - blast_result.txt the summary result of blastn using +#' - blast_result.txt the summary result of blastn using #' `-outfmt "6 qseqid qlen sseqid slen length pident evalue bitscore qcovs"` #' @return A blast table #' @@ -468,7 +469,8 @@ blast_to_derep <- function(derep, names(dna) <- paste0(names(dna), "(", unlist(derep_occurence), "seqs)") Biostrings::writeXStringSet(dna, paste0(tempdir(), "/", "db.fasta")) - system(paste0(blastpath, + system(paste0( + blastpath, "makeblastdb -dbtype nucl -in ", paste0(tempdir(), "/", "db.fasta"), " -out ", @@ -494,9 +496,9 @@ blast_to_derep <- function(derep, ) - if (file.info(paste0(tempdir(), "/", "blast_result.txt"))$size > 0) { + if (file.info(paste0(tempdir(), "/", "blast_result.txt"))$size > 0) { blast_tab <- utils::read.table( - paste0(tempdir(), "/", "blast_result.txt"),, + paste0(tempdir(), "/", "blast_result.txt"), , sep = "\t", header = FALSE, stringsAsFactors = FALSE @@ -514,7 +516,7 @@ blast_to_derep <- function(derep, message(paste0("Temporary files are located at ", tempdir())) } - if(!blast_tab_OK){ + if (!blast_tab_OK) { message("None query sequences matched your phyloseq references sequences.") return(NULL) } @@ -533,7 +535,7 @@ blast_to_derep <- function(derep, blast_tab$occurence <- sub("seqs\\)", "", sub(".*\\(", "", blast_tab$`Sample name`, perl = TRUE), perl = TRUE) - blast_tab <- blast_tab[order(blast_tab[, "% id. match"], decreasing = TRUE), ] + blast_tab <- blast_tab[order(blast_tab[, "bit score"], decreasing = TRUE), ] if (unique_per_seq) { blast_tab <- blast_tab[which(!duplicated(blast_tab[, 1])), ] diff --git a/R/controls.R b/R/controls.R index ff9ac71c..beb232ab 100644 --- a/R/controls.R +++ b/R/controls.R @@ -43,16 +43,17 @@ search_exact_seq_pq <- function(physeq, sequences) { ################################################################################ #' Calculate ecological distance among positive controls vs #' distance for all samples -#' + +#' @description #' `r lifecycle::badge("experimental")` -#' -#' @aliases dist_pos_control -#' @details Compute distance among positive controls, +#' +#' Compute distance among positive controls, #' i.e. samples which are duplicated #' to test for variation, for example in #' (i) a step in the sampling, #' (ii) a step in the extraction, #' (iii) a step in the sequencing. +#' @aliases dist_pos_control #' @inheritParams clean_pq #' @param samples_names (required) a vector of names for samples with #' positives controls of the same samples having the same name @@ -141,7 +142,9 @@ dist_pos_control <- function(physeq, samples_names, method = "bray") { #' @examples #' data(data_fungi) #' -#' subset_taxa_tax_control(data_fungi, as.numeric(data_fungi@otu_table[, 300])) +#' subset_taxa_tax_control(data_fungi, +#' as.numeric(data_fungi@otu_table[, 300]), +#' min_diff_for_cutoff = 2) #' #' @author Adrien Taudière subset_taxa_tax_control <- diff --git a/R/dada_phyloseq.R b/R/dada_phyloseq.R index 5e00ec50..1f0df6e7 100644 --- a/R/dada_phyloseq.R +++ b/R/dada_phyloseq.R @@ -30,7 +30,7 @@ add_dna_to_phyloseq <- function(physeq) { ################################################################################ #' Clean phyloseq object by removing empty samples and taxa #' -#' @details `r lifecycle::badge("experimental")` +#' @description `r lifecycle::badge("experimental")` #' #' In addition, this function check for discrepancy (and rename) between #' (i) taxa names in refseq, taxonomy table and otu_table and between @@ -172,7 +172,7 @@ clean_pq <- function(physeq, #' Track the number of reads (= sequences), samples and cluster (e.g. ASV) #' from various objects including dada-class and derep-class. #' -#' @details +#' @description #' `r lifecycle::badge("maturing")` #' #' * List of fastq and fastg.gz files -> nb of reads and samples @@ -381,14 +381,16 @@ track_wkflow <- function( ################################################################################ #' Track the number of reads (= sequences), samples and cluster (e.g. ASV) -#' for each samples +#' for each samples. #' #' @description #' `r lifecycle::badge("experimental")` #' +#' Contrary to [track_wkflow()], only phyloseq object are possible. #' More information are available in the manual of the function [track_wkflow()] #' #' @param list_pq_obj (required): a list of object passed on to [track_wkflow()] +#' Only phyloseq object will return value because information of sample is needed #' @param ... : other args passed on to [track_wkflow()] #' #' @return A list of dataframe. cf [track_wkflow()] for more information @@ -408,8 +410,13 @@ track_wkflow_samples <- function(list_pq_obj, ...) { } res <- list() for (s in sample_names(list_pq_obj[[1]])) { - list_pq_obj_samples <- lapply(list_pq_obj, select_one_sample, sam_name = s) - res[[s]] <- track_wkflow(list_pq_obj_samples, ...) + if (inherits(list_pq_obj[[1]], "phyloseq")) { + list_pq_obj_samples <- lapply(list_pq_obj, select_one_sample, sam_name = s) + res[[s]] <- track_wkflow(list_pq_obj_samples, ...) + } else { + message(paste0(names(list_pq_obj[[1]])), " is not a phyloseq obj. Sample information can't be computed. ") + res[[s]] <- NULL + } } return(res) } @@ -420,6 +427,7 @@ track_wkflow_samples <- function(list_pq_obj, ...) { #' Recluster sequences of an object of class `physeq` #' (e.g. OTUs or ASV from dada) #' +#' @description #' `r lifecycle::badge("maturing")` #' #' @inheritParams clean_pq @@ -432,7 +440,8 @@ track_wkflow_samples <- function(list_pq_obj, ...) { #' Set the clustering method. #' - `clusterize` use the [DECIPHER::Clusterize()] fonction, #' - `vsearch` use the vsearch software (https://github.com/torognes/vsearch/) -#' with arguments `-cluster_fast` and `-strand both` +#' with arguments `--cluster_size` by default (see args `vsearch_cluster_method`) +#' and `-strand both` (see args `vsearch_args`) #' @param vsearchpath path to vsearch #' @param id (default: 0.97) level of identity to cluster #' @param tax_adjust See the man page @@ -639,11 +648,11 @@ vs_search_global <- function(physeq, if (inherits(seq2search, "character")) { seq2search <- Biostrings::DNAStringSet(seq2search) } - Biostrings::writeXStringSet(seq2search, paste0(tempdir(), "seq2search.fasta")) + Biostrings::writeXStringSet(seq2search, paste0(tempdir(), "seq2search.fasta")) seq2search <- paste0(tempdir(), "seq2search.fasta") } else if (!is.null(path_to_fasta)) { dna <- Biostrings::readDNAStringSet(path_to_fasta) - Biostrings::writeXStringSet(dna, paste0(tempdir(), "seq2search.fasta")) + Biostrings::writeXStringSet(dna, paste0(tempdir(), "seq2search.fasta")) seq2search <- paste0(tempdir(), "seq2search.fasta") } @@ -989,7 +998,7 @@ read_pq <- function(path = NULL, taxa_are_rows = FALSE, sam_names = NULL, sep_cs ################################################################################ #' Lulu reclustering of class `physeq` #' -#' @details +#' @description #' `r lifecycle::badge("experimental")` #' #' See https://www.nature.com/articles/s41467-017-01312-x for more information @@ -1121,7 +1130,7 @@ lulu_pq <- function(physeq, ################################################################################ #' Verify the validity of a phyloseq object #' -#' @details +#' @description #' `r lifecycle::badge("maturing")` #' #' Mostly for internal use in MiscMetabar functions. @@ -1142,7 +1151,7 @@ verify_pq <- function(physeq) { ################################################################################ #' Subset samples using a conditional boolean vector. #' -#' @details +#' @description #' `r lifecycle::badge("experimental")` #' #' The main objective of this function is to complete the [phyloseq::subset_samples()] @@ -1194,7 +1203,7 @@ subset_samples_pq <- function(physeq, condition) { ################################################################################ #' Subset taxa using a conditional named boolean vector. #' -#' @details +#' @description #' `r lifecycle::badge("experimental")` #' #' The main objective of this function is to complete the [phyloseq::subset_taxa()] @@ -1334,7 +1343,6 @@ select_one_sample <- function(physeq, sam_name, silent = FALSE) { #' @examples #' # example code #' -#' #' @author Adrien Taudière #' add_new_taxonomy_pq <- function(physeq, ref_fasta, suffix = NULL, ...) { diff --git a/R/plot_functions.R b/R/plot_functions.R index 2aa97093..7e683b13 100644 --- a/R/plot_functions.R +++ b/R/plot_functions.R @@ -924,7 +924,7 @@ venn_pq <- #' @return A \code{\link{ggplot}}2 plot representing Venn diagramm of #' modalities of the argument \code{factor} or if split_by is set a list #' of plots. -#' +#' @seealso [upset_pq()] #' @examples #' data(data_fungi) #' ggvenn_pq(data_fungi, fact = "Height") @@ -1106,7 +1106,9 @@ multiplot <- #' \code{\link[multcompView]{multcompLetters}} function from the package #' multcompLetters. BROKEN for the moment. #' @param add_points (logical): add jitter point on boxplot -#' +#' @param add_info (logical, default TRUE) Do we add a subtitle with +#' information about the number of samples per modality. +#' #' @return A list of 4 ggplot2 plot. #' - plot_Hill_0 : the boxplot of Hill number 0 (= species richness) #' against the variable @@ -1137,7 +1139,8 @@ hill_pq <- variable, color_fac = NA, letters = FALSE, - add_points = FALSE) { + add_points = FALSE, + add_info = TRUE) { var <- sym(variable) if (is.na(color_fac)) { color_fac <- sym(variable) @@ -1184,6 +1187,21 @@ hill_pq <- p_2 + geom_jitter(aes(y = !!var, colour = as.factor(!!color_fac)), alpha = 0.5) } + if (add_info) { + subtitle_plot <- paste0( + "Nb of samples: '", + paste0(names(table(physeq@sam_data[[variable]])), + sep = "' : ", + table(physeq@sam_data[[variable]]), collapse = " - '" + ) + ) + + p_0 <- p_0 + labs(subtitle = subtitle_plot) + p_1 <- p_1 + labs(subtitle = subtitle_plot) + p_2 <- p_2 + labs(subtitle = subtitle_plot) + + } + if (letters) { ### HILL 0 data_h0 <- @@ -1849,8 +1867,11 @@ biplot_pq <- function(physeq, #' using one factor. #' #' @inheritParams clean_pq -#' @param split_by (required) the name of the factor to make all combination +#' @param split_by (required if paires is NULL) the name of the factor to make all combination #' of couples of values +#' @param paires (required if paires is NULL) the name of the factor in physeq@sam_data` slot +#' to make plot by paires of samples. Each level must be present only two times. +#' Note that if you set paires, you also must set fact arguments to pass on to [biplot_pq()]. #' @param na_remove (logical, default TRUE) if TRUE remove all the samples #' with NA in the `split_by` variable of the `physeq@sam_data` slot #' @param ... all other parameters passed on to [biplot_pq()] @@ -1867,12 +1888,20 @@ biplot_pq <- function(physeq, #' @author Adrien Taudière multi_biplot_pq <- function(physeq, split_by = NULL, + paires = NULL, na_remove = TRUE, ...) { - if (is.null(split_by) || is.null(physeq@sam_data[[split_by]])) { + if (is.null(paires) && is.null(split_by)) { + stop("You must set one of split_by or paires.") + } else if (!is.null(paires) && !is.null(split_by)) { + stop("You must set either split_by or paires, not both.") + } else if (!is.null(split_by) && is.null(physeq@sam_data[[split_by]])) { stop("split_by must be set and must be a variable in physeq@sam_data") + } else if (!is.null(paires) && is.null(physeq@sam_data[[paires]])) { + stop("paires must be set and must be a variable in physeq@sam_data") } - if (na_remove) { + + if (na_remove && !is.null(split_by)) { new_physeq <- subset_samples_pq(physeq, !is.na(physeq@sam_data[[split_by]])) if (nsamples(physeq) - nsamples(new_physeq) > 0) { message(paste0( @@ -1883,20 +1912,28 @@ multi_biplot_pq <- function(physeq, physeq <- new_physeq } - names_split_by <- names(table(physeq@sam_data[[split_by]])) - couples <- combn(names_split_by, 2) - - p <- list() - for (c in 1:ncol(couples)) { - names_p <- paste0(couples[1, c], " - ", couples[2, c]) - new_physeq <- subset_samples_pq(physeq, physeq@sam_data[[split_by]] %in% - c(couples[1, c], couples[2, c])) - p[[names_p]] <- biplot_pq(new_physeq, - fact = split_by, - merge_sample_by = split_by, - ) - } + if (!is.null(paires)) { + p <- list() + for (c in levels(as.factor(physeq@sam_data[[paires]]))) { + new_physeq <- subset_samples_pq(physeq, physeq@sam_data[[paires]] %in% c) + p[[c]] <- biplot_pq(new_physeq, ...) + ggtitle(c) + } + } else { + names_split_by <- names(table(physeq@sam_data[[split_by]])) + couples <- combn(names_split_by, 2) + p <- list() + for (c in 1:ncol(couples)) { + names_p <- paste0(couples[1, c], " - ", couples[2, c]) + new_physeq <- subset_samples_pq(physeq, physeq@sam_data[[split_by]] %in% + c(couples[1, c], couples[2, c])) + p[[names_p]] <- biplot_pq(new_physeq, + fact = split_by, + merge_sample_by = split_by, + ... + ) + } + } return(p) } @@ -2250,7 +2287,7 @@ SRS_curve_pq <- function(physeq, clean_pq = FALSE, ...) { } -#' Visualization of two samples for comparison +#' iNterpolation and EXTrapolation of Hill numbers (with iNEXT) #' @description #' `r lifecycle::badge("experimental")` #' @@ -2259,21 +2296,24 @@ SRS_curve_pq <- function(physeq, clean_pq = FALSE, ...) { #' physeq are merged using the vector set by `merge_sample_by`. This #' merging used the [speedyseq::merge_samples2()]. In the case of #' [biplot_pq()] this must be a factor with two levels only. -#' @param ... other arguments for the ggplot function -#' @return A object of class XXX +#' @param ... other arguments for the [iNEXT::iNEXT()] function +#' @return see [iNEXT::iNEXT()] documentation #' @export #' #' @examples #' library("iNEXT") -#' iNEXT_pq(data_fungi, merge_sample_by="Height", q=1, datatype="abundance", nboot=5) +#' res_iNEXT <- iNEXT_pq(data_fungi, +#' merge_sample_by = "Height", +#' q = 1, datatype = "abundance", nboot = 5 +#' ) #' ggiNEXT(res_iNEXT) #' ggiNEXT(res_iNEXT, type = 2) #' ggiNEXT(res_iNEXT, type = 3) -#' +#' #' @author Adrien Taudière #' #' -iNEXT_pq <- function(physeq, merge_sample_by = NULL, ...){ +iNEXT_pq <- function(physeq, merge_sample_by = NULL, ...) { if (!is.null(merge_sample_by)) { physeq <- speedyseq::merge_samples2(physeq, merge_sample_by) physeq <- clean_pq(physeq, force_taxa_as_columns = TRUE) @@ -2284,3 +2324,252 @@ iNEXT_pq <- function(physeq, merge_sample_by = NULL, ...){ } + + +#' Make upset plot for phyloseq object. +#' @description +#' `r lifecycle::badge("experimental")` +#' +#' Alternative to venn plot. +#' +#' @inheritParams clean_pq +#' @param fact (required): Name of the factor to cluster samples by modalities. +#' Need to be in \code{physeq@sam_data}. +#' @param min_nb_seq minimum number of sequences by OTUs by +#' samples to take into count this OTUs in this sample. For example, +#' if min_nb_seq=2,each value of 2 or less in the OTU table +#' will not count in the venn diagramm +#' @param na_remove : if TRUE (the default), NA values in fact are removed +#' if FALSE, NA values are set to "NA" +#' @param numeric_fonction (default : sum) the function for numeric vector +#' usefull only for complex plot (see examples) +#' @param ... other arguments passed on to the [ComplexUpset::upset()] +#' +#' @return A \code{\link{ggplot}}2 plot +#' @export +#' @author Adrien Taudière +#' +#' @seealso [ggvenn_pq()] +#' @examples +#' upset_pq(data_fungi, fact = "Height", width_ratio = 0.2) +#' upset_pq(data_fungi, fact = "Height", min_nb_seq = 1000) +#' upset_pq(data_fungi, fact = "Height", na_remove = FALSE) +#' upset_pq(data_fungi, fact = "Time", width_ratio = 0.2) +#' +#' upset_pq( +#' data_fungi, +#' fact = "Time", +#' width_ratio = 0.2, +#' annotations = list( +#' "Sequences per ASV \n (log10)" = ( +#' ggplot(mapping = aes(y = log10(Abundance))) +#' + +#' geom_jitter(aes( +#' color = +#' Abundance +#' ), na.rm = TRUE) +#' + +#' geom_violin(alpha = 0.5, na.rm = TRUE) + +#' theme(legend.key.size = unit(0.2, "cm")) + +#' theme(axis.text = element_text(size = 12)) +#' ), +#' "ASV per phylum" = ( +#' ggplot(mapping = aes(fill = Phylum)) +#' + +#' geom_bar() + +#' ylab("ASV per phylum") + +#' theme(legend.key.size = unit(0.2, "cm")) + +#' theme(axis.text = element_text(size = 12)) +#' ) +#' ) +#' ) +#' +#' +#' upset_pq( +#' data_fungi, +#' fact = "Time", +#' width_ratio = 0.2, +#' numeric_fonction = mean, +#' annotations = list( +#' "Sequences per ASV \n (log10)" = ( +#' ggplot(mapping = aes(y = log10(Abundance))) +#' + +#' geom_jitter(aes( +#' color = +#' Abundance +#' ), na.rm = TRUE) +#' + +#' geom_violin(alpha = 0.5, na.rm = TRUE) + +#' theme(legend.key.size = unit(0.2, "cm")) + +#' theme(axis.text = element_text(size = 12)) +#' ), +#' "ASV per phylum" = ( +#' ggplot(mapping = aes(fill = Phylum)) +#' + +#' geom_bar() + +#' ylab("ASV per phylum") + +#' theme(legend.key.size = unit(0.2, "cm")) + +#' theme(axis.text = element_text(size = 12)) +#' ) +#' ) +#' ) +#' +#' +#' upset_pq( +#' subset_taxa(data_fungi, Phylum == "Basidiomycota"), +#' fact = "Time", +#' width_ratio = 0.2, +#' base_annotations = list(), +#' annotations = list( +#' "Sequences per ASV \n (log10)" = ( +#' ggplot(mapping = aes(y = log10(Abundance))) +#' + +#' geom_jitter(aes( +#' color = +#' Abundance +#' ), na.rm = TRUE) +#' + +#' geom_violin(alpha = 0.5, na.rm = TRUE) + +#' theme(legend.key.size = unit(0.2, "cm")) + +#' theme(axis.text = element_text(size = 12)) +#' ), +#' "ASV per phylum" = ( +#' ggplot(mapping = aes(fill = Class)) +#' + +#' geom_bar() + +#' ylab("ASV per Class") + +#' theme(legend.key.size = unit(0.2, "cm")) + +#' theme(axis.text = element_text(size = 12)) +#' ) +#' ) +#' ) +#' +#' data_fungi2 <- data_fungi +#' data_fungi2@sam_data[["Time_0"]] <- data_fungi2@sam_data$Time == 0 +#' data_fungi2@sam_data[["Height__Time_0"]] <- +#' paste0(data_fungi2@sam_data[["Height"]], "__", data_fungi2@sam_data[["Time_0"]]) +#' data_fungi2@sam_data[["Height__Time_0"]][grepl("NA", data_fungi2@sam_data[["Height__Time_0"]])] <- +#' NA +#' upset_pq(data_fungi2, fact = "Height__Time_0", width_ratio = 0.2) +upset_pq <- + function(physeq, + fact, + min_nb_seq = 0, + na_remove = TRUE, + numeric_fonction = sum, + ...) { + if (!is.null(min_nb_seq)) { + physeq <- subset_taxa_pq(physeq, taxa_sums(physeq) >= min_nb_seq) + } + + if (na_remove) { + physeq <- + subset_samples_pq(physeq, !is.na(physeq@sam_data[[fact]])) + } else { + physeq@sam_data[[fact]][is.na(physeq@sam_data[[fact]])] <- + "NA" + } + + physeq <- speedyseq::merge_samples2(physeq, fact) + + psm <- psmelt(physeq) + samp_names <- unique(psm$Sample) + psm <- + psm %>% + mutate(val = TRUE) %>% + tidyr::pivot_wider(names_from = Sample, values_from = val) + psm[samp_names][is.na(psm[samp_names])] <- FALSE + + psm <- psm %>% filter(Abundance != 0) + psm[[fact]] <- as.character(psm[[fact]]) + + psm2 <- data.frame(lapply(psm, function(col) { + tapply(col, paste0(psm$OTU), function(vec) { + diff_fct_diff_class(vec, numeric_fonction = numeric_fonction, na.rm = TRUE) + }) + })) %>% arrange(., desc(Abundance)) + + colnames(psm2) <- colnames(psm) + + p <- + ComplexUpset::upset(psm2, intersect = samp_names, ...) + xlab(fact) + return(p) + } + + +#' Compute different functions for different class of vector. +#' +#' @description +#' `r lifecycle::badge("experimental")` +#' Mainly an internal function useful in "lapply(..., tapply)" methods +#' +#' +#' @param x : a vector +#' @param numeric_fonction : a function for numeric vector. For ex. `sum` or `mean` +#' @param logical_method : A method for logical vector. One of : +#' - TRUE_if_one (default) +#' - NA_if_not_all_TRUE +#' - FALSE_if_not_all_TRUE +#' @param character_method : A method for character vector (and factor). One of : +#' - unique_or_na (default) +#' - more frequent +#' - more_frequent_without_equality +#' @param ... other arguments passed on to the numeric function (ex. na.rm=TRUE) +#' @return a single value +#' @export +#' +#' @author Adrien Taudière +diff_fct_diff_class <- + function(x, + numeric_fonction = mean, + logical_method = "TRUE_if_one", + character_method = "unique_or_na", + ...) { + if (is.character(x) || is.factor(x)) { + if (length(unique(x)) == 1) { + return(unique(x)) + } else if (character_method == "unique_or_na") { + return(NA) + } else if (character_method == "more_frequent") { + return(names(sort(table(x), decreasing = TRUE)[1])) + } else if (character_method == "more_frequent_without_equality") { + if (names(sort(table(x), decreasing = TRUE)[1]) == names(sort(table(x), decreasing = TRUE)[2])) { + return(NA) + } else { + return(names(sort(table(x), decreasing = TRUE)[1])) + } + } else { + stop(paste0(character_method, " is not a valid method for character_method params.")) + } + } else if (is.numeric(x)) { + return(numeric_fonction(x, ...)) + } else if (is.logical(x)) { + if (logical_method == "TRUE_if_one") { + if (sum(x, na.rm = TRUE) > 0) { + return(TRUE) + } else { + return(FALSE) + } + } + if (logical_method == "NA_if_not_all_TRUE") { + if (sum(x, na.rm = TRUE) > 0 && sum(!x, na.rm = TRUE) == 0) { + return(TRUE) + } else if (sum(!x, na.rm = TRUE) > 0 && sum(x, na.rm = TRUE) > 0) { + return(NA) + } else if (sum(!x, na.rm = TRUE) > 0 && sum(x, na.rm = TRUE) == 0) { + return(FALSE) + } + } + if (logical_method == "FALSE_if_not_all_TRUE") { + if (sum(x, na.rm = TRUE) > 0 && sum(!x, na.rm = TRUE) == 0) { + return(TRUE) + } else { + return(FALSE) + } + } else { + stop(paste0(logical_method, " is not a valid method for character_method params.")) + } + } else { + stop("At least one column is neither numeric nor character or logical") + } + } diff --git a/R/table_functions.R b/R/table_functions.R index bb1f1664..3ac4d29a 100644 --- a/R/table_functions.R +++ b/R/table_functions.R @@ -109,7 +109,8 @@ tax_datatable <- function(physeq, #' NA in the variables set in bifactor, modality and merge_sample_by. #' NA in variables are well managed even if na_remove = FALSE, so na_remove may #' be useless. -#' @return A tibble +#' @return A tibble with information about the number of shared ASV, shared number of sequences +#' and diversity #' @importFrom rlang .data #' @export #' @examples @@ -199,7 +200,7 @@ compare_pairs_pq <- function(physeq = NULL, sample_data(newphyseq) <- sample_data(new_DF) } if (nsamples(newphyseq) != 2) { - res[[i]] <- c(NA, NA, NA, NA, NA) + res[[i]] <- rep(NA, 8) warning("At least one case do not contain 2 samples, so NA were introduced.") } else { cond1 <- newphyseq@sam_data[[bifactor]] == lev1 @@ -216,7 +217,14 @@ compare_pairs_pq <- function(physeq = NULL, index = veg_index )[cond2], 2) - res[[i]] <- c(nb_first, nb_second, nb_shared, div_first, div_second) + nb_shared_seq <- sum(newphyseq@otu_table[, newphyseq@otu_table[cond1, ] > nb_min_seq & + newphyseq@otu_table[cond2, ] > nb_min_seq]) + + perc_seq_shared_lv1 <- round(100 * nb_shared_seq / sum(newphyseq@otu_table[, newphyseq@otu_table[cond1, ] > nb_min_seq]), 2) + + perc_seq_shared_lv2 <- round(100 * nb_shared_seq / sum(newphyseq@otu_table[, newphyseq@otu_table[cond2, ] > nb_min_seq]), 2) + + res[[i]] <- c(nb_first, nb_second, nb_shared, div_first, div_second, nb_shared_seq, perc_seq_shared_lv1, perc_seq_shared_lv2) } } @@ -236,13 +244,16 @@ compare_pairs_pq <- function(physeq = NULL, .data$V5, 3)) colnames(res_df) <- c( - paste0("nb_", lev1), - paste0("nb_", lev2), - "nb_shared", + paste0("nb_ASV_", lev1), + paste0("nb_ASV_", lev2), + "nb_shared_ASV", paste0("div_", lev1), paste0("div_", lev2), - paste0("percent_shared_", lev1), - paste0("percent_shared_", lev2), + "nb_shared_seq", + paste0("percent_shared_seq_", lev1), + paste0("percent_shared_seq_", lev2), + paste0("percent_shared_ASV_", lev1), + paste0("percent_shared_ASV_", lev2), paste0("ratio_nb_", lev1, "_", lev2), paste0("ratio_div_", lev1, "_", lev2) ) diff --git a/R/targets_misc.R b/R/targets_misc.R index 9f97dd66..4aa509e2 100644 --- a/R/targets_misc.R +++ b/R/targets_misc.R @@ -64,9 +64,8 @@ list_fastq_files <- #' #' @examples #' data(data_fungi) -#' rename_samples_otu_table(data_fungi, as.character(1:nsamples(data_fungi)), -#' taxa_are_rows = T -#' ) +#' rename_samples_otu_table(data_fungi, as.character(1:nsamples(data_fungi))) +#' rename_samples_otu_table <- function(physeq, names_of_samples) { otu_tab <- physeq@otu_table tax_in_row <- taxa_are_rows(physeq) diff --git a/README.Rmd b/README.Rmd index 837590ea..a447dc0e 100644 --- a/README.Rmd +++ b/README.Rmd @@ -28,20 +28,23 @@ There is no CRAN or bioconductor version of MiscMetabar for now (work in progres You can install the stable version from [GitHub](https://github.com/) with: -```{r, message=FALSE} +```{r, message=FALSE, eval=FALSE} install.packages("devtools") devtools::install_github("adrientaudiere/MiscMetabar") ``` You can install the developement version from [GitHub](https://github.com/) with: -```{r, message=FALSE} +```{r, message=FALSE, eval=FALSE} install.packages("devtools") devtools::install_github("adrientaudiere/MiscMetabar", ref = "dev") ``` ## Some use of MiscMetabar +See vignettes in the [MiscMetabar](https://adrientaudiere.github.io/MiscMetabar/) website for more examples. + + ### Summarize a physeq object ```{r example, message=FALSE} @@ -52,8 +55,35 @@ data("data_fungi") summary_plot_pq(data_fungi) ``` -### Circle for visualize distribution of taxa in function of samples variables +### Alpha-diversity analysis + +```{r, fig.cap="Hill number 1"} +p <- MiscMetabar::hill_pq(data_fungi, variable = "Height") +p$plot_Hill_0 +``` + +```{r, fig.cap="Result of the Tuckey post-hoc test"} +p$plot_tuckey +``` + +### Beta-diversity analysis ```{r} -circle_pq(data_fungi, "Height", taxa = "Class", add_nb_seq = F) +ggvenn_pq(data_fungi, fact = "Height") + + ggplot2::scale_fill_distiller(palette = "BuPu", direction = 1) + + labs(title = "Share number of ASV among Height in tree") +``` + +### Installation of other softwares for debian Linux distributions + +#### blastn + +```sh +sudo apt-get install ncbi-blast+ +``` + +#### vsearch + +```sh +sudo apt-get install vsearch ``` diff --git a/README.md b/README.md index 23543f2f..7849b8f1 100644 --- a/README.md +++ b/README.md @@ -23,16 +23,6 @@ with: ``` r install.packages("devtools") devtools::install_github("adrientaudiere/MiscMetabar") -#> dplyr (1.1.2 -> 1.1.3) [CRAN] -#> ── R CMD build ───────────────────────────────────────────────────────────────── -#> * checking for file ‘/tmp/RtmpluOtzG/remotes62e682f7ffc53/adrientaudiere-MiscMetabar-2cb7839/DESCRIPTION’ ... OK -#> * preparing ‘MiscMetabar’: -#> * checking DESCRIPTION meta-information ... OK -#> Avis : read_pq.Rd:26: unknown macro '\t' -#> Avis : write_pq.Rd:74: unknown macro '\t' -#> * checking for LF line-endings in source and make files and shell scripts -#> * checking for empty or unneeded directories -#> * building ‘MiscMetabar_0.34.tar.gz’ ``` You can install the developement version from @@ -41,21 +31,14 @@ You can install the developement version from ``` r install.packages("devtools") devtools::install_github("adrientaudiere/MiscMetabar", ref = "dev") -#> -#> ── R CMD build ───────────────────────────────────────────────────────────────── -#> * checking for file ‘/tmp/RtmpluOtzG/remotes62e682b7bdb84/adrientaudiere-MiscMetabar-1dc2e0c/DESCRIPTION’ ... OK -#> * preparing ‘MiscMetabar’: -#> * checking DESCRIPTION meta-information ... OK -#> * checking for LF line-endings in source and make files and shell scripts -#> * checking for empty or unneeded directories -#> * building ‘MiscMetabar_0.40.tar.gz’ -#> Warning in i.p(...): l'installation du package -#> '/tmp/RtmpluOtzG/file62e683fb2fa28/MiscMetabar_0.40.tar.gz' a eu un statut de -#> sortie non nul ``` ## Some use of MiscMetabar +See vignettes in the +[MiscMetabar](https://adrientaudiere.github.io/MiscMetabar/) website for +more examples. + ### Summarize a physeq object ``` r @@ -68,11 +51,57 @@ summary_plot_pq(data_fungi) -### Circle for visualize distribution of taxa in function of samples variables +### Alpha-diversity analysis ``` r -circle_pq(data_fungi, "Height", taxa = "Class", add_nb_seq = F) -#> Only 10 taxa are plot (41.67%). Use 'min_prop_tax' to plot more taxa +p <- MiscMetabar::hill_pq(data_fungi, variable = "Height") +#> Taxa are now in rows. +#> Cleaning suppress 0 taxa and 0 samples. +p$plot_Hill_0 ``` - +
+ +Hill number 1 +

+Hill number 1 +

+ +
+ +``` r +p$plot_tuckey +``` + +
+ +Result of the Tuckey post-hoc test +

+Result of the Tuckey post-hoc test +

+ +
+ +### Beta-diversity analysis + +``` r +ggvenn_pq(data_fungi, fact = "Height") + + ggplot2::scale_fill_distiller(palette = "BuPu", direction = 1) + + labs(title = "Share number of ASV among Height in tree") +``` + + + +### Installation of other softwares for debian Linux distributions + +#### blastn + +``` sh +sudo apt-get install ncbi-blast+ +``` + +#### vsearch + +``` sh +sudo apt-get install vsearch +``` diff --git a/_pkgdown.yml b/_pkgdown.yml index b0f0d829..4c708686 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -19,6 +19,10 @@ navbar: href: articles/alpha-div.html - text: Beta-diversity href: articles/beta-div.html + - text: Import/export and track + href: articles/import_export_track.html + - text: Filter taxa and samples + href: articles/filter.html - text: ------- - text: "For developpers" - text: Rules of code diff --git a/man/add_new_taxonomy_pq.Rd b/man/add_new_taxonomy_pq.Rd index b7b67f57..d057ae06 100644 --- a/man/add_new_taxonomy_pq.Rd +++ b/man/add_new_taxonomy_pq.Rd @@ -30,7 +30,6 @@ One of main use of this function is to add taxonomic assignment from a new datab \examples{ # example code - } \author{ Adrien Taudière diff --git a/man/asv2otu.Rd b/man/asv2otu.Rd index 88756387..05556627 100644 --- a/man/asv2otu.Rd +++ b/man/asv2otu.Rd @@ -35,7 +35,8 @@ Set the clustering method. \itemize{ \item \code{clusterize} use the \code{\link[DECIPHER:Clusterize]{DECIPHER::Clusterize()}} fonction, \item \code{vsearch} use the vsearch software (https://github.com/torognes/vsearch/) -with arguments \code{-cluster_fast} and \verb{-strand both} +with arguments \code{--cluster_size} by default (see args \code{vsearch_cluster_method}) +and \verb{-strand both} (see args \code{vsearch_args}) }} \item{id}{(default: 0.97) level of identity to cluster} diff --git a/man/blast_pq.Rd b/man/blast_pq.Rd index 648cde44..ed67baa1 100644 --- a/man/blast_pq.Rd +++ b/man/blast_pq.Rd @@ -47,11 +47,11 @@ normalized raw-score. Each increase by one doubles the required database size The BLAST E-value is the number of expected hits of similar quality (score) that could be found just by chance.} -\item{unique_per_seq}{(logical) if TRUE only return the first match for -each sequence in seq2search} +\item{unique_per_seq}{(logical, default FALSE) if TRUE only return the better match +(higher \strong{bit score}) for each sequence} -\item{score_filter}{(logical) does results are filter by score? If -FALSE, \code{id_cut},\code{bit_score_cut} and \code{min_cover_cut} are ignored} +\item{score_filter}{(logical, default TRUE) does results are filter by score? If +FALSE, \code{id_cut},\code{bit_score_cut}, \code{e_value_cut} and \code{min_cover_cut} are ignored} \item{nproc}{(default: 1) Set to number of cpus/processors to use for blast (args -num_threads diff --git a/man/blast_to_derep.Rd b/man/blast_to_derep.Rd index 16c29f39..1ad7ee0d 100644 --- a/man/blast_to_derep.Rd +++ b/man/blast_to_derep.Rd @@ -46,11 +46,11 @@ normalized raw-score. Each increase by one doubles the required database size The BLAST E-value is the number of expected hits of similar quality (score) that could be found just by chance.} -\item{unique_per_seq}{(logical) if TRUE only return the first match for -each sequence in seq2search} +\item{unique_per_seq}{(logical, default FALSE) if TRUE only return the better match +(higher \strong{bit score}) for each sequence} -\item{score_filter}{(logical) does results are filter by score? If -FALSE, \code{id_cut},\code{bit_score_cut} and \code{min_cover_cut} are ignored} +\item{score_filter}{(logical, default TRUE) does results are filter by score? If +FALSE, \code{id_cut},\code{bit_score_cut}, \code{e_value_cut} and \code{min_cover_cut} are ignored} \item{list_no_output_query}{(logical) does the result table include query sequences for which \code{blastn} does not find any correspondence?} diff --git a/man/blast_to_phyloseq.Rd b/man/blast_to_phyloseq.Rd index 96aa67c8..c083ccc8 100644 --- a/man/blast_to_phyloseq.Rd +++ b/man/blast_to_phyloseq.Rd @@ -46,11 +46,11 @@ normalized raw-score. Each increase by one doubles the required database size The BLAST E-value is the number of expected hits of similar quality (score) that could be found just by chance.} -\item{unique_per_seq}{(logical) if TRUE only return the first match for -each sequence in seq2search} +\item{unique_per_seq}{(logical, default FALSE) if TRUE only return the better match +(higher \strong{bit score}) for each sequence} -\item{score_filter}{(logical) does results are filter by score? If -FALSE, \code{id_cut},\code{bit_score_cut} and \code{min_cover_cut} are ignored} +\item{score_filter}{(logical, default TRUE) does results are filter by score? If +FALSE, \code{id_cut},\code{bit_score_cut}, \code{e_value_cut} and \code{min_cover_cut} are ignored} \item{list_no_output_query}{(logical) does the result table include query sequences for which \code{blastn} does not find any correspondence?} diff --git a/man/clean_pq.Rd b/man/clean_pq.Rd index b0083217..532a4d18 100644 --- a/man/clean_pq.Rd +++ b/man/clean_pq.Rd @@ -52,9 +52,6 @@ the ASV names in verbose information can be misleading.} A new \code{\link{phyloseq-class}} object } \description{ -Clean phyloseq object by removing empty samples and taxa -} -\details{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} In addition, this function check for discrepancy (and rename) between diff --git a/man/compare_pairs_pq.Rd b/man/compare_pairs_pq.Rd index edff4f83..8d6fdac4 100644 --- a/man/compare_pairs_pq.Rd +++ b/man/compare_pairs_pq.Rd @@ -41,7 +41,8 @@ NA in variables are well managed even if na_remove = FALSE, so na_remove may be useless.} } \value{ -A tibble +A tibble with information about the number of shared ASV, shared number of sequences +and diversity } \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} #' For the moment refseq slot need to be not Null. diff --git a/man/diff_fct_diff_class.Rd b/man/diff_fct_diff_class.Rd new file mode 100644 index 00000000..8ec6bf8a --- /dev/null +++ b/man/diff_fct_diff_class.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plot_functions.R +\name{diff_fct_diff_class} +\alias{diff_fct_diff_class} +\title{Compute different functions for different class of vector.} +\usage{ +diff_fct_diff_class( + x, + numeric_fonction = mean, + logical_method = "TRUE_if_one", + character_method = "unique_or_na", + ... +) +} +\arguments{ +\item{x}{: a vector} + +\item{numeric_fonction}{: a function for numeric vector. For ex. \code{sum} or \code{mean}} + +\item{logical_method}{: A method for logical vector. One of : +\itemize{ +\item TRUE_if_one (default) +\item NA_if_not_all_TRUE +\item FALSE_if_not_all_TRUE +}} + +\item{character_method}{: A method for character vector (and factor). One of : +\itemize{ +\item unique_or_na (default) +\item more frequent +\item more_frequent_without_equality +}} + +\item{...}{other arguments passed on to the numeric function (ex. na.rm=TRUE)} +} +\value{ +a single value +} +\description{ +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} +Mainly an internal function useful in "lapply(..., tapply)" methods +} +\author{ +Adrien Taudière +} diff --git a/man/dist_pos_control.Rd b/man/dist_pos_control.Rd index ed3a5997..eedaff09 100644 --- a/man/dist_pos_control.Rd +++ b/man/dist_pos_control.Rd @@ -25,8 +25,7 @@ A list of two data-frames with } \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} -} -\details{ + Compute distance among positive controls, i.e. samples which are duplicated to test for variation, for example in diff --git a/man/figures/README-unnamed-chunk-4-1.png b/man/figures/README-unnamed-chunk-4-1.png index 1f0c16a9..031cb5e5 100644 Binary files a/man/figures/README-unnamed-chunk-4-1.png and b/man/figures/README-unnamed-chunk-4-1.png differ diff --git a/man/figures/README-unnamed-chunk-5-1.png b/man/figures/README-unnamed-chunk-5-1.png new file mode 100644 index 00000000..ee667483 Binary files /dev/null and b/man/figures/README-unnamed-chunk-5-1.png differ diff --git a/man/figures/README-unnamed-chunk-6-1.png b/man/figures/README-unnamed-chunk-6-1.png index 5d97dae1..57d3a97f 100644 Binary files a/man/figures/README-unnamed-chunk-6-1.png and b/man/figures/README-unnamed-chunk-6-1.png differ diff --git a/man/ggvenn_pq.Rd b/man/ggvenn_pq.Rd index 7f40c733..5bf41929 100644 --- a/man/ggvenn_pq.Rd +++ b/man/ggvenn_pq.Rd @@ -72,6 +72,9 @@ data_fungi2 <- subset_samples(data_fungi, data_fungi@sam_data$Tree_name == "A10- data_fungi@sam_data$Height \%in\% c("Low", "High")) ggvenn_pq(data_fungi2, fact = "Height") } +\seealso{ +\code{\link[=upset_pq]{upset_pq()}} +} \author{ Adrien Taudière } diff --git a/man/hill_pq.Rd b/man/hill_pq.Rd index 1889e680..fd499e8b 100644 --- a/man/hill_pq.Rd +++ b/man/hill_pq.Rd @@ -4,7 +4,14 @@ \alias{hill_pq} \title{Graphical representation of hill number 0, 1 and 2 across a factor} \usage{ -hill_pq(physeq, variable, color_fac = NA, letters = FALSE, add_points = FALSE) +hill_pq( + physeq, + variable, + color_fac = NA, + letters = FALSE, + add_points = FALSE, + add_info = TRUE +) } \arguments{ \item{physeq}{(required): a \code{\link{phyloseq-class}} object obtained @@ -20,6 +27,9 @@ show letters based on p-values for comparison. Use the multcompLetters. BROKEN for the moment.} \item{add_points}{(logical): add jitter point on boxplot} + +\item{add_info}{(logical, default TRUE) Do we add a subtitle with +information about the number of samples per modality.} } \value{ A list of 4 ggplot2 plot. diff --git a/man/iNEXT_pq.Rd b/man/iNEXT_pq.Rd new file mode 100644 index 00000000..97d23101 --- /dev/null +++ b/man/iNEXT_pq.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plot_functions.R +\name{iNEXT_pq} +\alias{iNEXT_pq} +\title{iNterpolation and EXTrapolation of Hill numbers (with iNEXT)} +\usage{ +iNEXT_pq(physeq, merge_sample_by = NULL, ...) +} +\arguments{ +\item{physeq}{(required): a \code{\link{phyloseq-class}} object obtained +using the \code{phyloseq} package.} + +\item{merge_sample_by}{(default: NULL) if not \code{NULL} samples of +physeq are merged using the vector set by \code{merge_sample_by}. This +merging used the \code{\link[speedyseq:merge_samples2]{speedyseq::merge_samples2()}}. In the case of +\code{\link[=biplot_pq]{biplot_pq()}} this must be a factor with two levels only.} + +\item{...}{other arguments for the \code{\link[iNEXT:iNEXT]{iNEXT::iNEXT()}} function} +} +\value{ +see \code{\link[iNEXT:iNEXT]{iNEXT::iNEXT()}} documentation +} +\description{ +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} +} +\examples{ +library("iNEXT") +res_iNEXT <- iNEXT_pq(data_fungi, + merge_sample_by = "Height", + q = 1, datatype = "abundance", nboot = 5 +) +ggiNEXT(res_iNEXT) +ggiNEXT(res_iNEXT, type = 2) +ggiNEXT(res_iNEXT, type = 3) + +} +\author{ +Adrien Taudière +} diff --git a/man/lulu_pq.Rd b/man/lulu_pq.Rd index 6c2f7fa1..d69aecdb 100644 --- a/man/lulu_pq.Rd +++ b/man/lulu_pq.Rd @@ -47,14 +47,12 @@ value. NA value are not counted as discrepancy. } } \description{ -Lulu reclustering of class \code{physeq} -} -\details{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} See https://www.nature.com/articles/s41467-017-01312-x for more information on the method. - +} +\details{ The version of LULU is a fork of Adrien Taudière (\url{https://github.com/adrientaudiere/lulu}) from \url{https://github.com/tobiasgf/lulu} } diff --git a/man/multi_biplot_pq.Rd b/man/multi_biplot_pq.Rd index be2784cb..3f659b77 100644 --- a/man/multi_biplot_pq.Rd +++ b/man/multi_biplot_pq.Rd @@ -4,15 +4,19 @@ \alias{multi_biplot_pq} \title{Visualization of a collection of couples of samples for comparison} \usage{ -multi_biplot_pq(physeq, split_by = NULL, na_remove = TRUE, ...) +multi_biplot_pq(physeq, split_by = NULL, paires = NULL, na_remove = TRUE, ...) } \arguments{ \item{physeq}{(required): a \code{\link{phyloseq-class}} object obtained using the \code{phyloseq} package.} -\item{split_by}{(required) the name of the factor to make all combination +\item{split_by}{(required if paires is NULL) the name of the factor to make all combination of couples of values} +\item{paires}{(required if paires is NULL) the name of the factor in physeq@sam_data` slot +to make plot by paires of samples. Each level must be present only two times. +Note that if you set paires, you also must set fact arguments to pass on to \code{\link[=biplot_pq]{biplot_pq()}}.} + \item{na_remove}{(logical, default TRUE) if TRUE remove all the samples with NA in the \code{split_by} variable of the \code{physeq@sam_data} slot} diff --git a/man/rename_samples_otu_table.Rd b/man/rename_samples_otu_table.Rd index bd33de33..70f2a303 100644 --- a/man/rename_samples_otu_table.Rd +++ b/man/rename_samples_otu_table.Rd @@ -20,9 +20,8 @@ the matrix with new colnames (or rownames if \code{taxa_are_rows} is true) } \examples{ data(data_fungi) -rename_samples_otu_table(data_fungi, as.character(1:nsamples(data_fungi)), - taxa_are_rows = T -) +rename_samples_otu_table(data_fungi, as.character(1:nsamples(data_fungi))) + } \author{ Adrien Taudière diff --git a/man/subset_samples_pq.Rd b/man/subset_samples_pq.Rd index 3eb2c620..de6d6059 100644 --- a/man/subset_samples_pq.Rd +++ b/man/subset_samples_pq.Rd @@ -17,9 +17,6 @@ the number of samples} a new phyloseq object } \description{ -Subset samples using a conditional boolean vector. -} -\details{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} The main objective of this function is to complete the \code{\link[phyloseq:subset_samples-methods]{phyloseq::subset_samples()}} diff --git a/man/subset_taxa_pq.Rd b/man/subset_taxa_pq.Rd index d1de1517..43ce08b9 100644 --- a/man/subset_taxa_pq.Rd +++ b/man/subset_taxa_pq.Rd @@ -23,9 +23,6 @@ If set to TRUE, empty samples are discarded after subsetting ASV} a new phyloseq object } \description{ -Subset taxa using a conditional named boolean vector. -} -\details{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} The main objective of this function is to complete the \code{\link[phyloseq:subset_taxa-methods]{phyloseq::subset_taxa()}} diff --git a/man/subset_taxa_tax_control.Rd b/man/subset_taxa_tax_control.Rd index d4eab364..dd116758 100644 --- a/man/subset_taxa_tax_control.Rd +++ b/man/subset_taxa_tax_control.Rd @@ -43,7 +43,9 @@ A new \code{\link{phyloseq-class}} object. \examples{ data(data_fungi) -subset_taxa_tax_control(data_fungi, as.numeric(data_fungi@otu_table[, 300])) +subset_taxa_tax_control(data_fungi, + as.numeric(data_fungi@otu_table[, 300]), + min_diff_for_cutoff = 2) } \author{ diff --git a/man/track_wkflow.Rd b/man/track_wkflow.Rd index 52e97f16..5099e998 100644 --- a/man/track_wkflow.Rd +++ b/man/track_wkflow.Rd @@ -33,10 +33,6 @@ The number of sequences, clusters (e.g. OTUs, ASVs) and samples for each object. } \description{ -Track the number of reads (= sequences), samples and cluster (e.g. ASV) -from various objects including dada-class and derep-class. -} -\details{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#maturing}{\figure{lifecycle-maturing.svg}{options: alt='[Maturing]'}}}{\strong{[Maturing]}} \itemize{ \item List of fastq and fastg.gz files -> nb of reads and samples diff --git a/man/track_wkflow_samples.Rd b/man/track_wkflow_samples.Rd index 896dcb7b..ba960af9 100644 --- a/man/track_wkflow_samples.Rd +++ b/man/track_wkflow_samples.Rd @@ -3,12 +3,13 @@ \name{track_wkflow_samples} \alias{track_wkflow_samples} \title{Track the number of reads (= sequences), samples and cluster (e.g. ASV) -for each samples} +for each samples.} \usage{ track_wkflow_samples(list_pq_obj, ...) } \arguments{ -\item{list_pq_obj}{(required): a list of object passed on to \code{\link[=track_wkflow]{track_wkflow()}}} +\item{list_pq_obj}{(required): a list of object passed on to \code{\link[=track_wkflow]{track_wkflow()}} +Only phyloseq object will return value because information of sample is needed} \item{...}{: other args passed on to \code{\link[=track_wkflow]{track_wkflow()}}} } @@ -18,6 +19,7 @@ A list of dataframe. cf \code{\link[=track_wkflow]{track_wkflow()}} for more inf \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} +Contrary to \code{\link[=track_wkflow]{track_wkflow()}}, only phyloseq object are possible. More information are available in the manual of the function \code{\link[=track_wkflow]{track_wkflow()}} } \examples{ diff --git a/man/upset_pq.Rd b/man/upset_pq.Rd new file mode 100644 index 00000000..9365e854 --- /dev/null +++ b/man/upset_pq.Rd @@ -0,0 +1,151 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/plot_functions.R +\name{upset_pq} +\alias{upset_pq} +\title{Make upset plot for phyloseq object.} +\usage{ +upset_pq( + physeq, + fact, + min_nb_seq = 0, + na_remove = TRUE, + numeric_fonction = sum, + ... +) +} +\arguments{ +\item{physeq}{(required): a \code{\link{phyloseq-class}} object obtained +using the \code{phyloseq} package.} + +\item{fact}{(required): Name of the factor to cluster samples by modalities. +Need to be in \code{physeq@sam_data}.} + +\item{min_nb_seq}{minimum number of sequences by OTUs by +samples to take into count this OTUs in this sample. For example, +if min_nb_seq=2,each value of 2 or less in the OTU table +will not count in the venn diagramm} + +\item{na_remove}{: if TRUE (the default), NA values in fact are removed +if FALSE, NA values are set to "NA"} + +\item{numeric_fonction}{(default : sum) the function for numeric vector +usefull only for complex plot (see examples)} + +\item{...}{other arguments passed on to the \code{\link[ComplexUpset:upset]{ComplexUpset::upset()}}} +} +\value{ +A \code{\link{ggplot}}2 plot +} +\description{ +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} + +Alternative to venn plot. +} +\examples{ +upset_pq(data_fungi, fact = "Height", width_ratio = 0.2) +upset_pq(data_fungi, fact = "Height", min_nb_seq = 1000) +upset_pq(data_fungi, fact = "Height", na_remove = FALSE) +upset_pq(data_fungi, fact = "Time", width_ratio = 0.2) + +upset_pq( + data_fungi, + fact = "Time", + width_ratio = 0.2, + annotations = list( + "Sequences per ASV \n (log10)" = ( + ggplot(mapping = aes(y = log10(Abundance))) + + + geom_jitter(aes( + color = + Abundance + ), na.rm = TRUE) + + + geom_violin(alpha = 0.5, na.rm = TRUE) + + theme(legend.key.size = unit(0.2, "cm")) + + theme(axis.text = element_text(size = 12)) + ), + "ASV per phylum" = ( + ggplot(mapping = aes(fill = Phylum)) + + + geom_bar() + + ylab("ASV per phylum") + + theme(legend.key.size = unit(0.2, "cm")) + + theme(axis.text = element_text(size = 12)) + ) + ) +) + + +upset_pq( + data_fungi, + fact = "Time", + width_ratio = 0.2, + numeric_fonction = mean, + annotations = list( + "Sequences per ASV \n (log10)" = ( + ggplot(mapping = aes(y = log10(Abundance))) + + + geom_jitter(aes( + color = + Abundance + ), na.rm = TRUE) + + + geom_violin(alpha = 0.5, na.rm = TRUE) + + theme(legend.key.size = unit(0.2, "cm")) + + theme(axis.text = element_text(size = 12)) + ), + "ASV per phylum" = ( + ggplot(mapping = aes(fill = Phylum)) + + + geom_bar() + + ylab("ASV per phylum") + + theme(legend.key.size = unit(0.2, "cm")) + + theme(axis.text = element_text(size = 12)) + ) + ) +) + + +upset_pq( + subset_taxa(data_fungi, Phylum == "Basidiomycota"), + fact = "Time", + width_ratio = 0.2, + base_annotations = list(), + annotations = list( + "Sequences per ASV \n (log10)" = ( + ggplot(mapping = aes(y = log10(Abundance))) + + + geom_jitter(aes( + color = + Abundance + ), na.rm = TRUE) + + + geom_violin(alpha = 0.5, na.rm = TRUE) + + theme(legend.key.size = unit(0.2, "cm")) + + theme(axis.text = element_text(size = 12)) + ), + "ASV per phylum" = ( + ggplot(mapping = aes(fill = Class)) + + + geom_bar() + + ylab("ASV per Class") + + theme(legend.key.size = unit(0.2, "cm")) + + theme(axis.text = element_text(size = 12)) + ) + ) +) + +data_fungi2 <- data_fungi +data_fungi2@sam_data[["Time_0"]] <- data_fungi2@sam_data$Time == 0 +data_fungi2@sam_data[["Height__Time_0"]] <- + paste0(data_fungi2@sam_data[["Height"]], "__", data_fungi2@sam_data[["Time_0"]]) +data_fungi2@sam_data[["Height__Time_0"]][grepl("NA", data_fungi2@sam_data[["Height__Time_0"]])] <- + NA +upset_pq(data_fungi2, fact = "Height__Time_0", width_ratio = 0.2) +} +\seealso{ +\code{\link[=ggvenn_pq]{ggvenn_pq()}} +} +\author{ +Adrien Taudière +} diff --git a/man/verify_pq.Rd b/man/verify_pq.Rd index e8860099..2dc3c86d 100644 --- a/man/verify_pq.Rd +++ b/man/verify_pq.Rd @@ -14,9 +14,6 @@ using the \code{phyloseq} package.} Nothing if the phyloseq object is valid. An error in the other case. } \description{ -Verify the validity of a phyloseq object -} -\details{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#maturing}{\figure{lifecycle-maturing.svg}{options: alt='[Maturing]'}}}{\strong{[Maturing]}} Mostly for internal use in MiscMetabar functions. diff --git a/paper/compare_tools.ods b/paper/compare_tools.ods index 56df6263..491791fc 100644 Binary files a/paper/compare_tools.ods and b/paper/compare_tools.ods differ diff --git a/paper/paper.bib b/paper/paper.bib index 509a4bb6..dcbab7ee 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -223,3 +223,12 @@ @article{zhao2021 year = {2021}, publisher = {BioMed Central} } + +@article{kauserud2023, + title={ITS alchemy: On the use of ITS as a DNA marker in fungal ecology}, + author={Kauserud, H{\aa}vard}, + journal={Fungal Ecology}, + pages={101274}, + year={2023}, + publisher={Elsevier} +} diff --git a/paper/paper.md b/paper/paper.md index 8911dcae..761880c7 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -68,9 +68,9 @@ Rapid graphical description of phyloseq object is available using the function ` **ASV** (stand for *Amplicon Sequence Variant*; also called **ESV** for *Exact Amplicon Variant*) is a DNA sequences recovered from a high-throughput analysis of marker genes. **OTU** are a group of closely related individuals generated by clustering sequences based on a threshold of similarity. An ASV is a special case of OTU with a similarity threshold of 100%. -The choice between ASV and OTU is important because they lead to different results ([@joos2020], box 2 in [@tedersoo2022]). Most articles advice to make a choice depending on the question. For example, ASV may outperform OTU in describing a group of very closely related species. Moreover, ASV are comparable across different datasets (obtained using identical marker genes). On the other hand, [@tedersoo2022] report that ESV approaches overestimate richness of common fungal species (due to haplotype variation) but underestimate richness of rare species. They consequently recommend the use of OTU in fungal community metabarcoding analysis. +The choice between ASV and OTU is important because they lead to different results ([@joos2020], box 2 in [@tedersoo2022]). Most articles advice to make a choice depending on the question. For example, ASV may outperform OTU in describing a group of very closely related species. Moreover, ASV are comparable across different datasets (obtained using identical marker genes). On the other hand, [@tedersoo2022] report that ESV approaches overestimate richness of common fungal species (due to haplotype variation) but underestimate richness of rare species. They consequently recommend the use of OTU in fungal community metabarcoding analysis. Finally, [@kauserud2023] args that ASV term falls within the original OTU term and recommend to use only the OTU terms but with a concise and clear report how the OTUs were generated. -Recent articles ([@forster2019],[@antich2021]) proposed to used both jointly. They recommend (i) to use ASV to denoise the dataset and (ii) for some questions, clustering the ASV sequences into OTUs. This is the aim of the function `asv2otu()` using either `DECIPHER::Clusterize` function from R or the [vsearch](https://github.com/torognes/vsearch) software. +Recent articles ([@forster2019],[@antich2021]) proposed to used both jointly. They recommend (i) to use ASV to denoise the dataset and (ii) for some questions, clustering the ASV sequences into OTUs. This is the aim of the function `asv2otu()` using either `DECIPHER::Clusterize` function from R or the [vsearch](https://github.com/torognes/vsearch) software. ## Exploration diff --git a/tests/testthat/test_blast.R b/tests/testthat/test_blast.R index 94ab44e7..728207fc 100644 --- a/tests/testthat/test_blast.R +++ b/tests/testthat/test_blast.R @@ -22,8 +22,8 @@ if (class(blast_error_or_not) == "try-error") { expect_s3_class(blast_df <- blast_pq(df_basidio, path_db, unique_per_seq = TRUE), "data.frame") expect_s3_class(blast_df <- blast_pq(df_basidio, path_db, score_filter = FALSE), "data.frame") expect_s3_class(blast_df <- blast_pq(df_basidio, path_db, unique_per_seq = FALSE, score_filter = FALSE), "data.frame") - expect_s3_class(blast_df <- blast_pq(df_basidio, database="inst/extdata/dbase"), "data.frame") - expect_error(blast_df <- blast_pq(df_basidio, fasta_for_db = path_db, database="inst/extdata/dbase")) + expect_s3_class(blast_df <- blast_pq(df_basidio, database = "inst/extdata/dbase"), "data.frame") + expect_error(blast_df <- blast_pq(df_basidio, fasta_for_db = path_db, database = "inst/extdata/dbase")) expect_error(blast_df <- blast_pq(df_basidio)) }) diff --git a/tests/testthat/test_figures_beta_div.R b/tests/testthat/test_figures_beta_div.R index 81bceefe..4bd6cf06 100644 --- a/tests/testthat/test_figures_beta_div.R +++ b/tests/testthat/test_figures_beta_div.R @@ -83,3 +83,14 @@ test_that("ggvenn_pq works with data_fungi dataset", { expect_error(ggvenn_pq(data_fungi)) expect_s3_class(ggvenn_pq(data_fungi, "Height"), "ggplot") }) + + +test_that("upset_pq works with data_fungi dataset", { + expect_silent(suppressMessages(upset_pq(data_fungi, "Height"))) + expect_s3_class(upset_pq(data_fungi, "Height"), "ggplot") + expect_s3_class(upset_pq(data_fungi, "Time"), "ggplot") + expect_s3_class(upset_pq(data_fungi, "Time", min_nb_seq = 10), "ggplot") + expect_s3_class(upset_pq(data_fungi, "Time", numeric_fonction = mean), "ggplot") + expect_error(upset_pq(data_fungi)) +}) + diff --git a/tests/testthat/test_figures_biplot.R b/tests/testthat/test_figures_biplot.R index 298e51de..66ea5d85 100644 --- a/tests/testthat/test_figures_biplot.R +++ b/tests/testthat/test_figures_biplot.R @@ -17,8 +17,13 @@ test_that("biplot_pq works", { test_that("multi_biplot_pq works with data_fungi dataset", { p1 <- multi_biplot_pq(data_fungi_abun, split_by = "Time", na_remove = FALSE) p2 <- multi_biplot_pq(data_fungi_abun, "Height") + data_fungi_abun@sam_data$Random_paires <- as.factor(sample(rep(1:(nsamples(data_fungi_abun) / 2), 2))) + p3 <- multi_biplot_pq(data_fungi_abun, paires = "Random_paires") expect_s3_class(p1[[1]], "ggplot") expect_type(p1, "list") expect_s3_class(p2[[1]], "ggplot") expect_type(p2, "list") + expect_s3_class(p3[[1]], "ggplot") + expect_type(p3, "list") + expect_equal(length(p3), 85) }) diff --git a/tests/testthat/test_table_functions.R b/tests/testthat/test_table_functions.R index 5be7548b..7d947dc1 100644 --- a/tests/testthat/test_table_functions.R +++ b/tests/testthat/test_table_functions.R @@ -26,9 +26,9 @@ data_fungi_low_high_withNA@sam_data[["Height"]][1] <- NA test_that(" compare_pairs_pq function works fine with data_fungi dataset", { expect_s3_class(compare_pairs_pq(data_fungi_low_high, bifactor = "Height", merge_sample_by = "Height"), "tbl_df") expect_message(expect_message(compare_pairs_pq(data_fungi_low_high_withNA, bifactor = "Height", merge_sample_by = "Height"))) - expect_equal(dim(compare_pairs_pq(data_fungi_low_high, bifactor = "Height", merge_sample_by = "Height")), c(1, 10)) + expect_equal(dim(compare_pairs_pq(data_fungi_low_high, bifactor = "Height", merge_sample_by = "Height")), c(1, 13)) expect_s3_class(compare_pairs_pq(data_fungi_low_high, bifactor = "Height", merge_sample_by = "Height", nb_min_seq = 2), "tbl_df") expect_s3_class(compare_pairs_pq(data_fungi_low_high, bifactor = "Height", merge_sample_by = "Height", veg_index = "simpson"), "tbl_df") expect_s3_class(compare_pairs_pq(data_fungi_low_high_withNA, bifactor = "Height", merge_sample_by = "Height", modality = "Time"), "tbl_df") - expect_equal(dim(compare_pairs_pq(data_fungi_low_high_withNA, bifactor = "Height", merge_sample_by = "Height", modality = "Time")), c(4, 10)) + expect_equal(dim(compare_pairs_pq(data_fungi_low_high_withNA, bifactor = "Height", merge_sample_by = "Height", modality = "Time")), c(4, 13)) }) diff --git a/tests/testthat/test_vsearch.R b/tests/testthat/test_vsearch.R index 56d46516..021cbf77 100644 --- a/tests/testthat/test_vsearch.R +++ b/tests/testthat/test_vsearch.R @@ -12,11 +12,14 @@ path_db <- "inst/extdata/1000_sp_UNITE_sh_general_release_dynamic.fasta" suppressWarnings(vsearch_error_or_not <- try(system("vsearch 2>&1", intern = TRUE), silent = TRUE)) if (class(vsearch_error_or_not) == "try-error") { - message("vs_search_global() and asv2otu(..., method=vsearch) can't be tested when vsearch is not installed") + message("vs_search_global() and asv2otu(..., method=vsearch) can't be tested when vsearch is not installed") } else { test_that("asv2otu works fine with vsearch method", { - expect_s4_class(asv2otu(data_fungi_sp_known, method = "vsearch"), "phyloseq") + expect_s4_class(d_vs <- asv2otu(data_fungi_sp_known, method = "vsearch"), "phyloseq") + expect_s4_class(d_fast <- asv2otu(data_fungi_sp_known, method = "vsearch", vsearch_cluster_method = "--cluster_fast"), "phyloseq") expect_s3_class(asv2otu(seq_names = sequences_ex, method = "vsearch"), "data.frame") + expect_true(sum(!d_fast@refseq == d_vs@refseq) > 0) + expect_equal(sum(dim(d_vs@otu_table) == dim(d_fast@otu_table)), 2) }) test_that("vs_search_global works fine with vsearch method", { diff --git a/vignettes/Reclustering.Rmd b/vignettes/Reclustering.Rmd index d44a6c40..a0cb3e45 100644 --- a/vignettes/Reclustering.Rmd +++ b/vignettes/Reclustering.Rmd @@ -26,8 +26,8 @@ data(data_fungi_sp_known) otu <- asv2otu(data_fungi_sp_known, method = "clusterize") ``` -```{r, eval = FALSE} -otu_vs <- asv2otu(data_fungi_sp_known, method = "vsearch", vsearch_cluster_method = "") +```{r} +otu_vs <- asv2otu(data_fungi_sp_known, method = "vsearch") ``` The vsearch method requires the installation of [Vsearch](https://github.com/torognes/vsearch). @@ -39,15 +39,23 @@ summary_plot_pq(otu) ## Using lulu algorithm ([link to LULU article](https://www.nature.com/articles/s41467-017-01312-x)) -```{r, message=FALSE, results="hide", eval = FALSE} -library(devtools) -install_github("adrientaudiere/lulu") +```{r, message=FALSE, results="hide"} library("lulu") data(data_fungi_sp_known) lulu_res <- lulu_pq(data_fungi_sp_known) ``` -```{r, eval = FALSE} +```{r} summary_plot_pq(data_fungi_sp_known) summary_plot_pq(lulu_res$new_physeq) ``` + +## Tracking number of samples, sequences and clusters + +```{r} +track_wkflow(list("Raw data" = data_fungi_sp_known, + "OTU" = otu, + "OTU_vsearch" = otu_vs, + "LULU" = lulu_res[[1]]) + ) +``` diff --git a/vignettes/beta-div.Rmd b/vignettes/beta-div.Rmd index ee5c76ae..7b398030 100644 --- a/vignettes/beta-div.Rmd +++ b/vignettes/beta-div.Rmd @@ -83,9 +83,61 @@ ggvenn_pq(data_fungi, fact = "Height", taxonomic_rank = "Genus", min_nb_seq = 10 labs(title = "Share number of Genus represented by at least one ASV with more than 100 seqs") ``` +## Upset plot + +Venn diagramm can quickly become complex to read when the number of modality increase. One graphical solution is upset plot. MiscMetabar propose a solution based on the package [ComplexUpset](https://krassowski.github.io/complex-upset/). +```{r} +upset_pq(data_fungi, fact = "Height") +``` + +```{r} +upset_pq(data_fungi, fact = "Time") +``` + +`ComplexUpset` package allow powerfull configuration of you plot as you can see in the following figure. + +```{r} +upset_pq( + data_fungi, + fact = "Time", + width_ratio = 0.2, + annotations = list( + "Sequences per ASV \n (log10)" = ( + ggplot(mapping = aes(y = log10(Abundance))) + + + geom_jitter(aes( + color = + Abundance + ), na.rm = TRUE) + + + geom_violin(alpha = 0.5, na.rm = TRUE) + + theme(legend.key.size = unit(0.2, "cm")) + + theme(axis.text = element_text(size = 12)) + ), + "ASV per phylum" = ( + ggplot(mapping = aes(fill = Phylum)) + + + geom_bar() + + ylab("ASV per phylum") + + theme(legend.key.size = unit(0.2, "cm")) + + theme(axis.text = element_text(size = 12)) + ) + ) +) +``` ## Change in abundance across a factor +### Benchdamic + +There is a lot of available methods. Please refer to R package [benchdamic](https://github.com/mcalgaro93/benchdamic) for a list of method and a implementation of a benchmark for your data. + +### Library requirement for Debian Linux OS + +```sh +sudo apt-get install libgsl-dev libmpfr-dev +``` + ### Using Deseq2 package ```{r} diff --git a/vignettes/filter.Rmd b/vignettes/filter.Rmd index 70936905..5747253d 100644 --- a/vignettes/filter.Rmd +++ b/vignettes/filter.Rmd @@ -16,6 +16,7 @@ knitr::opts_chunk$set( ```{r setup, message=FALSE} library(MiscMetabar) +library(formattable) ``` ## Filter samples @@ -54,37 +55,36 @@ path_db <- system.file("extdata", "1000_sp_UNITE_sh_general_release_dynamic.fast suppressWarnings(blast_error_or_not <- try(system("blastn 2>&1", intern = TRUE), silent = TRUE)) if (class(blast_error_or_not) != "try-error") { - -df_blast_80 <- filter_asv_blast(df_basidio, fasta_for_db = path_db) -df_blast_50 <- filter_asv_blast(df_basidio, fasta_for_db = path_db, id_filter = 50, e_value_filter = 10, bit_score_filter = 20, min_cover_filter = 20) - -track_formattable <- - track_wkflow( - list( - "raw data" = df_basidio, - "id_filter = 80" = df_blast_80, - "id_filter = 50" = df_blast_50 + df_blast_80 <- filter_asv_blast(df_basidio, fasta_for_db = path_db) + df_blast_50 <- filter_asv_blast(df_basidio, fasta_for_db = path_db, id_filter = 50, e_value_filter = 10, bit_score_filter = 20, min_cover_filter = 20) + + track_formattable <- + track_wkflow( + list( + "raw data" = df_basidio, + "id_filter = 80" = df_blast_80, + "id_filter = 50" = df_blast_50 + ) ) -) } ``` ```{r} if (class(blast_error_or_not) != "try-error") { -library(formattable) -formattable(track_formattable, - list( - area(col = nb_sequences) ~ color_bar("cyan", na.rm = T), - area(col = nb_clusters) ~ normalize_bar("yellowgreen", na.rm = TRUE, min = 0.3), - area(col = nb_samples) ~ normalize_bar("lightpink", na.rm = TRUE, min = 0.3) - ) -) + formattable( + track_formattable, + list( + area(col = nb_sequences) ~ color_bar("cyan", na.rm = T), + area(col = nb_clusters) ~ normalize_bar("yellowgreen", na.rm = TRUE, min = 0.3), + area(col = nb_samples) ~ normalize_bar("lightpink", na.rm = TRUE, min = 0.3) + ) + ) } ``` ### Filter taxa using a known taxa for control -To filter out contamination, one solution is to add a proportion of a known taxa which is not present in the environment of the study. In that case we can define some threshold for each sample to discard taxon based on pseudo-abundance. In the example code above, we select taxon using the ASV_50 as control through 6 differents algorithm. +To filter out contamination, one solution is to add a proportion of a known taxa which is not present in the environment of the study. In that case we can define some threshold for each sample to discard taxon based on pseudo-abundance. In the example code above, we select taxon using the ASV_50 as control through 6 differents algorithms. ```{r, results='hide'} data(data_fungi) @@ -110,21 +110,22 @@ res_mean <- suppressWarnings(subset_taxa_tax_control(data_fungi, as.numeric(data ``` ```{r} -track_formattable <- track_wkflow(list("raw data" = data_fungi, - "cutoff_seq" = res_seq, - "cutoff_mixt" = res_mixt, - "cutoff_diff" = res_diff, - "min" = res_min, - "max" = res_max, - "mean" = res_mean - ) - ) - -formattable(track_formattable, - list( - area(col = nb_sequences) ~ color_bar("cyan"), - area(col = nb_clusters) ~ normalize_bar("yellowgreen", na.rm = TRUE, min = 0.3), - area(col = nb_samples) ~ normalize_bar("lightpink", na.rm = TRUE) - ) +track_formattable <- track_wkflow(list( + "raw data" = data_fungi, + "cutoff_seq" = res_seq, + "cutoff_mixt" = res_mixt, + "cutoff_diff" = res_diff, + "min" = res_min, + "max" = res_max, + "mean" = res_mean +)) + +formattable( + track_formattable, + list( + area(col = nb_sequences) ~ color_bar("cyan"), + area(col = nb_clusters) ~ normalize_bar("yellowgreen", na.rm = TRUE, min = 0.3), + area(col = nb_samples) ~ normalize_bar("lightpink", na.rm = TRUE) + ) ) ``` diff --git a/vignettes/import_export_track.Rmd b/vignettes/import_export_track.Rmd new file mode 100644 index 00000000..557e941f --- /dev/null +++ b/vignettes/import_export_track.Rmd @@ -0,0 +1,54 @@ +--- +title: "import_export_track" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{import_export_track} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + + + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +```{r setup, message=FALSE} +library(MiscMetabar) +library(formattable) +``` + +## Export phyloseq object + +You can export a phyloseq object to csv (and txt for phylogenetic tree) files in a folder. It is possible to export each table into one file or to merge all slot (exept phytree) in one file (args `one_file = TRUE`). Finally, if `rdata` is set to TRUE, a `rdata` file containing the phyloseq object is also writed. + +```{r, eval = FALSE} +write_pq(data_fungi, path = "fungi_phyloseq") +write_pq(data_fungi, path = "fungi_phyloseq", one_file = TRUE) +write_pq(data_fungi, path = "fungi_phyloseq", rdata = TRUE) +``` + +Finally, you can use the function `save_pq()` to write the phyloseq object in all 3 versions (one table for each slot, a file merging each slot and an Rdata file). + +```{r, eval = FALSE} +save_pq(data_fungi) +``` + +## Import + +To import a Rdata file, juste use `load()` base function. In order to import phyloseq object from a folder create using `write_pq()` or `save_pq()`, please use the function `read_pq()`. + +```{r, eval = FALSE} +d <- read_pq(path = "fungi_phyloseq") +``` + +## Tracking sequences, clusters and samples + +In bioinformatic pipeline, we often need to track the number of samples, sequences and clusters across step in the pipeline. MiscMetabar propose two utilities to achieve this goal : `track_wkflow()` and a version to compute value per samples : `track_wkflow_samples()`. The function `track_wkflow()` can deal with (i) fastq and fastg.gz files, dada-class object, derep-class object, matrix of samples x clusters (e.g. `otu_table`) and phyloseq-class object. + +```{r} +track_wkflow(list(data_fungi, data_fungi_sp_known)) +``` \ No newline at end of file