From 1110fff999e7ed958846efe459879c11b77e85c3 Mon Sep 17 00:00:00 2001 From: Salix <31168746+salix-d@users.noreply.github.com> Date: Sat, 22 Oct 2022 01:32:45 -0400 Subject: [PATCH 1/9] Update transform_filter-methods.R fixed the paste mistake in the message for removed samples (there were no space between the names) changed `unlist(data.frame())` for `rep.int()`; faster, uses less memory merge `message` functions added comment for the changes used `@` when possible replaced `badcolumns`with -(1:CN); more straightforward tested everything; all passed --- R/transform_filter-methods.R | 429 ++++++++++++++++++----------------- 1 file changed, 225 insertions(+), 204 deletions(-) diff --git a/R/transform_filter-methods.R b/R/transform_filter-methods.R index 3a6101cf..52a123af 100644 --- a/R/transform_filter-methods.R +++ b/R/transform_filter-methods.R @@ -1,15 +1,15 @@ ################################################################################ -# Function to create subsampled dataset +# Function to create subsampled dataset # in which each sample has same number of total observations/counts/reads # Note that the subsampling is random, so some noise is introduced making the # relative abundances slightly different ################################################################################ #' Resample an OTU table such that all samples have the same library size. -#' +#' #' Please note that the authors of phyloseq do not advocate using this #' as a normalization procedure, despite its recent popularity. #' Our justifications for using alternative approaches to address -#' disparities in library sizes have been made available as +#' disparities in library sizes have been made available as #' \href{http://dx.plos.org/10.1371/journal.pcbi.1003531}{an article in PLoS Computational Biology}. #' See \code{\link{phyloseq_to_deseq2}} for a recommended alternative to rarefying #' directly supported in the phyloseq package, as well as @@ -17,19 +17,19 @@ #' and \href{http://joey711.github.io/phyloseq-extensions}{the phyloseq extensions repository on GitHub}. #' Nevertheless, for comparison and demonstration, the rarefying procedure is implemented #' here in good faith and with options we hope are useful. -#' This function uses the standard R \code{\link{sample}} function to -#' resample from the abundance values +#' This function uses the standard R \code{\link{sample}} function to +#' resample from the abundance values #' in the \code{\link{otu_table}} component of the first argument, #' \code{physeq}. -#' Often one of the major goals of this procedure is to achieve parity in +#' Often one of the major goals of this procedure is to achieve parity in #' total number of counts between samples, as an alternative to other formal -#' normalization procedures, which is why a single value for the +#' normalization procedures, which is why a single value for the #' \code{sample.size} is expected. #' This kind of resampling can be performed with and without replacement, #' with replacement being the more computationally-efficient, default setting. #' See the \code{replace} parameter documentation for more details. #' We recommended that you explicitly select a random number generator seed -#' before invoking this function, or, alternatively, that you +#' before invoking this function, or, alternatively, that you #' explicitly provide a single positive integer argument as \code{rngseed}. #' #' This approach is sometimes mistakenly called ``rarefaction'', which @@ -38,15 +38,15 @@ #' \href{http://en.wikipedia.org/wiki/Rarefaction_(ecology)}{repeated sampling procedure to assess species richness}, #' first proposed in 1968 by Howard Sanders. #' In contrast, the procedure implemented here is used as an \emph{ad hoc} means to -#' normalize microbiome counts that have +#' normalize microbiome counts that have #' resulted from libraries of widely-differing sizes. #' Here we have intentionally adopted an alternative #' name, \code{rarefy}, that has also been used recently -#' to describe this process +#' to describe this process #' and, to our knowledge, not previously used in ecology. #' #' Make sure to use \code{\link{set.seed}} for exactly-reproducible results -#' of the random subsampling. +#' of the random subsampling. #' #' @param physeq (Required). A \code{\link{phyloseq-class}} object that you #' want to trim/filter. @@ -54,16 +54,16 @@ #' @param sample.size (Optional). A single integer value equal to the number #' of reads being simulated, also known as the depth, #' and also equal to each value returned by \code{\link{sample_sums}} -#' on the output. -#' -#' @param rngseed (Optional). A single integer value passed to +#' on the output. +#' +#' @param rngseed (Optional). A single integer value passed to #' \code{\link{set.seed}}, which is used to fix a seed for reproducibly #' random number generation (in this case, reproducibly random subsampling). -#' The default value is \code{711}. +#' The default value is \code{711}. #' If set to \code{FALSE}, then no fiddling with the RNG seed is performed, #' and it is up to the user to appropriately call \code{\link{set.seed}} #' beforehand to achieve reproducible results. -#' +#' #' @param replace (Optional). Logical. Whether to sample with replacement #' (\code{TRUE}) or without replacement (\code{FALSE}). #' The default is with replacement (\code{replace=TRUE}). @@ -80,28 +80,28 @@ #' Note that this default behavior was selected for computational efficiency, #' but differs from analogous functions in related packages #' (e.g. subsampling in QIIME). -#' +#' #' @param trimOTUs (Optional). \code{\link{logical}(1)}. #' Whether to trim OTUs #' from the dataset that are no longer observed in any sample -#' (have a count of zero in every sample). +#' (have a count of zero in every sample). #' The number of OTUs trimmed, if any, is printed to #' standard out as a reminder. -#' +#' #' @param verbose (Optional). Logical. Default is \code{TRUE}. #' If \code{TRUE}, extra non-warning, non-error messages are printed -#' to standard out, describing steps in the rarefying process, +#' to standard out, describing steps in the rarefying process, #' the OTUs and samples removed, etc. This can be useful the #' first few times the function is executed, but can be set #' to \code{FALSE} as-needed once behavior has been verified -#' as expected. +#' as expected. #' -#' @return An object of class \code{phyloseq}. +#' @return An object of class \code{phyloseq}. #' Only the \code{otu_table} component is modified. #' #' @seealso #' \code{\link{sample}} -#' +#' #' \code{\link{set.seed}} #' #' @export @@ -119,131 +119,153 @@ #' # GPrepT = rarefy_even_depth(GlobalPatterns, 1E5, replace=TRUE) #' ## Actually just this one is slow #' # system.time(GPrepF <- rarefy_even_depth(GlobalPatterns, 1E5, replace=FALSE)) -rarefy_even_depth <- function(physeq, sample.size=min(sample_sums(physeq)), - rngseed=FALSE, replace=TRUE, trimOTUs=TRUE, verbose=TRUE){ - - if( as(rngseed, "logical") ){ +rarefy_even_depth <- + function(physeq, + sample.size = min(sample_sums(physeq)), + rngseed = FALSE, + replace = TRUE, + trimOTUs = TRUE, + verbose = TRUE) { + + if ( !(is.integer(rngseed) || is.numeric(rngseed)) || length(rngseed) > 1) { + stop("`rngseed` must be a single integer value or FALSE") + } + + if ( !(is.logical(rngseed) && !rngseed) ) { # Now call the set.seed using the value expected in phyloseq set.seed(rngseed) - if(verbose){ + if (verbose) { # Print to screen this value - message("`set.seed(", rngseed, ")` was used to initialize repeatable random subsampling.") - message("Please record this for your records so others can reproduce.") - message("Try `set.seed(", rngseed,"); .Random.seed` for the full vector", sep="") - message("...") + message( + "`set.seed(", rngseed, ")` was used to initialize repeatable random subsampling.\n", + "Please record this for your records so others can reproduce.\n", + "Try `set.seed(", rngseed, "); .Random.seed` for the full vector.\n", + "..." + ) } - } else if(verbose){ - message("You set `rngseed` to FALSE. Make sure you've set & recorded\n", - " the random seed of your session for reproducibility.\n", - "See `?set.seed`\n") - message("...") + } else if (verbose) { + message( + "You set `rngseed` to FALSE. Make sure you've set & recorded\n", + " the random seed of your session for reproducibility.\n", + "See `?set.seed`\n", + "..." + ) } - + # Make sure sample.size is of length 1. - if( length(sample.size) > 1 ){ - warning("`sample.size` had more than one value. ", + if ( length(sample.size) > 1 ) { + warning("`sample.size` had more than one value. ", "Using only the first. \n ... \n") - sample.size <- sample.size[1] + sample.size <- sample.size[1] } - - if( sample.size <= 0 ){ - stop("sample.size less than or equal to zero. ", + + if ( !(is.integer(sample.size) || is.numeric(sample.size)) ) { + stop("`sample.size` must be a single integer value") + } else + if ( sample.size <= 0 ) { + stop("sample.size less than or equal to zero. ", "Need positive sample size to work.") } - + # Instead of warning, expected behavior now is to prune samples # that have fewer reads than `sample.size` - if( min(sample_sums(physeq)) < sample.size ){ - rmsamples = sample_names(physeq)[sample_sums(physeq) < sample.size] - if(verbose){ - message(length(rmsamples), " samples removed", + if ( min(sample_sums(physeq)) < sample.size ) { + rmsamples = sample_sums(physeq) < sample.size + if (verbose) { + n.rmsamples <- sum(rmsamples) + message(n.rmsamples, " samples removed", "because they contained fewer reads than `sample.size`.") message("Up to first five removed samples are: \n") - message(paste(rmsamples[1:min(5, length(rmsamples))], sep="\t")) + message(paste(names(rmsamples[rmsamples])[1:min(5, n.rmsamples)], "\t")) message("...") } # Now done with notifying user of pruning, actually prune. - physeq = prune_samples(setdiff(sample_names(physeq), rmsamples), physeq) + physeq = prune_samples(!rmsamples, physeq) } - # initialize the subsamples phyloseq instance, newsub - newsub <- physeq - # enforce orientation as species-are-rows, for assignment - if(!taxa_are_rows(newsub)){newsub <- t(newsub)} # apply through each sample, and replace - newotu <- apply(otu_table(newsub), 2, rarefaction_subsample, - sample.size=sample.size, replace=replace) + # instead of transposing the table : + # if taxa are rows use margin 2 (cols) else use margin 1 (rows) + newotu <- apply(physeq@otu_table, taxa_are_rows(physeq) + 1, rarefaction_subsample, + sample.size = sample.size, replace = replace) # Add OTU names to the row indices - rownames(newotu) <- taxa_names(physeq) - # replace the otu_table. - otu_table(newsub) <- otu_table(newotu, TRUE) - if(trimOTUs){ + dimnames(newotu) <- dimnames(physeq@otu_table)#.taxa_names + # replace the otu_table + physeq@otu_table <- otu_table(newotu, taxa_are_rows(physeq)) + if (trimOTUs) { # Check for and remove empty OTUs # 1. Notify user of empty OTUs being cut. # 2. Cut empty OTUs - rmtaxa = taxa_names(newsub)[taxa_sums(newsub) <= 0] - if( length(rmtaxa) > 0 ){ - if(verbose){ - message(length(rmtaxa), "OTUs were removed because they are no longer \n", + rmtaxa = taxa_sums(physeq) <= 0 + if ( sum(rmtaxa) > 0 ) { + if (verbose) { + message(sum(rmtaxa), "OTUs were removed because they are no longer \n", "present in any sample after random subsampling\n") message("...") } - newsub = prune_taxa(setdiff(taxa_names(newsub), rmtaxa), newsub) + physeq = prune_taxa(!rmtaxa, physeq) } } - # If the OTU table was transposed before rarefaction, transpose it - # back to the way it was in the original physeq object. - if(!taxa_are_rows(physeq)){newsub <- t(newsub)} - return(newsub) + return(physeq) } ################################################################################ # rarefaction subsample function, one sample ################################################################################ #' @keywords internal -rarefaction_subsample <- function(x, sample.size, replace=FALSE){ - # This is a test - # x = sample(10, 10) - # x = 1:10 - # sample.size = 50 - #system.time(obsvec <- foreach(OTUi=1:length(x), times=x, .combine=c) %do% {rep(OTUi, times)}) - # data("GlobalPatterns") - # sample.size = sample_sums(GlobalPatterns)[which.min(sample_sums(GlobalPatterns))] - # x = get_taxa(GlobalPatterns, which.max(sample_sums(GlobalPatterns))) - # Create replacement species vector - rarvec <- numeric(length(x)) - # Perform the sub-sampling. Suppress warnings due to old R compat issue. - # Also, make sure to avoid errors from x summing to zero, +# ------------------------------------------------------------------------ +# This is a test +# x = sample(10, 10) +# x = 1:10 +# sample.size = 50 +#system.time(obsvec <- foreach(OTUi=1:length(x), times=x, .combine=c) %do% {rep(OTUi, times)}) +# data("GlobalPatterns") +# sample.size = sample_sums(GlobalPatterns)[which.min(sample_sums(GlobalPatterns))] +# x = get_taxa(GlobalPatterns, which.max(sample_sums(GlobalPatterns))) +# ------------------------------------------------------------------------ +rarefaction_subsample <- function(x, sample.size, replace = FALSE) { + # Perform the sub-sampling. Suppress warnings due to old R compat issue. + # Also, make sure to avoid errors from x summing to zero, # and there are no observations to sample. - # The initialization of rarvec above is already sufficient. - if(sum(x) <= 0){ - # Protect against, and quickly return an empty vector, - # if x is already an empty count vector - return(rarvec) - } - if(replace){ - # resample with replacement - suppressWarnings(subsample <- sample(1:length(x), sample.size, replace=TRUE, prob=x)) - } else { - # resample without replacement - obsvec <- apply(data.frame(OTUi=1:length(x), times=x), 1, function(x){ - rep_len(x["OTUi"], x["times"]) - }) - obsvec <- unlist(obsvec, use.names=FALSE) - # use `sample` for subsampling. Hope that obsvec doesn't overflow. - suppressWarnings(subsample <- sample(obsvec, sample.size, replace=FALSE)) - } - # Tabulate the results (these are already named by the order in `x`) - sstab <- table(subsample) - # Assign the tabulated random subsample values to the species vector - rarvec[as(names(sstab), "integer")] <- sstab - # Return abundance vector. Let replacement happen elsewhere. - return(rarvec) + if (sum(x) > 0) { + # Protect against, and quickly return an empty vector, + # if x is already an empty count vector this part is skipped + subsample <- suppressWarnings(if (replace) { + # resample with replacement + sample(seq_along(x), + sample.size, + replace = TRUE, + prob = x) + } else { + # resample without replacement + # use `sample` for subsampling. Hope that obsvec doesn't overflow. + ## obsvec being longer than .Machine$integer.max doesn't break it + ## as long as there aren't more than .Machine$integer.max taxa, + ## it shouldn't be a problem (and if it is, tabulate throws an error) + ## could always add a check for length(x) < .Machine$integer.max + + # Use seq_along to get the indexes of x, + # then rep.int to repeat each times its abundance + ## (int vectors use less memory than list/data.frame) + obsvec <- rep.int(seq_along(x), x) + sample(obsvec, + sample.size, + replace = FALSE) + }) + # Tabulate the results (these are already named by the order in `x`) + ## `table` is a `tabulate` wrapper that remove the 0s we want, + ## so using tabulate directly + rarvec <- tabulate(subsample) + # there can be 0s missing at the end though so, need padding + rarvec <- c(rarvec, rep(0, length(x) - length(rarvec))) + } + # Return abundance vector. Let replacement happen elsewhere. + return(rarvec) } -################################################################################ +############################################################################### #' Agglomerate closely-related taxa using single-linkage clustering. -#' -#' All tips of the tree separated by a cophenetic distance smaller than +#' +#' All tips of the tree separated by a cophenetic distance smaller than #' \code{h} will be agglomerated into one taxa using \code{\link{merge_taxa}}. -#' +#' #' Can be used to create a non-trivial OTU Table, if a phylogenetic tree is available. #' #' For now, a simple, ``greedy'', single-linkage clustering is used. In future releases @@ -252,25 +274,25 @@ rarefaction_subsample <- function(x, sample.size, replace=FALSE){ #' clustering applications. #' #' @param physeq (Required). A \code{\link{phyloseq-class}}, -#' containing a phylogenetic tree. +#' containing a phylogenetic tree. #' Alternatively, a phylogenetic tree \code{\link[ape]{phylo}} will also work. #' #' @param h (Optional). Numeric scalar of the height where the tree should be cut. #' This refers to the tree resulting from hierarchical clustering #' of \code{\link[ape]{cophenetic.phylo}(phy_tree(physeq))}, #' not necessarily the original phylogenetic tree, \code{phy_tree(physeq)}. -#' Default value is \code{0.2}. +#' Default value is \code{0.2}. #' Note that this argument used to be named \code{speciationMinLength}, #' before this function/method was rewritten. -#' -#' @param hcfun (Optional). A function. +#' +#' @param hcfun (Optional). A function. #' The (agglomerative, hierarchical) clustering function to use. #' Good examples are #' \code{\link[cluster]{agnes}} and \code{\link[stats]{hclust}}. #' The default is \code{\link[cluster]{agnes}}. -#' +#' #' @param ... (Optional). Additional named arguments to pass -#' to \code{hcfun}. +#' to \code{hcfun}. #' #' @return An instance of the \code{\link{phyloseq-class}}. #' Or alternatively, a \code{\link{phylo}} object if the @@ -278,25 +300,25 @@ rarefaction_subsample <- function(x, sample.size, replace=FALSE){ #' In the expected-use case, the number of OTUs will be fewer #' (see \code{\link{ntaxa}}), #' after merging OTUs that are related enough to be called -#' the same OTU. +#' the same OTU. +#' +#' @seealso #' -#' @seealso -#' #' \code{\link{merge_taxa}} -#' +#' #' \code{\link[cluster]{agnes}} -#' +#' #' \code{\link[stats]{hclust}} -#' +#' #' \code{\link[ape]{cophenetic.phylo}} -#' +#' #' \code{\link[ape]{phylo}} #' #' @importFrom cluster agnes #' #' @export #' -#' @examples +#' @examples #' data("esophagus") #' # for speed #' esophagus = prune_taxa(taxa_names(esophagus)[1:25], esophagus) @@ -316,8 +338,8 @@ tip_glom = function(physeq, h=0.2, hcfun=agnes, ...){ ################################################################################ #' Agglomerate taxa of the same type. #' -#' This method merges species that have the same taxonomy at a certain -#' taxonomic rank. +#' This method merges species that have the same taxonomy at a certain +#' taxonomic rank. #' Its approach is analogous to \code{\link{tip_glom}}, but uses categorical data #' instead of a tree. In principal, other categorical data known for all taxa #' could also be used in place of taxonomy, @@ -341,31 +363,31 @@ tip_glom = function(physeq, h=0.2, hcfun=agnes, ...){ #' CAUTION. The decision to prune (or not) taxa for which you lack categorical #' data could have a large effect on downstream analysis. You may want to #' re-compute your analysis under both conditions, or at least think carefully -#' about what the effect might be and the reasons explaining the absence of -#' information for certain taxa. In the case of taxonomy, it is often a result +#' about what the effect might be and the reasons explaining the absence of +#' information for certain taxa. In the case of taxonomy, it is often a result #' of imprecision in taxonomic designation based on short phylogenetic sequences #' and a patchy system of nomenclature. If this seems to be an issue for your #' analysis, think about also trying the nomenclature-agnostic \code{\link{tip_glom}} #' method if you have a phylogenetic tree available. #' #' @param bad_empty (Optional). Character vector. Default: \code{c(NA, "", " ", "\t")}. -#' Defines the bad/empty values +#' Defines the bad/empty values #' that should be ignored and/or considered unknown. They will be removed #' from the internal agglomeration vector derived from the argument to \code{tax}, #' and therefore agglomeration will not combine taxa according to the presence #' of these values in \code{tax}. Furthermore, the corresponding taxa can be #' optionally pruned from the output if \code{NArm} is set to \code{TRUE}. -#' +#' #' @return A taxonomically-agglomerated, optionally-pruned, object with class matching #' the class of \code{physeq}. #' #' @seealso #' \code{\link{tip_glom}} -#' +#' #' \code{\link{prune_taxa}} -#' +#' #' \code{\link{merge_taxa}} -#' +#' #' @export #' #' @examples @@ -387,17 +409,17 @@ tax_glom <- function(physeq, taxrank=rank_names(physeq)[1], if( is.null(access(physeq, "tax_table")) ){ stop("The tax_glom() function requires that physeq contain a taxonomyTable") } - + # Error if bad taxrank if( !taxrank[1] %in% rank_names(physeq) ){ - stop("Bad taxrank argument. Must be among the values of rank_names(physeq)") + stop("Bad taxrank argument. Must be among the values of rank_names(physeq)") } # Make a vector from the taxonomic data. CN <- which( rank_names(physeq) %in% taxrank[1] ) tax <- as(access(physeq, "tax_table"), "matrix")[, CN] - - # if NArm is TRUE, remove the empty, white-space, NA values from + + # if NArm is TRUE, remove the empty, white-space, NA values from if( NArm ){ keep_species <- names(tax)[ !(tax %in% bad_empty) ] physeq <- prune_taxa(keep_species, physeq) @@ -406,33 +428,32 @@ tax_glom <- function(physeq, taxrank=rank_names(physeq)[1], # Concatenate data up to the taxrank column, use this for agglomeration tax <- as(access(physeq, "tax_table"), "matrix")[, 1:CN, drop=FALSE] tax <- apply(tax, 1, function(i){paste(i, sep=";_;", collapse=";_;")}) - + # Remove NAs and useless from the vector/factor for looping. # This does not remove the taxa that have an unknown (NA) # taxonomic designation at this particular taxonomic rank. tax <- tax[ !(tax %in% bad_empty) ] - + # Define the OTU cliques to loop through spCliques <- tapply(names(tax), factor(tax), list) - + # Successively merge taxa in physeq. for( i in names(spCliques)){ physeq <- merge_taxa(physeq, spCliques[[i]]) } - + # "Empty" the values to the right of the rank, using NA_character_. if( CN < length(rank_names(physeq)) ){ - badcolumns <- (CN+1):length(rank_names(physeq)) - tax_table(physeq)[, badcolumns] <- NA_character_ + tax_table(physeq)[, -(1:CN)] <- NA_character_ } - + # Return. return(physeq) } ################################################################################ ################################################################################ #' Prune unwanted OTUs / taxa from a phylogenetic object. -#' +#' #' An S4 Generic method for removing (pruning) unwanted OTUs/taxa from phylogenetic #' objects, including phylo-class trees, as well as native phyloseq package #' objects. This is particularly useful for pruning a phyloseq object that has @@ -446,7 +467,7 @@ tax_glom <- function(physeq, taxrank=rank_names(physeq)[1], #' keep -- OR alternatively -- a logical vector where the kept taxa are TRUE, and length #' is equal to the number of taxa in object x. If \code{taxa} is a named #' logical, the taxa retained are based on those names. Make sure they are -#' compatible with the \code{taxa_names} of the object you are modifying (\code{x}). +#' compatible with the \code{taxa_names} of the object you are modifying (\code{x}). #' #' @param x (Required). A phylogenetic object, including \code{phylo} trees, #' as well as all phyloseq classes that represent taxa. If the function @@ -457,7 +478,7 @@ tax_glom <- function(physeq, taxrank=rank_names(physeq)[1], #' the class of the argument, \code{x}. #' #' @seealso -#' +#' #' \code{\link{prune_samples}} #' #' \href{http://cran.at.r-project.org/web/packages/picante/index.html}{prune.sample} @@ -468,8 +489,8 @@ tax_glom <- function(physeq, taxrank=rank_names(physeq)[1], #' data("esophagus") #' esophagus #' plot(sort(taxa_sums(esophagus), TRUE), type="h", ylim=c(0, 50)) -#' x1 = prune_taxa(taxa_sums(esophagus) > 10, esophagus) -#' x2 = prune_taxa(names(sort(taxa_sums(esophagus), TRUE))[1:9], esophagus) +#' x1 = prune_taxa(taxa_sums(esophagus) > 10, esophagus) +#' x2 = prune_taxa(names(sort(taxa_sums(esophagus), TRUE))[1:9], esophagus) #' identical(x1, x2) setGeneric("prune_taxa", function(taxa, x) standardGeneric("prune_taxa")) #' @aliases prune_taxa,NULL,ANY-method @@ -488,7 +509,7 @@ setMethod("prune_taxa", signature("logical", "ANY"), function(taxa, x){ stop("logical argument to taxa is wrong length. Should equal ntaxa(x)") } else { # Pass on to names-based prune_taxa method - return( prune_taxa(taxa_names(x)[taxa], x) ) + return( prune_taxa(taxa_names(x)[taxa], x) ) } }) #' @importFrom ape drop.tip @@ -502,7 +523,7 @@ setMethod("prune_taxa", signature("character", "phylo"), function(taxa, x){ } else if( setequal(taxa, taxa_names(x)) ){ return(x) } else { - return( drop.tip(x, setdiff(taxa_names(x), taxa)) ) + return( drop.tip(x, setdiff(taxa_names(x), taxa)) ) } }) #' @aliases prune_taxa,character,otu_table-method @@ -541,7 +562,7 @@ setMethod("prune_taxa", signature("character", "phyloseq"), function(taxa, x){ } if( !is.null(x@refseq) ){ x@refseq = prune_taxa(taxa, refseq(x)) - } + } # Force index order after pruning to be the same, # according to the same rules as in the constructor, phyloseq() x = index_reorder(x, index_type="taxa") @@ -566,7 +587,7 @@ setMethod("prune_taxa", signature("character", "XStringSet"), function(taxa, x){ return(x) } else if( length(intersect(taxa, taxa_names(x))) == 0 ){ # Informative error if intersection is zero. - stop("prune_taxa,XStringSet: taxa and taxa_names(x) do not overlap.") + stop("prune_taxa,XStringSet: taxa and taxa_names(x) do not overlap.") } else { # Pop the OTUs that are not in `taxa`, without reordering. return(x[-which(!taxa_names(x) %in% taxa)]) @@ -575,7 +596,7 @@ setMethod("prune_taxa", signature("character", "XStringSet"), function(taxa, x){ ################################################################################ ################################################################################ #' Define a subset of samples to keep in a phyloseq object. -#' +#' #' An S4 Generic method for pruning/filtering unwanted samples #' by defining those you want to keep. #' @@ -585,7 +606,7 @@ setMethod("prune_taxa", signature("character", "XStringSet"), function(taxa, x){ #' keep -- OR alternatively -- a logical vector where the kept samples are TRUE, and length #' is equal to the number of samples in object x. If \code{samples} is a named #' logical, the samples retained is based on those names. Make sure they are -#' compatible with the \code{sample_names} of the object you are modifying (\code{x}). +#' compatible with the \code{sample_names} of the object you are modifying (\code{x}). #' #' @param x A phyloseq object. #' @@ -593,7 +614,7 @@ setMethod("prune_taxa", signature("character", "XStringSet"), function(taxa, x){ #' the class of the phyloseq object, \code{x}. #' #' @seealso \code{\link{subset_samples}} -#' +#' #' @rdname prune_samples-methods #' @docType methods #' @export @@ -627,7 +648,7 @@ setMethod("prune_samples", signature("character", "sample_data"), function(sampl # If the sets of `samples` and sample_names are the same, return as-is. return(x) } else { - samples = intersect(samples, sample_names(x)) + samples = intersect(samples, sample_names(x)) return(x[samples, ]) } }) @@ -638,7 +659,7 @@ setMethod("prune_samples", signature("character", "phyloseq"), function(samples, samples = intersect(intersect_samples(x), samples) # Now prune each component. # All phyloseq objects have an otu_table slot, no need to test for existence. - x@otu_table = prune_samples(samples, otu_table(x)) + x@otu_table = prune_samples(samples, otu_table(x)) if( !is.null(x@sam_data) ){ # protect missing sample_data component. Don't need to prune if empty x@sam_data = prune_samples(samples, sample_data(x)) @@ -646,9 +667,9 @@ setMethod("prune_samples", signature("character", "phyloseq"), function(samples, # Force sample index order after pruning to be the same, # according to the same rules as in the constructor, phyloseq() x = index_reorder(x, index_type="samples") - return(x) + return(x) }) -# A logical should specify the samples to keep, or not. Have same length as nsamples(x) +# A logical should specify the samples to keep, or not. Have same length as nsamples(x) #' @aliases prune_samples,logical,ANY-method #' @rdname prune_samples-methods setMethod("prune_samples", signature("logical", "ANY"), function(samples, x){ @@ -657,28 +678,28 @@ setMethod("prune_samples", signature("logical", "ANY"), function(samples, x){ stop("logical argument to samples is wrong length. Should equal nsamples(x)") } else { # Pass on to names-based prune_samples method - return( prune_samples(sample_names(x)[samples], x) ) + return( prune_samples(sample_names(x)[samples], x) ) } }) ################################################################################ #' Thresholded rank transformation. -#' +#' #' The lowest \code{thresh} values in \code{x} all get the value 'thresh'. #' #' @usage threshrank(x, thresh, keep0s=FALSE, ...) #' #' @param x (Required). Numeric vector to transform. #' @param thresh A single numeric value giving the threshold. -#' @param keep0s A logical determining whether 0's in \code{x} should remain +#' @param keep0s A logical determining whether 0's in \code{x} should remain #' a zero-value in the output. If FALSE, zeros are treated as any other value. #' @param ... Further arguments passes to the \code{\link{rank}} function. -#' +#' #' @return A ranked, (optionally) thresholded numeric vector with length equal to #' \code{x}. Default arguments to \code{rank} are used, unless provided as -#' additional arguments. +#' additional arguments. #' #' @seealso \code{\link{transform_sample_counts}}, \code{\link{rank}}, \code{\link{threshrankfun}} -#' @export +#' @export #' @examples # #' (a_vector <- sample(0:10, 100, TRUE)) #' threshrank(a_vector, 5, keep0s=TRUE) @@ -701,20 +722,20 @@ threshrank <- function(x, thresh, keep0s=FALSE, ...){ #################################################################################### #' A closure version of the \code{threshrank} function. #' -#' Takes the same arguments as \code{\link{threshrank}}, except for \code{x}, -#' because the output is a single-argument function rather than a rank-transformed numeric. +#' Takes the same arguments as \code{\link{threshrank}}, except for \code{x}, +#' because the output is a single-argument function rather than a rank-transformed numeric. #' This is useful for higher-order functions that require a single-argument function as input, #' like \code{\link{transform_sample_counts}}. #' #' @usage threshrankfun(thresh, keep0s=FALSE, ...) -#' +#' #' @param thresh A single numeric value giving the threshold. -#' @param keep0s A logical determining whether 0's in \code{x} should remain +#' @param keep0s A logical determining whether 0's in \code{x} should remain #' a zero-value in the output. If FALSE, zeros are treated as any other value. #' @param ... Further arguments passes to the \code{\link{rank}} function. -#' +#' #' @return A single-argument function with the options to \code{\link{threshrank}} set. -#' +#' #' @seealso \code{\link{transform_sample_counts}}, \code{\link{threshrankfun}}, #' \code{\link{threshrank}} #' @export @@ -768,28 +789,28 @@ setMethod("t", signature("phyloseq"), function(x){ }) ################################################################################ #' Transform abundance data in an \code{otu_table}, sample-by-sample. -#' +#' #' This function transforms the sample counts of a taxa #' abundance matrix according to a user-provided function. -#' The counts of each sample will be transformed individually. No sample-sample -#' interaction/comparison is possible by this method. +#' The counts of each sample will be transformed individually. No sample-sample +#' interaction/comparison is possible by this method. #' #' @usage transform_sample_counts(physeq, fun, ...) #' #' @param physeq (Required). \code{\link{phyloseq-class}} of \code{\link{otu_table-class}}. #' #' @param fun (Required). A single-argument function that will be applied -#' to the abundance counts of each sample. +#' to the abundance counts of each sample. #' Can be an anonymous \code{\link[base]{function}}. -#' -#' @param ... (Optional). Additional, optionally-named, arguments passed to +#' +#' @param ... (Optional). Additional, optionally-named, arguments passed to #' \code{fun} during transformation of abundance data. -#' +#' #' @return A transformed \code{otu_table} -- or \code{phyloseq} object with its -#' transformed \code{otu_table}. -#' In general, trimming is not expected by this +#' transformed \code{otu_table}. +#' In general, trimming is not expected by this #' method, so it is suggested that the user provide only functions that return -#' a full-length vector. Filtering/trimming can follow, for which the +#' a full-length vector. Filtering/trimming can follow, for which the #' \code{\link{genefilter_sample}} and \code{\link{prune_taxa}} functions #' are suggested. #' @@ -847,7 +868,7 @@ transformSampleCounts <- transform_sample_counts #################################################################################### ############################################################ #' Filter OTUs with arbitrary function, sample-wise. -#' +#' #' A general OTU trimming function for selecting OTUs that satisfy #' some criteria within the distribution of each sample, and then #' also an additional criteria for number of samples that must pass. @@ -855,11 +876,11 @@ transformSampleCounts <- transform_sample_counts #' criteria. The number of acceptable samples is used #' as the final criteria (set by the argument \code{A}) #' to determine whether or not the taxa should -#' be retained (\code{TRUE}) or not (\code{FALSE}). Just like with genefilter, a +#' be retained (\code{TRUE}) or not (\code{FALSE}). Just like with genefilter, a #' logical having length equal to nrow()/\code{\link{ntaxa}} is returned, indicating which #' should be kept. This output can be provided #' directly to OTU trimming function, \code{\link{prune_taxa}}. -#' By contrast, \code{\link[genefilter]{genefilter}}, +#' By contrast, \code{\link[genefilter]{genefilter}}, #' of the genefilter package in Bioconductor, #' works only on the rows of a matrix. Note that, because \code{\link{otu_table-class}} #' inherits directly from the \code{\link{matrix-class}}, an unmodified @@ -895,7 +916,7 @@ transformSampleCounts <- transform_sample_counts #' ## wh2 <- c(TRUE, TRUE, TRUE, FALSE, FALSE) #' ## prune_taxa(wh1, testOTU) #' ## prune_taxa(wh2, testOTU) -#' ## +#' ## #' ## tax_table1 <- tax_table(matrix("abc", 5, 5)) #' ## prune_taxa(wh1, tax_table1) #' ## prune_taxa(wh2, tax_table1) @@ -926,17 +947,17 @@ setMethod("genefilter_sample", signature("phyloseq"), function(X, flist, A=1){ #' #' See the \code{\link[genefilter]{filterfun}}, from the Bioconductor repository, #' for a taxa-/gene-wise filter (and further examples). -#' +#' #' @usage filterfun_sample(...) #' #' @param ... A comma-separated list of functions. -#' -#' @return An enclosure (function) that itself will return a logical vector, +#' +#' @return An enclosure (function) that itself will return a logical vector, #' according to the #' functions provided in the argument list, evaluated in order. The output of -#' filterfun_sample is appropriate for the `flist' argument to the +#' filterfun_sample is appropriate for the `flist' argument to the #' genefilter_sample method. -#' +#' #' @export #' @seealso \code{\link[genefilter]{filterfun}}, \code{\link{genefilter_sample}} #' @examples @@ -978,7 +999,7 @@ filterfun_sample = function(...){ #' indicating whether or not each OTU passed the criteria. #' Alternatively, if the \code{"prune"} option is set to \code{TRUE}, #' it returns the already-trimmed version of the phyloseq object. -#' +#' #' @usage filter_taxa(physeq, flist, prune=FALSE) #' #' @param physeq (Required). A \code{\link{phyloseq-class}} object that you @@ -991,25 +1012,25 @@ filterfun_sample = function(...){ #' @param prune (Optional). A logical. Default \code{FALSE}. If \code{TRUE}, then #' the function returns the pruned \code{\link{phyloseq-class}} object, rather #' than the logical vector of taxa that passed the filter. -#' +#' #' @return A logical vector equal to the number of taxa in \code{physeq}. #' This can be provided directly to \code{\link{prune_taxa}} as first argument. -#' Alternatively, if \code{prune==TRUE}, the pruned \code{\link{phyloseq-class}} +#' Alternatively, if \code{prune==TRUE}, the pruned \code{\link{phyloseq-class}} #' object is returned instead. -#' +#' #' @export -#' @seealso +#' @seealso #' \code{\link[genefilter]{filterfun}}, #' \code{\link{genefilter_sample}}, #' \code{\link{filterfun_sample}} -#' +#' #' @examples #' data("enterotype") #' require("genefilter") #' flist <- filterfun(kOverA(5, 2e-05)) #' ent.logi <- filter_taxa(enterotype, flist) #' ent.trim <- filter_taxa(enterotype, flist, TRUE) -#' identical(ent.trim, prune_taxa(ent.logi, enterotype)) +#' identical(ent.trim, prune_taxa(ent.logi, enterotype)) #' identical(sum(ent.logi), ntaxa(ent.trim)) #' filter_taxa(enterotype, flist, TRUE) filter_taxa <- function(physeq, flist, prune=FALSE){ @@ -1027,11 +1048,11 @@ filter_taxa <- function(physeq, flist, prune=FALSE){ if( ntaxa(physeq) != length(ans) ){ stop("Logic error in applying function(s). Logical result not same length as ntaxa(physeq)") } - # Now return logical or pruned phyloseq-class instance. + # Now return logical or pruned phyloseq-class instance. if( prune ){ return( prune_taxa(ans, physeq) ) } else { - return( ans ) + return( ans ) } } ################################################################################ @@ -1050,7 +1071,7 @@ filter_taxa <- function(physeq, flist, prune=FALSE){ #' \code{\link{topp}}, \code{\link{rm_outlierf}} #' #' @export -#' +#' #' @examples #' ## Use simulated abundance matrix #' set.seed(711) @@ -1121,7 +1142,7 @@ topp <- function(p, na.rm=TRUE){ #' \code{\link{topp}}, \code{\link{rm_outlierf}} #' #' @export -#' +#' #' @examples #' t1 <- 1:10; names(t1)<-paste("t", 1:10, sep="") #' topf(0.6)(t1) @@ -1174,7 +1195,7 @@ topf <- function(f, na.rm=TRUE){ #' (wh1 <- genefilter_sample(testOTU, f1, A=1)) #' wh2 <- c(TRUE, TRUE, TRUE, FALSE, FALSE) #' prune_taxa(wh1, testOTU) -#' prune_taxa(wh2, testOTU) +#' prune_taxa(wh2, testOTU) rm_outlierf <- function(f, na.rm=TRUE){ function(x){ if(na.rm){ From c54f3ab4bb4a71c13568fffa0d69ea4df59def0c Mon Sep 17 00:00:00 2001 From: Salix <31168746+salix-d@users.noreply.github.com> Date: Sat, 22 Oct 2022 01:34:15 -0400 Subject: [PATCH 2/9] update deprecated functions in test files had lots of warnings about `is_true` and `is_false` being deprecated so I changed the syntax for `expect_true` and `expect_false` --- tests/testthat/test-IO.R | 88 +++++++++++++++++----------------- tests/testthat/test-phyloseq.R | 86 ++++++++++++++++----------------- tests/testthat/test-subset.R | 56 +++++++++++----------- 3 files changed, 115 insertions(+), 115 deletions(-) diff --git a/tests/testthat/test-IO.R b/tests/testthat/test-IO.R index 94400c4b..760907fe 100644 --- a/tests/testthat/test-IO.R +++ b/tests/testthat/test-IO.R @@ -10,7 +10,7 @@ mothlist <- system.file("extdata", "esophagus.fn.list.gz", package="phyloseq") mothgroup <- system.file("extdata", "esophagus.good.groups.gz", package="phyloseq") mothtree <- system.file("extdata", "esophagus.tree.gz", package="phyloseq") cutoff <- "0.10" -esophman <- import_mothur(mothlist, mothgroup, mothtree, cutoff) +esophman <- import_mothur(mothlist, mothgroup, mothtree, cutoff) # mothur "Shared" file, create with mothur from these example data files mothshared = system.file("extdata", "esophagus.fn.shared.gz", package="phyloseq") constaxonomy = system.file("extdata", "mothur_example.cons.taxonomy.gz", package="phyloseq") @@ -26,7 +26,7 @@ test_that("import_mothur: The two phyloseq objects, example and just-imported, a test_that("import_mothur: Test mothur file import on the (esophagus data).", { smlc <- show_mothur_cutoffs(mothlist) - expect_that(smlc, is_equivalent_to(c("unique", "0.00", "0.01", "0.02", "0.03", "0.04", "0.05", "0.06", "0.07", "0.08", "0.09", "0.10"))) + expect_that(smlc, is_equivalent_to(c("unique", "0.00", "0.01", "0.02", "0.03", "0.04", "0.05", "0.06", "0.07", "0.08", "0.09", "0.10"))) }) test_that("import_mothur: abundances can be manipulated mathematically", { @@ -45,7 +45,7 @@ test_that("import_mothur: Expected classes of non-empty components", { }) test_that("import_mothur: imported files become S4 object", { - expect_that(isS4(esophman), is_true()) + expect_true(isS4(esophman)) }) test_that("import_mothur: show method output tests", { @@ -70,7 +70,7 @@ test_that("the import_RDP_otu function can properly read gzipped-example", { otufile <- system.file("extdata", "rformat_dist_0.03.txt.gz", package="phyloseq") - ex_otu <- import_RDP_otu(otufile) + ex_otu <- import_RDP_otu(otufile) # test expectations expect_output(print(head(t(ex_otu))), "OTU Table:") expect_is(ex_otu, "otu_table") @@ -97,7 +97,7 @@ test_that("Classes of components are as expected", { expect_is(otu_table(t0), ("otu_table")) expect_is(tax_table(t0), ("taxonomyTable")) expect_is(sample_data(t0), ("sample_data")) - expect_is(phy_tree(t0), ("phylo")) + expect_is(phy_tree(t0), ("phylo")) expect_is(refseq(t0), ("DNAStringSet")) }) @@ -114,29 +114,29 @@ test_that("Features of the abundance data are consistent, match known values", { test_that("Features of the taxonomy table match expected values", { expect_equal(length(rank_names(t0)), (7L)) - expect_equal(rank_names(t0), + expect_equal(rank_names(t0), c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")) tax53 = as(tax_table(t0), "matrix")[53, ] expect_equivalent( - tax53, + tax53, c("Bacteria", "Proteobacteria", "Deltaproteobacteria", - "Desulfovibrionales", "Desulfomicrobiaceae", + "Desulfovibrionales", "Desulfomicrobiaceae", "Desulfomicrobium", "Desulfomicrobiumorale")) }) ################################################################################ # parse function tests - note, these are also used by import_biom test_that("Taxonomy vector parsing functions behave as expected", { - + chvec1 = c("Bacteria", "Proteobacteria", "Gammaproteobacteria", "Enterobacteriales", "Enterobacteriaceae", "Escherichia") - + chvec2 = c("k__Bacteria", "p__Proteobacteria", "c__Gammaproteobacteria", "o__Enterobacteriales", "f__Enterobacteriaceae", "g__Escherichia", "s__") - + chvec3 = c("Root", "k__Bacteria", "p__Firmicutes", "c__Bacilli", "o__Bacillales", "f__Staphylococcaceae") - + # Example where only some entries have greengenes prefix. chvec4 = c("Root", "k__Bacteria", "Firmicutes", "c__Bacilli", "o__Bacillales", "Staphylococcaceae", "z__mistake") @@ -144,34 +144,34 @@ test_that("Taxonomy vector parsing functions behave as expected", { # Even more terrible example, where leading or trailing space characters included # (the exact weirdnes of chvec4, compounded by leading and/or trailing space characters) chvec5 = c(" Root \n ", " k__Bacteria", " Firmicutes", " c__Bacilli ", - "o__Bacillales ", "Staphylococcaceae ", "\t z__mistake \t\n") + "o__Bacillales ", "Staphylococcaceae ", "\t z__mistake \t\n") # This should give a warning because there were no greengenes prefixes expect_warning(t1 <- parse_taxonomy_greengenes(chvec1)) # And output from previous call, t1, should be identical to default expect_that(parse_taxonomy_default(chvec1), is_equivalent_to(t1)) - + # All the greengenes entries get trimmed by parse_taxonomy_greengenes - expect_that(all(sapply(chvec2, nchar) > sapply(parse_taxonomy_greengenes(chvec2), nchar)), is_true()) + expect_true(all(sapply(chvec2, nchar) > sapply(parse_taxonomy_greengenes(chvec2), nchar))) # None of the greengenes entries are trimmed by parse_taxonomy_default - expect_that(any(sapply(chvec2, nchar) > sapply(parse_taxonomy_default(chvec2), nchar)), is_false()) - + expect_false(any(sapply(chvec2, nchar) > sapply(parse_taxonomy_default(chvec2), nchar))) + # Check that the "Root" element is not removed by parse_taxonomy_greengenes and parse_taxonomy_default. - expect_that("Root" %in% chvec3, is_true()) - expect_that("Root" %in% parse_taxonomy_default(chvec3), is_true()) - expect_that(length(parse_taxonomy_default(chvec3)) == length(chvec3), is_true()) - + expect_true("Root" %in% chvec3) + expect_true("Root" %in% parse_taxonomy_default(chvec3)) + expect_true(length(parse_taxonomy_default(chvec3)) == length(chvec3)) + # Check that non-greengenes prefixes, and those w/o prefixes, are given dummy rank(s) chvec4ranks = names(parse_taxonomy_greengenes(chvec4)) expect_that(grep("Rank", chvec4ranks, fixed=TRUE), is_equivalent_to(c(1, 3, 6, 7))) # Check that everything given dummy rank in default parse. chvec4ranks = names(parse_taxonomy_default(chvec4)) expect_that(grep("Rank", chvec4ranks, fixed=TRUE), is_equivalent_to(1:7)) - + # chvec4 and chvec5 result in identical vectors. expect_that(parse_taxonomy_default(chvec4), is_equivalent_to(parse_taxonomy_default(chvec5))) - expect_that(parse_taxonomy_greengenes(chvec4), is_equivalent_to(parse_taxonomy_greengenes(chvec5))) - + expect_that(parse_taxonomy_greengenes(chvec4), is_equivalent_to(parse_taxonomy_greengenes(chvec5))) + # The names of chvec5, greengenes parsed, should be... correct5names = c("Rank1", "Kingdom", "Rank3", "Class", "Order", "Rank6", "Rank7") expect_that(names(parse_taxonomy_greengenes(chvec5)), is_equivalent_to(correct5names)) @@ -198,7 +198,7 @@ test_that("Importing biom files yield phyloseq objects", { expect_is(rich_dense, ("phyloseq")) expect_is(rich_sparse, ("phyloseq")) - + expect_equal(ntaxa(rich_dense), (5L)) expect_equal(ntaxa(rich_sparse), (5L)) @@ -221,7 +221,7 @@ test_that("The different types of biom files yield phyloseq objects", { rich_sparse = import_biom(rich_sparse_biom, treefilename, refseqfilename, parseFunction=parse_taxonomy_greengenes) min_dense = import_biom(min_dense_biom, treefilename, refseqfilename, parseFunction=parse_taxonomy_greengenes) min_sparse = import_biom(min_sparse_biom, treefilename, refseqfilename, parseFunction=parse_taxonomy_greengenes) - + expect_is(rich_dense, ("phyloseq")) expect_is(rich_sparse, ("phyloseq")) expect_is(min_dense, ("phyloseq")) @@ -230,7 +230,7 @@ test_that("The different types of biom files yield phyloseq objects", { expect_equal(ntaxa(rich_dense), (5L)) expect_equal(ntaxa(rich_sparse), (5L)) expect_equal(ntaxa(min_dense), (5L)) - expect_equal(ntaxa(min_sparse), (5L)) + expect_equal(ntaxa(min_sparse), (5L)) # # Component classes # sample_data @@ -243,8 +243,8 @@ test_that("The different types of biom files yield phyloseq objects", { expect_is(access(rich_dense, "tax_table"), ("taxonomyTable")) expect_is(access(rich_sparse, "tax_table"), ("taxonomyTable")) expect_is(access(min_dense, "tax_table"), ("NULL")) - expect_is(access(min_sparse, "tax_table"), ("NULL")) - + expect_is(access(min_sparse, "tax_table"), ("NULL")) + # phylo tree expect_is(access(rich_dense, "phy_tree"), ("phylo")) expect_is(access(rich_sparse, "phy_tree"), ("phylo")) @@ -259,14 +259,14 @@ test_that("The different types of biom files yield phyloseq objects", { expect_is(access(rich_dense, "refseq"), ("DNAStringSet")) expect_is(access(rich_sparse, "refseq"), ("DNAStringSet")) expect_is(access(min_dense, "refseq"), ("DNAStringSet")) - expect_is(access(min_sparse, "refseq"), ("DNAStringSet")) - - # otu_table + expect_is(access(min_sparse, "refseq"), ("DNAStringSet")) + + # otu_table expect_is(access(rich_dense, "otu_table"), ("otu_table")) expect_is(access(rich_sparse, "otu_table"), ("otu_table")) expect_is(access(min_dense, "otu_table"), ("otu_table")) expect_is(access(min_sparse, "otu_table"), ("otu_table")) - + # Compare values in the otu_table. For some reason the otu_tables are not identical # one position is plus-two, another is minus-two combrich <- c(access(rich_dense, "otu_table"), access(rich_sparse, "otu_table")) @@ -283,7 +283,7 @@ test_that("The different types of biom files yield phyloseq objects", { # Compare values in the sample_data expect_equivalent(access(rich_dense, "sam_data"), (access(rich_sparse, "sam_data"))) - + # Compare values in the taxonomyTable expect_equivalent(access(rich_dense, "tax_table"), (access(rich_sparse, "tax_table"))) @@ -291,7 +291,7 @@ test_that("The different types of biom files yield phyloseq objects", { test_that("the import_biom and import(\"biom\", ) syntax give same result", { x1 <- import_biom(rich_dense_biom, parseFunction=parse_taxonomy_greengenes) - x2 <- import("biom", BIOMfilename=rich_dense_biom, parseFunction=parse_taxonomy_greengenes) + x2 <- import("biom", BIOMfilename=rich_dense_biom, parseFunction=parse_taxonomy_greengenes) expect_equivalent(x1, x2) }) ################################################################################ @@ -303,7 +303,7 @@ test_that("The read_tree function works as expected:", { expect_equal(ntaxa(GPNewick), 500L) expect_equal(GPNewick$Nnode, 499L) expect_equivalent(taxa_names(GPNewick), GPNewick$tip.label) - # Now read a nexus tree... + # Now read a nexus tree... # Some error-handling expectations expect_that(read_tree("alskflsakjsfskfhas.akshfaksj"), gives_warning()) # file not exist not_tree <- system.file("extdata", "esophagus.good.groups.gz", package="phyloseq") @@ -328,7 +328,7 @@ test_that("The specialized read_tree_greengenes function works:", { ################################################################################ # microbio_me_qiime tests # This tests different features and expected behavior for -# the functioning of an interface function to the +# the functioning of an interface function to the # microbio.me/qiime data repository. # zipfile = "study_816_split_library_seqs_and_mapping.zip" @@ -339,9 +339,9 @@ tarps = suppressWarnings(microbio_me_qiime(tarfile)) zipps = suppressWarnings(microbio_me_qiime(zipfile)) # This function is intended to interface with an external server, # as described in the documentation. -# However, I don't want successful testing of this package to -# rely on the presence or form of particular files on an -# external server, so these tests will be done exclusively on +# However, I don't want successful testing of this package to +# rely on the presence or form of particular files on an +# external server, so these tests will be done exclusively on # compressed file(s) representing what is exposed by the data server # It is up to the user to provide valid a URL in practice, # and the function attempts to provide informative status @@ -353,14 +353,14 @@ test_that("The microbio_me_qiime imports as expected: .tar.gz", { expect_identical(nrow(otu_table(tarps)), 50L) expect_identical(nrow(sample_data(tarps)), 15L) }) -test_that("The microbio_me_qiime imports as expected: .zip", { +test_that("The microbio_me_qiime imports as expected: .zip", { expect_is(zipps, "phyloseq") expect_is(sample_data(zipps, errorIfNULL=FALSE), "sample_data") expect_is(otu_table(zipps, errorIfNULL=FALSE), "otu_table") expect_identical(nrow(otu_table(zipps)), 50L) expect_identical(nrow(sample_data(zipps)), 15L) }) -test_that("Results of .tar.gz and .zip should be identical", { +test_that("Results of .tar.gz and .zip should be identical", { expect_identical(tarps, zipps) expect_identical(sample_data(tarps), sample_data(zipps)) expect_identical(otu_table(tarps), otu_table(zipps)) @@ -370,7 +370,7 @@ test_that("Results of .tar.gz and .zip should be identical", { ################################################################################ usearchfile = system.file("extdata", "usearch.uc", package="phyloseq") OTU1 = import_usearch_uc(usearchfile) -test_that("import_usearch_uc: Properly omit entries from failed search", { +test_that("import_usearch_uc: Properly omit entries from failed search", { ucLines = readLines(usearchfile) expect_identical( sum(OTU1), (length(ucLines) - length(grep("*", ucLines, fixed=TRUE))) ) expect_identical( nrow(OTU1), 37L) @@ -379,4 +379,4 @@ test_that("import_usearch_uc: Properly omit entries from failed search", { expect_identical( ncol(OTU1), 33L) expect_equivalent(colSums(OTU1)[1:6], c(6, 1, 2, 1, 1, 1)) }) -################################################################################ \ No newline at end of file +################################################################################ diff --git a/tests/testthat/test-phyloseq.R b/tests/testthat/test-phyloseq.R index 96c894f2..89f3358c 100644 --- a/tests/testthat/test-phyloseq.R +++ b/tests/testthat/test-phyloseq.R @@ -16,10 +16,10 @@ test_that("phyloseq: Building a phyloseq-object when tree contains extra quotes, esophagus1 = esophagus phy_tree(esophagus) = tree expect_that(esophagus1, is_identical_to(esophagus)) - # Now try to "rebuild" using the quote-containing - esophagus2 = phyloseq(tree, otu_table(esophagus)) + # Now try to "rebuild" using the quote-containing + esophagus2 = phyloseq(tree, otu_table(esophagus)) expect_that(esophagus2, is_identical_to(esophagus)) - + # Try with a dataset with complete-set of components, Global Patterns data("GlobalPatterns") # Use a subset, because checking identicality of two large, complicated objects takes time. @@ -33,8 +33,8 @@ test_that("phyloseq: Building a phyloseq-object when tree contains extra quotes, GP1 = GP phy_tree(GP) = tree expect_that(GP1, is_identical_to(GP)) - # Now try to "rebuild" using the quote-containing - GP2 = phyloseq(tree, otu_table(GP), tax_table(GP), sample_data(GP)) + # Now try to "rebuild" using the quote-containing + GP2 = phyloseq(tree, otu_table(GP), tax_table(GP), sample_data(GP)) expect_that(GP2, is_identical_to(GP)) }) ################################################################################ @@ -51,18 +51,18 @@ test_that("taxa_names(x)<- and sample_names(x)<- behaves as expected", { # taxa_names<- new_taxa_names = paste("OTU-", taxa_names(e1), sep="") taxa_names(e1) = new_taxa_names - expect_that(identical(taxa_names(e1), new_taxa_names), is_true()) - expect_that(identical(taxa_names(phy_tree(e1)), new_taxa_names), is_true()) - expect_that(identical(taxa_names(otu_table(e1)), new_taxa_names), is_true()) - expect_that(identical(taxa_names(tax_table(e1)), new_taxa_names), is_true()) -# expect_that(identical(taxa_names(refseq(e1)), new_taxa_names), is_true()) - + expect_true(identical(taxa_names(e1), new_taxa_names)) + expect_true(identical(taxa_names(phy_tree(e1)), new_taxa_names)) + expect_true(identical(taxa_names(otu_table(e1)), new_taxa_names)) + expect_true(identical(taxa_names(tax_table(e1)), new_taxa_names)) +# expect_true(identical(taxa_names(refseq(e1)), new_taxa_names)) + # sample_names<- new_sample_names = paste("Sa-", sample_names(e1), sep="") sample_names(e1) = new_sample_names - expect_that(identical(sample_names(e1), new_sample_names), is_true()) - expect_that(identical(sample_names(sample_data(e1)), new_sample_names), is_true()) - expect_that(identical(sample_names(otu_table(e1)), new_sample_names), is_true()) + expect_true(identical(sample_names(e1), new_sample_names)) + expect_true(identical(sample_names(sample_data(e1)), new_sample_names)) + expect_true(identical(sample_names(otu_table(e1)), new_sample_names)) }) test_that("Test intersect_*() and prune_*() methods behave as expected", { e0 = e1 @@ -75,17 +75,17 @@ test_that("Test intersect_*() and prune_*() methods behave as expected", { i = sample(ntaxa(e1), ntaxa(e1)-nunchained, replace=FALSE) new_taxa_names[i] = paste("OTU-", taxa_names(e1)[i], sep="") taxa_names(e1@tax_table) = new_taxa_names - expect_that(identical(taxa_names(e1), new_taxa_names), is_false()) - expect_that(identical(taxa_names(tax_table(e1)), new_taxa_names), is_true()) - expect_that(identical(taxa_names(otu_table(e1)), new_taxa_names), is_false()) - expect_that(identical(taxa_names(phy_tree(e1)), new_taxa_names), is_false()) -# expect_that(identical(taxa_names(refseq(e1)), new_taxa_names), is_false()) + expect_false(identical(taxa_names(e1), new_taxa_names)) + expect_true(identical(taxa_names(tax_table(e1)), new_taxa_names)) + expect_false(identical(taxa_names(otu_table(e1)), new_taxa_names)) + expect_false(identical(taxa_names(phy_tree(e1)), new_taxa_names)) +# expect_false(identical(taxa_names(refseq(e1)), new_taxa_names)) ## Okay so that worked. Now we test if the intersection functions behave - expect_that(identical(length(phyloseq:::intersect_taxa(e1)), nunchained), is_true()) + expect_true(identical(length(phyloseq:::intersect_taxa(e1)), nunchained)) e2 = prune_taxa(phyloseq:::intersect_taxa(e1), e1) - expect_that(identical(ntaxa(e2), nunchained), is_true()) - expect_that(setequal(taxa_names(e2), taxa_names(e1)[-i]), is_true()) - + expect_true(identical(ntaxa(e2), nunchained)) + expect_true(setequal(taxa_names(e2), taxa_names(e1)[-i])) + # sample_names<- e1 = e0 ## We assign new names to just one component, being sneaky and using the @@ -96,45 +96,45 @@ test_that("Test intersect_*() and prune_*() methods behave as expected", { i = sample(nsamples(e1), nsamples(e1)-nunchained, replace=FALSE) new_sample_names[i] = paste("Sa-", sample_names(e1)[i], sep="") sample_names(e1@sam_data) = new_sample_names - expect_that(identical(sample_names(e1), new_sample_names), is_false()) - expect_that(identical(sample_names(sample_data(e1)), new_sample_names), is_true()) - expect_that(identical(sample_names(otu_table(e1)), new_sample_names), is_false()) - + expect_false(identical(sample_names(e1), new_sample_names)) + expect_true(identical(sample_names(sample_data(e1)), new_sample_names)) + expect_false(identical(sample_names(otu_table(e1)), new_sample_names)) + ## Okay so that worked. Now we test if the intersection functions behave - expect_that(identical(length(phyloseq:::intersect_samples(e1)), nunchained), is_true()) + expect_true(identical(length(phyloseq:::intersect_samples(e1)), nunchained)) e2 = prune_samples(phyloseq:::intersect_samples(e1), e1) - expect_that(identical(nsamples(e2), nunchained), is_true()) - expect_that(setequal(sample_names(e2), sample_names(e1)[-i]), is_true()) - + expect_true(identical(nsamples(e2), nunchained)) + expect_true(setequal(sample_names(e2), sample_names(e1)[-i])) + }) test_that("Test ordering", { OTU = otu_table(e1) tree = phy_tree(e1) - expect_that(identical(taxa_names(OTU), taxa_names(tree)), is_true()) + expect_true(identical(taxa_names(OTU), taxa_names(tree))) reotaxnames = sample(taxa_names(tree), ntaxa(tree), FALSE) - expect_that(identical(taxa_names(OTU), reotaxnames), is_false()) + expect_false(identical(taxa_names(OTU), reotaxnames)) # scramble order of taxa_names in tree by random arbitrary assignment taxa_names(tree) <- reotaxnames - expect_that(identical(taxa_names(OTU), taxa_names(tree)), is_false()) + expect_false(identical(taxa_names(OTU), taxa_names(tree))) # implicitly re-order in constructor e3 = phyloseq(OTU, tree) - expect_that(identical(taxa_names(e3), taxa_names(tree)), is_true()) - expect_that(identical(taxa_names(otu_table(e3)), taxa_names(phy_tree(e3))), is_true()) - expect_that(identical(taxa_names(otu_table(e3)), taxa_names(phy_tree(e3))), is_true()) - + expect_true(identical(taxa_names(e3), taxa_names(tree))) + expect_true(identical(taxa_names(otu_table(e3)), taxa_names(phy_tree(e3)))) + expect_true(identical(taxa_names(otu_table(e3)), taxa_names(phy_tree(e3)))) + # Glad that worked. Now let's mess up sample indices in one component, and OTU indices in another # then fix explicitly with index_reorder(e4, "both") e4 = e1 reosamplenames = sample(sample_names(e1), nsamples(e1), FALSE) sample_names(e4@sam_data) <- reosamplenames taxa_names(e4@tax_table) <- reotaxnames - - expect_that(identical(taxa_names(otu_table(e4)), taxa_names(tax_table(e4))), is_false()) - expect_that(identical(sample_names(otu_table(e4)), sample_names(sample_data(e4))), is_false()) + + expect_false(identical(taxa_names(otu_table(e4)), taxa_names(tax_table(e4)))) + expect_false(identical(sample_names(otu_table(e4)), sample_names(sample_data(e4)))) e4 = phyloseq:::index_reorder(e4, "both") - expect_that(identical(taxa_names(otu_table(e4)), taxa_names(tax_table(e4))), is_true()) - expect_that(identical(sample_names(otu_table(e4)), sample_names(sample_data(e4))), is_true()) + expect_true(identical(taxa_names(otu_table(e4)), taxa_names(tax_table(e4)))) + expect_true(identical(sample_names(otu_table(e4)), sample_names(sample_data(e4)))) }) ################################################################################ diff --git a/tests/testthat/test-subset.R b/tests/testthat/test-subset.R index 20b7c890..a8fe1569 100644 --- a/tests/testthat/test-subset.R +++ b/tests/testthat/test-subset.R @@ -17,28 +17,28 @@ test_that("Classes of pruned phyloseq and its components are as expected", { expect_that(nsamples(GP3), is_identical_to(3L)) expect_that(GP3, is_a("phyloseq")) expect_that(access(GP3, "sam_data"), is_a("sample_data")) - expect_that(access(GP3, "otu_table"), is_a("otu_table")) - expect_that(access(GP3, "phy_tree"), is_a("phylo")) - expect_that(access(GP3, "tax_table"), is_a("taxonomyTable")) + expect_that(access(GP3, "otu_table"), is_a("otu_table")) + expect_that(access(GP3, "phy_tree"), is_a("phylo")) + expect_that(access(GP3, "tax_table"), is_a("taxonomyTable")) # Now try on instance without sample data (empty slot) GPnoSD <- phyloseq(otu_table(GP), tax_table(GP)) GP3noSD <- prune_samples(keepNames, GPnoSD) - expect_that(nsamples(GP3noSD), is_identical_to(3L)) - expect_that(access(GP3noSD, "otu_table"), is_a("otu_table")) + expect_that(nsamples(GP3noSD), is_identical_to(3L)) + expect_that(access(GP3noSD, "otu_table"), is_a("otu_table")) expect_that(access(GP3noSD, "sam_data"), is_a("NULL")) - expect_that(access(GP3noSD, "phy_tree"), is_a("NULL")) - expect_that(access(GP3noSD, "tax_table"), is_a("taxonomyTable")) + expect_that(access(GP3noSD, "phy_tree"), is_a("NULL")) + expect_that(access(GP3noSD, "tax_table"), is_a("taxonomyTable")) }) test_that("prune_samples works on sample_data-only and otu_table-only data", { GPotu <- prune_samples(keepNames, access(GP, "otu_table", TRUE)) - GPsd <- prune_samples(keepNames, access(GP, "sam_data", TRUE)) - expect_that(nsamples(GPotu), is_identical_to(3L)) - expect_that(nsamples(GPsd), is_identical_to(3L)) + GPsd <- prune_samples(keepNames, access(GP, "sam_data", TRUE)) + expect_that(nsamples(GPotu), is_identical_to(3L)) + expect_that(nsamples(GPsd), is_identical_to(3L)) expect_that(GPotu, is_a("otu_table")) expect_that(GPsd, is_a("sample_data")) expect_that(dim(GPotu), is_identical_to(c(19216L, 3L))) - expect_that(dim(GPsd), is_identical_to(c(3L, 7L))) + expect_that(dim(GPsd), is_identical_to(c(3L, 7L))) }) # Coerce orientation for apply @@ -55,30 +55,30 @@ samobs = sort(samobs, TRUE)[1:50] samobs = sample(samobs, length(samobs), FALSE) test_that("Initial order before pruning check is different", { - expect_that(setequal(names(samobs), taxa_names(phy_tree(GP))[1:50]), is_false()) - expect_that(setequal(names(samobs), taxa_names(GP)[1:50]), is_false()) - expect_that(identical(names(samobs), taxa_names(GP)[1:50]), is_false()) + expect_false(setequal(names(samobs), taxa_names(phy_tree(GP))[1:50])) + expect_false(setequal(names(samobs), taxa_names(GP)[1:50])) + expect_false(identical(names(samobs), taxa_names(GP)[1:50])) }) # prune to just samobs OTUs pGP = prune_taxa(names(samobs), GP) test_that("The set of names should be the same after pruning, names(samobs)", { - expect_that(setequal(names(samobs), taxa_names(phy_tree(pGP))), is_true()) - expect_that(setequal(names(samobs), taxa_names(otu_table(pGP))), is_true()) - expect_that(setequal(names(samobs), taxa_names(tax_table(pGP))), is_true()) + expect_true(setequal(names(samobs), taxa_names(phy_tree(pGP)))) + expect_true(setequal(names(samobs), taxa_names(otu_table(pGP)))) + expect_true(setequal(names(samobs), taxa_names(tax_table(pGP)))) }) test_that("The set/order of taxa names after pruning should be consistent", { # set equal - expect_that(setequal(taxa_names(pGP), taxa_names(phy_tree(pGP))), is_true()) - expect_that(setequal(taxa_names(otu_table(pGP)), taxa_names(phy_tree(pGP))), is_true()) - expect_that(setequal(taxa_names(tax_table(pGP)), taxa_names(phy_tree(pGP))), is_true()) + expect_true(setequal(taxa_names(pGP), taxa_names(phy_tree(pGP)))) + expect_true(setequal(taxa_names(otu_table(pGP)), taxa_names(phy_tree(pGP)))) + expect_true(setequal(taxa_names(tax_table(pGP)), taxa_names(phy_tree(pGP)))) # identical - expect_that(identical(taxa_names(pGP), taxa_names(phy_tree(pGP))), is_true()) - expect_that(identical(taxa_names(otu_table(pGP)), taxa_names(phy_tree(pGP))), is_true()) - expect_that(identical(taxa_names(tax_table(pGP)), taxa_names(phy_tree(pGP))), is_true()) - expect_that(identical(names(samobs), taxa_names(phy_tree(pGP))), is_false()) + expect_true(identical(taxa_names(pGP), taxa_names(phy_tree(pGP)))) + expect_true(identical(taxa_names(otu_table(pGP)), taxa_names(phy_tree(pGP)))) + expect_true(identical(taxa_names(tax_table(pGP)), taxa_names(phy_tree(pGP)))) + expect_false(identical(names(samobs), taxa_names(phy_tree(pGP)))) # plot_tree(pGP, "sampledodge", nodeplotblank, label.tips="taxa_names", plot.margin=0.75) }) @@ -86,8 +86,8 @@ test_that("The set/order of taxa names after pruning should be consistent", { #' data("esophagus") #' esophagus #' plot(sort(taxa_sums(esophagus), TRUE), type="h", ylim=c(0, 50)) -#' x1 = prune_taxa(taxa_sums(esophagus) > 10, esophagus) -#' x2 = prune_taxa(names(sort(taxa_sums(esophagus), TRUE))[1:9], esophagus) +#' x1 = prune_taxa(taxa_sums(esophagus) > 10, esophagus) +#' x2 = prune_taxa(names(sort(taxa_sums(esophagus), TRUE))[1:9], esophagus) #' identical(x1, x2) @@ -104,7 +104,7 @@ test_that("filter_taxa gives correct, reliable logicals and pruning", { ent.trim <- filter_taxa(enterotype, flist, TRUE) expect_is(ent.trim, ("phyloseq")) expect_equal(sum(ent.logi), (ntaxa(ent.trim))) - expect_identical(prune_taxa(ent.logi, enterotype), (ent.trim)) + expect_identical(prune_taxa(ent.logi, enterotype), (ent.trim)) expect_equal(ntaxa(ent.trim), (416L)) - expect_equal(nsamples(ent.trim), (nsamples(enterotype))) + expect_equal(nsamples(ent.trim), (nsamples(enterotype))) }) From ae5917d16fc441e7dc6c38bad8df2c1e708be903 Mon Sep 17 00:00:00 2001 From: Salix <31168746+salix-d@users.noreply.github.com> Date: Sat, 22 Oct 2022 12:11:14 -0400 Subject: [PATCH 3/9] update deprecated guide argument --- vignettes/phyloseq-analysis.Rmd | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vignettes/phyloseq-analysis.Rmd b/vignettes/phyloseq-analysis.Rmd index 86c9cdc2..a0ac05b2 100644 --- a/vignettes/phyloseq-analysis.Rmd +++ b/vignettes/phyloseq-analysis.Rmd @@ -242,7 +242,7 @@ Network representation of the relationship between microbiome samples in the "En For further details, see the [plot_ordination tutorial](http://joey711.github.io/phyloseq/plot_ordination-examples.html) -Ordination methods can be a useful tool for exploring complex phylogenetic sequencing data, particularly when the hypothesized structure of the data is poorly defined (or there isn't a hypothesis). The phyloseq package provides some useful tools for performing ordinations and plotting their results, via the `ordinate() and `plot_ordination() functions, respectively. Although there are many options and methods supported, a first-step will probably look something like the following: +Ordination methods can be a useful tool for exploring complex phylogenetic sequencing data, particularly when the hypothesized structure of the data is poorly defined (or there isn't a hypothesis). The phyloseq package provides some useful tools for performing ordinations and plotting their results, via the `ordinate()` and `plot_ordination()` functions, respectively. Although there are many options and methods supported, a first-step will probably look something like the following: ```{r eval=FALSE} my.physeq <- import("Biom", BIOMfilename="myBiomFile.biom") @@ -298,7 +298,7 @@ Next, we will reproduce Figure 5 from the "Global Patterns" article([Caporaso 20 ```{r GPfig5ax1213} (p12 <- plot_ordination(GlobalPatterns, GloPa.pcoa, "samples", color="SampleType") + - geom_point(size=5) + geom_path() + scale_colour_hue(guide = FALSE) ) + geom_point(size=5) + geom_path() + scale_colour_hue(guide = "none") ) (p13 <- plot_ordination(GlobalPatterns, GloPa.pcoa, "samples", axes=c(1, 3), color="SampleType") + geom_line() + geom_point(size=5) ) ``` @@ -368,7 +368,7 @@ We will now investigate further this top-level structure of the data, using an a ```{r GPCAspecplot0} p1 <- plot_ordination(GP, gpca, "species", color="Phylum") (p1 <- ggplot(p1$data, p1$mapping) + geom_point(size=5, alpha=0.5) + - facet_wrap(~Phylum) + scale_colour_hue(guide = FALSE) ) + facet_wrap(~Phylum) + scale_colour_hue(guide = "none") ) ``` Species plot of the "Global Patterns" correspondence analysis first two axes, with each phylum on a different panel ("facet"). Only the most abundant 5 phyla among the most abundant 200 taxa (cumulative, all samples) are included. Arbitrary reduction, for computational efficiency of example. @@ -377,7 +377,7 @@ Let's try drawing the figure again, only this time summarizing the species point ```{r GPCAspecplotTopo0} (p3 <- ggplot(p1$data, p1$mapping) + geom_density2d() + - facet_wrap(~Phylum) + scale_colour_hue(guide = FALSE) ) + facet_wrap(~Phylum) + scale_colour_hue(guide = "none") ) ``` Redrawn figure, which is severely overplotted, as a 2-dimensional species-density topographic map, faceted in the same way. @@ -394,7 +394,7 @@ LF <- subset(mdf, variable=="CA2" & value < -1.0) p <- ggplot(mdf, aes(Phylum, value, color=Phylum)) + geom_boxplot() + facet_wrap(~variable, 2) + - scale_colour_hue(guide = FALSE) + + scale_colour_hue(guide = "none") + theme_bw() + theme( axis.text.x = element_text(angle = -90, vjust = 0.5) ) # Add the text label layer, and render ggplot graphic From f7cb67ef23b6e200fc80a78b12bdf7acfd58612a Mon Sep 17 00:00:00 2001 From: Salix <31168746+salix-d@users.noreply.github.com> Date: Sat, 22 Oct 2022 12:11:23 -0400 Subject: [PATCH 4/9] changes made by check/build --- vignettes/phyloseq-basics.Rmd | 703 ++++++++++++++++++++++++---------- 1 file changed, 497 insertions(+), 206 deletions(-) diff --git a/vignettes/phyloseq-basics.Rmd b/vignettes/phyloseq-basics.Rmd index a44421e9..283299a0 100644 --- a/vignettes/phyloseq-basics.Rmd +++ b/vignettes/phyloseq-basics.Rmd @@ -7,165 +7,233 @@ output: toc: yes toc_depth: 2 number_sections: true +editor_options: + markdown: + wrap: 72 --- + +```{=html} - +``` `r library("knitr")` `r opts_chunk$set(cache=FALSE, fig.width=9, message=FALSE, warning=FALSE)` Paul J. McMurdie and Susan Holmes - +[mcmurdie\@stanford.edu](mailto:mcmurdie@stanford.edu){.email} [phyloseq Home Page](http://joey711.github.io/phyloseq/) -If you find phyloseq and/or its tutorials useful, please acknowledge and cite phyloseq in your publications: +If you find phyloseq and/or its tutorials useful, please acknowledge and +cite phyloseq in your publications: -**phyloseq: An R package for reproducible interactive analysis and graphics of microbiome census data** (2013) PLoS ONE 8(4):e61217 -http://dx.plos.org/10.1371/journal.pone.0061217 +**phyloseq: An R package for reproducible interactive analysis and +graphics of microbiome census data** (2013) PLoS ONE 8(4):e61217 + ## Other resources -The phyloseq project also has a number of supporting online resources, most of which can by found at [the phyloseq home page](http://joey711.github.com/phyloseq/), or from the phyloseq stable release [page on Bioconductor](http://bioconductor.org/packages/release/bioc/html/phyloseq.html). -To post feature requests or ask for help, try [the phyloseq Issue Tracker](https://github.com/joey711/phyloseq/issues). +The phyloseq project also has a number of supporting online resources, +most of which can by found at [the phyloseq home +page](http://joey711.github.com/phyloseq/), or from the phyloseq stable +release [page on +Bioconductor](http://bioconductor.org/packages/release/bioc/html/phyloseq.html). +To post feature requests or ask for help, try [the phyloseq Issue +Tracker](https://github.com/joey711/phyloseq/issues). # Introduction -The analysis of microbiological communities brings many challenges: the integration of many different types of data with methods from ecology, genetics, phylogenetics, network analysis, visualization and testing. The data itself may originate from widely different sources, such as the microbiomes of humans, soils, surface and ocean waters, wastewater treatment plants, industrial facilities, and so on; and as a result, these varied sample types may have very different forms and scales of related data that is extremely dependent upon the experiment and its question(s). The phyloseq package is a tool to import, store, analyze, and graphically display complex phylogenetic sequencing data that has already been clustered into Operational Taxonomic Units (OTUs), especially when there is associated sample data, phylogenetic tree, and/or taxonomic assignment of the OTUs. This package leverages many of the tools available in R for ecology and phylogenetic analysis (vegan, ade4, ape, picante), while also using advanced/flexible graphic systems (ggplot2) to easily produce publication-quality graphics of complex phylogenetic data. phyloseq uses a specialized system of S4 classes to store all related phylogenetic sequencing data as single experiment-level object, making it easier to share data and reproduce analyses. In general, phyloseq seeks to facilitate the use of R for efficient interactive and reproducible analysis of OTU-clustered high-throughput phylogenetic sequencing data. - +The analysis of microbiological communities brings many challenges: the +integration of many different types of data with methods from ecology, +genetics, phylogenetics, network analysis, visualization and testing. +The data itself may originate from widely different sources, such as the +microbiomes of humans, soils, surface and ocean waters, wastewater +treatment plants, industrial facilities, and so on; and as a result, +these varied sample types may have very different forms and scales of +related data that is extremely dependent upon the experiment and its +question(s). The phyloseq package is a tool to import, store, analyze, +and graphically display complex phylogenetic sequencing data that has +already been clustered into Operational Taxonomic Units (OTUs), +especially when there is associated sample data, phylogenetic tree, +and/or taxonomic assignment of the OTUs. This package leverages many of +the tools available in R for ecology and phylogenetic analysis (vegan, +ade4, ape, picante), while also using advanced/flexible graphic systems +(ggplot2) to easily produce publication-quality graphics of complex +phylogenetic data. phyloseq uses a specialized system of S4 classes to +store all related phylogenetic sequencing data as single +experiment-level object, making it easier to share data and reproduce +analyses. In general, phyloseq seeks to facilitate the use of R for +efficient interactive and reproducible analysis of OTU-clustered +high-throughput phylogenetic sequencing data. # About this vignette ## Typesetting Legend -- **bold** - Bold is used for emphasis. -- *italics* - Italics are used for package names, and special words, phrases. -- `code font` - The font for code, usually courrier-like, -but depends on the theme. -- `myFun()` - Code font word with `()` attached at the right-end, -is a function name. -- [Hyperlink](#sec:typeset-legend) - Hyperlinks are -clickable text that will jump to sections and external pages. +- **bold** - Bold is used for emphasis. +- *italics* - Italics are used for package names, and special words, + phrases. +- `code font` - The font for code, usually courrier-like, but depends + on the theme. +- `myFun()` - Code font word with `()` attached at the right-end, is a + function name. +- [Hyperlink](#sec:typeset-legend) - Hyperlinks are clickable text + that will jump to sections and external pages. ## Other links and tutorials -An overview of phyloseq's intended functionality, goals, and design is provided -in the following free and open access article: +An overview of phyloseq's intended functionality, goals, and design is +provided in the following free and open access article: -McMurdie and Holmes (2013). [phyloseq: An R package for reproducible interactive analysis and graphics of microbiome census data](http://dx.plos.org/10.1371/journal.pone.0061217). PLoS ONE e61217. +McMurdie and Holmes (2013). [phyloseq: An R package for reproducible +interactive analysis and graphics of microbiome census +data](http://dx.plos.org/10.1371/journal.pone.0061217). PLoS ONE e61217. -The most updated examples are posted in our online tutorials from -[the phyloseq home page](http://joey711.github.com/phyloseq) +The most updated examples are posted in our online tutorials from [the +phyloseq home page](http://joey711.github.com/phyloseq) -A separate vignette describes analysis tools included in phyloseq along with various examples using included example data. A quick way to load it is: +A separate vignette describes analysis tools included in phyloseq along +with various examples using included example data. A quick way to load +it is: ```{r, eval=FALSE} vignette("phyloseq_analysis") ``` -By contrast, this vignette is intended to provide functional examples of the basic data import and manipulation infrastructure included in phyloseq. This includes example code for importing OTU-clustered data from different clustering pipelines, as well as performing clear and reproducible filtering tasks that can be altered later and checked for robustness. The motivation for including tools like this in phyloseq is to save time, and also to build-in a structure that requires consistency across related data tables from the same experiment.This not only reduces code repetition, but also decreases the likelihood of mistakes during data filtering and analysis. For example, it is intentionally difficult in phyloseq to create an experiment-level object in which a component tree and OTU table have different OTU names. The import functions, trimming tools, as well as the main tool for creating an experiment-level object, `phyloseq`, all automatically trim the OTUs and samples indices to their intersection, such that these component data types are exactly coherent. - +By contrast, this vignette is intended to provide functional examples of +the basic data import and manipulation infrastructure included in +phyloseq. This includes example code for importing OTU-clustered data +from different clustering pipelines, as well as performing clear and +reproducible filtering tasks that can be altered later and checked for +robustness. The motivation for including tools like this in phyloseq is +to save time, and also to build-in a structure that requires consistency +across related data tables from the same experiment.This not only +reduces code repetition, but also decreases the likelihood of mistakes +during data filtering and analysis. For example, it is intentionally +difficult in phyloseq to create an experiment-level object in which a +component tree and OTU table have different OTU names. The import +functions, trimming tools, as well as the main tool for creating an +experiment-level object, `phyloseq`, all automatically trim the OTUs and +samples indices to their intersection, such that these component data +types are exactly coherent. # phyloseq classes -The class structure in the *phyloseq* package follows the inheritance diagram shown in the figure below. -Currently, *phyloseq* uses 4 core data classes. -They are -(1) the OTU abundance table (`otu_table`), -a table of sample data (`sample_data`); -(2) a table of taxonomic descriptors (`taxonomyTable`); and -(3) a phylogenetic tree (`"phylo"`-class, [ape package](http://cran.r-project.org/web/packages/ape/). - -The `otu_table` class can be considered the central data type, -as it directly represents the number and type of sequences observed in each sample. -`otu_table` extends the numeric matrix class in the `R` base, -and has a few additonal feature slots. -The most important of these feature slots is the `taxa_are_rows` slot, -which holds a single logical that indicates whether the table is oriented -with taxa as rows (as in the *genefilter* package in [Bioconductor](#cite:bioconductor) -or with taxa as columns (as in *vegan* and *picante* packages). -In *phyloseq* methods, as well as its extensions of methods in other packages, -the `taxa_are_rows` value is checked to ensure proper orientation of the `otu_table`. -A *phyloseq* user is only required to specify the `otu_table` orientation during initialization, following which all handling is internal. - -The `sample_data` class directly inherits `R`'s `data.frame` class, and thus effectively stores both categorical and numerical data about each sample. The orientation of a `data.frame` in this context requires that samples/trials are rows, and variables are columns (consistent with *vegan* and other packages). The `taxonomyTable` class directly inherits the `matrix` class, and is oriented such that rows are taxa/OTUs and columns are taxonomic levels (e.g. *Phylum*). - -The phyloseq-class can be considered an "experiment-level class" and should contain two or more of the previously-described core data classes. We assume that *phyloseq* users will be interested in analyses that utilize their abundance counts derived from the phylogenetic sequencing data, and so the `phyloseq()` constructor will stop with an error if the arguments do not include an `otu_table`. There are a number of common methods that require either an `otu_table` and `sample_data` combination, or an `otu_table` and phylogenetic tree combination. These methods can operate on instances of the phyloseq-class, and will stop with an error if the required component data is missing. - -![phyloseq class structure](phyloseq_classes_7.png) - Classes and inheritance in the *phyloseq* package. The class name and its slots are shown with red- or blue-shaded text, respectively. Coercibility is indicated graphically by arrows with the coercion function shown. Lines without arrows indicate that the more complex class (``phyloseq") contains a slot with the associated data class as its components. - +The class structure in the *phyloseq* package follows the inheritance +diagram shown in the figure below. Currently, *phyloseq* uses 4 core +data classes. They are (1) the OTU abundance table (`otu_table`), a +table of sample data (`sample_data`); (2) a table of taxonomic +descriptors (`taxonomyTable`); and (3) a phylogenetic tree +(`"phylo"`-class, [ape +package](http://cran.r-project.org/web/packages/ape/). + +The `otu_table` class can be considered the central data type, as it +directly represents the number and type of sequences observed in each +sample. `otu_table` extends the numeric matrix class in the `R` base, +and has a few additonal feature slots. The most important of these +feature slots is the `taxa_are_rows` slot, which holds a single logical +that indicates whether the table is oriented with taxa as rows (as in +the *genefilter* package in [Bioconductor](#cite:bioconductor) or with +taxa as columns (as in *vegan* and *picante* packages). In *phyloseq* +methods, as well as its extensions of methods in other packages, the +`taxa_are_rows` value is checked to ensure proper orientation of the +`otu_table`. A *phyloseq* user is only required to specify the +`otu_table` orientation during initialization, following which all +handling is internal. + +The `sample_data` class directly inherits `R`'s `data.frame` class, and +thus effectively stores both categorical and numerical data about each +sample. The orientation of a `data.frame` in this context requires that +samples/trials are rows, and variables are columns (consistent with +*vegan* and other packages). The `taxonomyTable` class directly inherits +the `matrix` class, and is oriented such that rows are taxa/OTUs and +columns are taxonomic levels (e.g. *Phylum*). + +The phyloseq-class can be considered an "experiment-level class" and +should contain two or more of the previously-described core data +classes. We assume that *phyloseq* users will be interested in analyses +that utilize their abundance counts derived from the phylogenetic +sequencing data, and so the `phyloseq()` constructor will stop with an +error if the arguments do not include an `otu_table`. There are a number +of common methods that require either an `otu_table` and `sample_data` +combination, or an `otu_table` and phylogenetic tree combination. These +methods can operate on instances of the phyloseq-class, and will stop +with an error if the required component data is missing. + +![phyloseq class structure](phyloseq_classes_7.png) Classes and +inheritance in the *phyloseq* package. The class name and its slots are +shown with red- or blue-shaded text, respectively. Coercibility is +indicated graphically by arrows with the coercion function shown. Lines +without arrows indicate that the more complex class (\`\`phyloseq") +contains a slot with the associated data class as its components. # Load *phyloseq* and import data -Now let's get started by loading phyloseq, and describing some methods for importing data. +Now let's get started by loading phyloseq, and describing some methods +for importing data. ## Load *phyloseq* -To use *phyloseq* in a new R session, it will have to be loaded. This can be done in your package manager, or at the command line using the `library()` command: +To use *phyloseq* in a new R session, it will have to be loaded. This +can be done in your package manager, or at the command line using the +`library()` command: + ```{r load-packages, message=FALSE, warning=FALSE} library("phyloseq") ``` ## Import data -An important feature of *phyloseq* are methods -for importing phylogenetic sequencing data -from common taxonomic clustering pipelines. -These methods take file pathnames as input, -read and parse those files, -and return a single object that contains all of the data. - -Some additional background details are provided below. -The best reproducible examples on importing data with phyloseq -can be found on the official data import tutorial page: +An important feature of *phyloseq* are methods for importing +phylogenetic sequencing data from common taxonomic clustering pipelines. +These methods take file pathnames as input, read and parse those files, +and return a single object that contains all of the data. -http://joey711.github.com/phyloseq/import-data +Some additional background details are provided below. The best +reproducible examples on importing data with phyloseq can be found on +the official data import tutorial page: + ## Import from biom-format New versions of QIIME (see below) produce a file in *version 2* of the -[biom file format](http://biom-format.org/), -which is a specialized definition of the HDF5 format. - -The phyloseq package provides the `import_biom()` function, -which can import both -*Version 1* (JSON) and -*Version 2* (HDF5) -of the BIOM file format. +[biom file format](http://biom-format.org/), which is a specialized +definition of the HDF5 format. -The *phyloseq* package fully supports -both taxa and sample observations of the biom format standard, -and works with the BIOM files output from QIIME, RDP, MG-RAST, etc. +The phyloseq package provides the `import_biom()` function, which can +import both *Version 1* (JSON) and *Version 2* (HDF5) of the BIOM file +format. +The *phyloseq* package fully supports both taxa and sample observations +of the biom format standard, and works with the BIOM files output from +QIIME, RDP, MG-RAST, etc. ## Import from QIIME (Modern) -The default output from modern versions of QIIME -is a BIOM-format file (among others). -This is suppored in phyloseq. +The default output from modern versions of QIIME is a BIOM-format file +(among others). This is suppored in phyloseq. ### Sample data from QIIME -Sometimes inaccurately referred to as *metadata*, -additional observations on samples provided as *mapping file* to QIIME -have not typically been output in the BIOM files, -**even though BIOM format supports it**. -This failure to support the full capability of the BIOM format -means that you'll have to provide sample observations as a separate file. -There are many ways to do this, but the QIIME sample map is supported. +Sometimes inaccurately referred to as *metadata*, additional +observations on samples provided as *mapping file* to QIIME have not +typically been output in the BIOM files, **even though BIOM format +supports it**. This failure to support the full capability of the BIOM +format means that you'll have to provide sample observations as a +separate file. There are many ways to do this, but the QIIME sample map +is supported. ### Input -Two QIIME output files (`.biom`, `.tre`) -are recognized by the `import_biom()` function. -One QIIME input file (sample map, tab-delimited), -is recognized by the `import_qiime_sample_data()` function. +Two QIIME output files (`.biom`, `.tre`) are recognized by the +`import_biom()` function. One QIIME input file (sample map, +tab-delimited), is recognized by the `import_qiime_sample_data()` +function. --- Input File(s) | phyloseq function | Output @@ -175,94 +243,148 @@ is recognized by the `import_qiime_sample_data()` function. `map.txt` | `import_qiime_sample_data()` | A `sample_data` object --- -The objects created by each of the import functions above -should be merged using `merge_phyloseq` to create one coordinated, self-consistent object. +The objects created by each of the import functions above should be +merged using `merge_phyloseq` to create one coordinated, self-consistent +object. ### Output -- **Before Merging** - Before merging with `merge_phyloseq`, the output from these import activities is the three separate objects listed in the previous table. -- **After Merging** - After merging you have a single self-consistent phyloseq object -that contains an OTU table, taxonomy table, sample-data, and a phylogenetic tree. +- **Before Merging** - Before merging with `merge_phyloseq`, the + output from these import activities is the three separate objects + listed in the previous table. +- **After Merging** - After merging you have a single self-consistent + phyloseq object that contains an OTU table, taxonomy table, + sample-data, and a phylogenetic tree. ### QIIME Example Tutorial -QIIME's "Moving Pictures" example tutorial output is a little too large -to include within the phyloseq package -(and thus is not directly included in this vignette). -However, the phyloseq home page includes -a full reproducible example of the import procedure described above: +QIIME's "Moving Pictures" example tutorial output is a little too large +to include within the phyloseq package (and thus is not directly +included in this vignette). However, the phyloseq home page includes a +full reproducible example of the import procedure described above: **Link HERE** -For reference, or if you want to try yourself, -the following is the relative paths within the QIIME tutorial directory -for each of the files you will need. - -- BIOM file, originally at: -`r "moving_pictures_tutorial-1.9.0/illumina/precomputed-output/otus/otu_table_mc2_w_tax_no_pynast_failures.biom"` -- Tree file, originally at: -`r "moving_pictures_tutorial-1.9.0/illumina/precomputed-output/otus/rep_set.tre"` -- Map File, originally at: -`r "moving_pictures_tutorial-1.9.0/illumina/map.tsv"` +For reference, or if you want to try yourself, the following is the +relative paths within the QIIME tutorial directory for each of the files +you will need. +- BIOM file, originally at: + `r "moving_pictures_tutorial-1.9.0/illumina/precomputed-output/otus/otu_table_mc2_w_tax_no_pynast_failures.biom"` +- Tree file, originally at: + `r "moving_pictures_tutorial-1.9.0/illumina/precomputed-output/otus/rep_set.tre"` +- Map File, originally at: + `r "moving_pictures_tutorial-1.9.0/illumina/map.tsv"` ## Import from QIIME Legacy -[QIIME](#cite:QIIME) is a free, open-source OTU clustering and analysis pipeline written for Unix (mostly Linux). It is distributed in a number of different forms (including a pre-installed virtual machine). See [the QIIME home page](http://qiime.org/) for details. +[QIIME](#cite:QIIME) is a free, open-source OTU clustering and analysis +pipeline written for Unix (mostly Linux). It is distributed in a number +of different forms (including a pre-installed virtual machine). See [the +QIIME home page](http://qiime.org/) for details. ### Input -One QIIME input file (sample map), and two QIIME output files (`otu_table.txt`, `.tre`) are recognized by the `import_qiime()` function. Only one of the three input files is required to run, although an `"otu_table.txt"` file is required if `import_qiime()` is to return a complete experiment object. - -In practice, you will have to find the relevant QIIME files among a number of other files created by the QIIME pipeline. A screenshot of the directory structure created during a typical QIIME run is shown in [the QIIME Directory Figure](#fig:qiimedirectory). - - - -![QIIME directory structure](import_qiime_directory_structure.jpg) - A typical QIIME output directory. The two output files suitable for import by *phyloseq* are highlighted. A third file describing the samples, their barcodes and covariates, is created by the user and required as *input* to QIIME. It is a good idea to import this file, as it can be converted directly to a `sample_data` object and can be extremely useful for certain analyses. - +One QIIME input file (sample map), and two QIIME output files +(`otu_table.txt`, `.tre`) are recognized by the `import_qiime()` +function. Only one of the three input files is required to run, although +an `"otu_table.txt"` file is required if `import_qiime()` is to return a +complete experiment object. + +In practice, you will have to find the relevant QIIME files among a +number of other files created by the QIIME pipeline. A screenshot of the +directory structure created during a typical QIIME run is shown in [the +QIIME Directory Figure](#fig:qiimedirectory). + + ![QIIME directory +structure](import_qiime_directory_structure.jpg) A typical QIIME output +directory. The two output files suitable for import by *phyloseq* are +highlighted. A third file describing the samples, their barcodes and +covariates, is created by the user and required as *input* to QIIME. It +is a good idea to import this file, as it can be converted directly to a +`sample_data` object and can be extremely useful for certain analyses. ### Output -The class of the object returned by `import_qiime()` depends upon which filenames are provided. The most comprehensive class is chosen automatically, based on the input files listed as arguments. At least one argument needs to be provided. - - +The class of the object returned by `import_qiime()` depends upon which +filenames are provided. The most comprehensive class is chosen +automatically, based on the input files listed as arguments. At least +one argument needs to be provided. ## Import from mothur -The open-source, platform-independent, locally-installed software package, [mothur](#cite:Schloss:2009do), can also process barcoded amplicon sequences and perform OTU-clustering. It is extensively documented on [the mothur wiki](http://www.mothur.org/wiki/) +The open-source, platform-independent, locally-installed software +package, [mothur](#cite:Schloss:2009do), can also process barcoded +amplicon sequences and perform OTU-clustering. It is extensively +documented on [the mothur wiki](http://www.mothur.org/wiki/) ### Input -Currently, there are three different files produced by the *mothur* package (Ver `1.22+`) that can be imported by *phyloseq*. At minimum, a user must supply a "`.list`" file, and at least one of the following two files: `.groups` or `.tree`. The group file is produced by *mothur*'s `make.group()` function. Details can be found at [its wiki page](http://www.mothur.org/wiki/Make.group). The tree file is a phylogenetic tree calculated by *mothur*. +Currently, there are three different files produced by the *mothur* +package (Ver `1.22+`) that can be imported by *phyloseq*. At minimum, a +user must supply a "`.list`" file, and at least one of the following two +files: `.groups` or `.tree`. The group file is produced by *mothur*'s +`make.group()` function. Details can be found at [its wiki +page](http://www.mothur.org/wiki/Make.group). The tree file is a +phylogenetic tree calculated by *mothur*. ### Output -The output from `import_mothur()` depends on which file types are provided. If all three file types are provided, an instance of the phyloseq-class is returned that contains both an OTU abundance table and its associated phylogenetic tree. - +The output from `import_mothur()` depends on which file types are +provided. If all three file types are provided, an instance of the +phyloseq-class is returned that contains both an OTU abundance table and +its associated phylogenetic tree. ## Import from PyroTagger -PyroTagger is an OTU-clustering pipeline for barcoded 16S rRNA amplicon sequences, served and maintained by the Department of Energy's (DOE's) Joint Genome Institute (JGI). It can be used through a straightforward web interface at [the PyroTagger home page](http://pyrotagger.jgi-psf.org/) +PyroTagger is an OTU-clustering pipeline for barcoded 16S rRNA amplicon +sequences, served and maintained by the Department of Energy's (DOE's) +Joint Genome Institute (JGI). It can be used through a straightforward +web interface at [the PyroTagger home +page](http://pyrotagger.jgi-psf.org/) -PyroTagger takes as input the untrimmed sequence (`.fasta`) and sequence-quality (`.qual`) files, as well as a sample mapping file that contains the bar code sequence for each sample and its name. It uses a 97\% identity threshold for defining OTU clusters (approximately species-level of taxonomic distinction), and provides no options for specifying otherwise. It does allow users to modify the threshold setting for low-quality bases. +PyroTagger takes as input the untrimmed sequence (`.fasta`) and +sequence-quality (`.qual`) files, as well as a sample mapping file that +contains the bar code sequence for each sample and its name. It uses a +97% identity threshold for defining OTU clusters (approximately +species-level of taxonomic distinction), and provides no options for +specifying otherwise. It does allow users to modify the threshold +setting for low-quality bases. ### Input -PyroTagger returns a single excel spreadsheet file (`.xls`) containing both abundance and taxonomy data, as well as some associated confidence information related to each taxonomic assignment. This spreadsheet also reports on potential chimeric sequences. This single output file is sufficient for `import_RDP_tab()`, provided the file has been converted to a tab-delimited plain-text format. Any spreadsheet application should suffice. No other changes should be made to the `.xls` file. +PyroTagger returns a single excel spreadsheet file (`.xls`) containing +both abundance and taxonomy data, as well as some associated confidence +information related to each taxonomic assignment. This spreadsheet also +reports on potential chimeric sequences. This single output file is +sufficient for `import_RDP_tab()`, provided the file has been converted +to a tab-delimited plain-text format. Any spreadsheet application should +suffice. No other changes should be made to the `.xls` file. ### Output -`import_RDP_tab()` returns an instance of the phyloseq-class that contains the OTU abundance table and taxonomy table. To my knowledge, PyroTagger does not calculate a tree of the representative sequences from each OTU cluster, nor a distance object, so analyses like `tip_glom()` and `UniFrac` are not applicable. - +`import_RDP_tab()` returns an instance of the phyloseq-class that +contains the OTU abundance table and taxonomy table. To my knowledge, +PyroTagger does not calculate a tree of the representative sequences +from each OTU cluster, nor a distance object, so analyses like +`tip_glom()` and `UniFrac` are not applicable. ## Import from RDP pipeline -The Ribosomal Database Project ([RDP](http://rdp.cme.msu.edu/)) provides a web-based barcoded 16S rRNA amplicon sequence processing pipeline called the [RDP Pyrosequencing Pipeline](http://pyro.cme.msu.edu/). A user must run all three of the "Data Processing" steps sequentially through the web interface in order to acquire the output from Complete Linkage Clustering, the approach to OTU clustering used by the RDP Pipeline. Note that this import function assumes that the sequence names in the resulting cluster file follow a particular naming convention with underscore delimiter (see below). +The Ribosomal Database Project ([RDP](http://rdp.cme.msu.edu/)) provides +a web-based barcoded 16S rRNA amplicon sequence processing pipeline +called the [RDP Pyrosequencing Pipeline](http://pyro.cme.msu.edu/). A +user must run all three of the "Data Processing" steps sequentially +through the web interface in order to acquire the output from Complete +Linkage Clustering, the approach to OTU clustering used by the RDP +Pipeline. Note that this import function assumes that the sequence names +in the resulting cluster file follow a particular naming convention with +underscore delimiter (see below). ### Input -The output from the Complete Linkage Clustering, `.clust`, is the only input to the RDP pipeline importer: +The output from the Complete Linkage Clustering, `.clust`, is the only +input to the RDP pipeline importer: ```{r, eval=FALSE} myOTU1 <- import_RDP_cluster("path/to/my/filename.clust") @@ -274,21 +396,35 @@ This importer returns an `otu_table` object. ### Expected Naming Convention -The RDP cluster pipeline (specifically, the output of the complete linkage clustering step) has no formal documentation for the ".clust" file structure or its apparent sequence naming convention. +The RDP cluster pipeline (specifically, the output of the complete +linkage clustering step) has no formal documentation for the ".clust" +file structure or its apparent sequence naming convention. -The cluster file itself contains the names of all sequences contained in the input alignment. If the upstream barcode and aligment processing steps are also done with the RDP pipeline, then the sequence names follow a predictable naming convention wherein each sequence is named by its sample and sequence ID, separated by a `"_"` as delimiter: +The cluster file itself contains the names of all sequences contained in +the input alignment. If the upstream barcode and aligment processing +steps are also done with the RDP pipeline, then the sequence names +follow a predictable naming convention wherein each sequence is named by +its sample and sequence ID, separated by a `"_"` as delimiter: `sampleName_sequenceIDnumber` -This import function assumes that the sequence names in the cluster file follow this convention, and that the sample name does not contain any `"_"`. It is unlikely to work if this is not the case. It is likely to work if you used the upstream steps in the RDP pipeline to process your raw (barcoded, untrimmed) fasta/fastq data. - - +This import function assumes that the sequence names in the cluster file +follow this convention, and that the sample name does not contain any +`"_"`. It is unlikely to work if this is not the case. It is likely to +work if you used the upstream steps in the RDP pipeline to process your +raw (barcoded, untrimmed) fasta/fastq data. ## Example Data (included) -There are multiple example data sets included in *phyloseq*. Many are from published investigations and include documentation with a summary and references, as well as some example code representing some aspect of analysis available in *phyloseq*. In the package index, go to the names beginning with "data-" to see the documentation of currently available example datasets. +There are multiple example data sets included in *phyloseq*. Many are +from published investigations and include documentation with a summary +and references, as well as some example code representing some aspect of +analysis available in *phyloseq*. In the package index, go to the names +beginning with "data-" to see the documentation of currently available +example datasets. -To load example data into the working environment, use the `data()` command: +To load example data into the working environment, use the `data()` +command: ```{r, eval=FALSE} data(GlobalPatterns) @@ -297,24 +433,39 @@ data(enterotype) data(soilrep) ``` -Similarly, entering `?enterotype` will reveal the documentation for the so-called "enterotype" dataset. For details examples, see [the Example Data tutorial](http://joey711.github.io/phyloseq/Example-Data.html) +Similarly, entering `?enterotype` will reveal the documentation for the +so-called "enterotype" dataset. For details examples, see [the Example +Data tutorial](http://joey711.github.io/phyloseq/Example-Data.html) ## phyloseq Object Summaries -In small font, the following is the summary of the `GlobalPatterns` dataset that prints to the terminal. These summaries are consistent among all `phyloseq-class` objects. Although the components of `GlobalPatterns` have many thousands of elements, the command-line returns only a short summary of each component. This encourages you to check that an object is still what you expect, without needing to let thousands of elements scroll across the terminal. In the cases in which you do want to see more of a particular component, use an accessor function (see table below). - +In small font, the following is the summary of the `GlobalPatterns` +dataset that prints to the terminal. These summaries are consistent +among all `phyloseq-class` objects. Although the components of +`GlobalPatterns` have many thousands of elements, the command-line +returns only a short summary of each component. This encourages you to +check that an object is still what you expect, without needing to let +thousands of elements scroll across the terminal. In the cases in which +you do want to see more of a particular component, use an accessor +function (see table below). ```{r} data(GlobalPatterns) GlobalPatterns ``` - ## Convert raw data to phyloseq components -Suppose you have already imported raw data from an experiment into `R`, and their indices are labeled correctly. How do you get *phyloseq* to recognize these tables as the appropriate class of data? And further combine them together? Table [Table of Component Constructor Functions](#table:build) lists key functions for converting these core data formats into specific component data objects recognized by *phyloseq*. These will also +Suppose you have already imported raw data from an experiment into `R`, +and their indices are labeled correctly. How do you get *phyloseq* to +recognize these tables as the appropriate class of data? And further +combine them together? Table [Table of Component Constructor +Functions](#table:build) lists key functions for converting these core +data formats into specific component data objects recognized by +*phyloseq*. These will also - Table of component constructor functions for building component data objects +Table of component constructor functions for building component data +objects --- Function | Input Class | Output Description @@ -328,7 +479,8 @@ Suppose you have already imported raw data from an experiment into `R`, and thei `read.table` | table file path | A matrix or data.frame (Std `R` core function) --- - phyloseq constructors: functions for building/merging *phyloseq* objects. +phyloseq constructors: functions for building/merging *phyloseq* +objects. --- Function | Input Class | Output Description @@ -337,7 +489,8 @@ Function | Input Class | Output Description `merge_phyloseq`| Two or more component or phyloseq-class objects | Combined instance of phyloseq-class --- -The following example illustrates using the constructor methods for component data tables. +The following example illustrates using the constructor methods for +component data tables. ```{r, eval=FALSE} otu1 <- otu_table(raw_abundance_matrix, taxa_are_rows=FALSE) @@ -348,36 +501,57 @@ tre1 <- read_tree(my_tree_file) ## phyloseq() function: building complex phyloseq objects -Once you've converted the data tables to their appropriate class, combining them into one object requires only one additional function call, `phyloseq()`: +Once you've converted the data tables to their appropriate class, +combining them into one object requires only one additional function +call, `phyloseq()`: + ```{r, eval=FALSE} ex1b <- phyloseq(my_otu_table, my_sample_data, my_taxonomyTable, my_tree) ``` -You do not need to have all four data types in the example above in order to combine them into one validity-checked experiment-level phyloseq-class object. The `phyloseq()` method will detect which component data classes are present, and build accordingly. Downstream analysis methods will access the required components using *phyloseq*'s accessors, and throw an error if something is missing. For most downstream methods you will only need to supply the combined, phyloseq-class object (the output of `phyloseq()` ), usually as the first argument. +You do not need to have all four data types in the example above in +order to combine them into one validity-checked experiment-level +phyloseq-class object. The `phyloseq()` method will detect which +component data classes are present, and build accordingly. Downstream +analysis methods will access the required components using *phyloseq*'s +accessors, and throw an error if something is missing. For most +downstream methods you will only need to supply the combined, +phyloseq-class object (the output of `phyloseq()` ), usually as the +first argument. + ```{r, eval=FALSE} ex1c <- phyloseq(my_otu_table, my_sample_data) ``` -Whenever an instance of the phyloseq-class is created by *phyloseq* --- for example, when we use the `import_qiime()` function to import data, or combine manually imported tables using `phyloseq()` --- the row and column indices representing taxa or samples are internally checked/trimmed for compatibility, such that all component data describe exactly (and only) the same OTUs and samples. +Whenever an instance of the phyloseq-class is created by *phyloseq* --- +for example, when we use the `import_qiime()` function to import data, +or combine manually imported tables using `phyloseq()` --- the row and +column indices representing taxa or samples are internally +checked/trimmed for compatibility, such that all component data describe +exactly (and only) the same OTUs and samples. ## Merge -The phyloseq project includes support for two complete different categories of merging. +The phyloseq project includes support for two complete different +categories of merging. - - Merging the OTUs or samples in a phyloseq object, based upon a taxonomic or sample variable: `merge_samples()`, `merge_taxa()` - - Merging two or more data objects that come from the same experiment, so that their data becomes part of the same phyloseq object: `merge_phyloseq()` +- Merging the OTUs or samples in a phyloseq object, based upon a + taxonomic or sample variable: `merge_samples()`, `merge_taxa()` +- Merging two or more data objects that come from the same experiment, + so that their data becomes part of the same phyloseq object: + `merge_phyloseq()` For further details, see the reproducible online tutorial at: -http://joey711.github.com/phyloseq/merge - - + # Accessor functions -Once you have a phyloseq object available, many accessor functions are available to query aspects of the data set. The function name and its purpose are summarized in [the Accessor Functions Table](#table:access). +Once you have a phyloseq object available, many accessor functions are +available to query aspects of the data set. The function name and its +purpose are summarized in [the Accessor Functions Table](#table:access). - Accessor functions for *phyloseq* objects. +Accessor functions for *phyloseq* objects. @@ -405,14 +579,25 @@ Function | Returns `phy_tree` | Access the tree contained in a phyloseq object --- - - # Trimming, subsetting, filtering phyloseq data ## Trimming: prune_taxa() -Trimming high-throughput phylogenetic sequencing data can be useful, or even necessary, for certain types of analyses. However, it is important that the original data always be available for reference and reproducibility; and that the methods used for trimming be transparent to others, so they can perform the same trimming or filtering steps on the same or related data. To facilitate this, *phyloseq* contains many ways to trim/filter the data from a phylogenetic sequencing project. Because matching indices for taxa and samples is strictly enforced, subsetting one of the data components automatically subsets the corresponding indices from the others. Variables holding trimmed versions of your original data can be declared, and further trimmed, without losing track of the original data. -In general, most trimming should be accomplished using the S4 methods `prune_taxa()` or `prune_samples()`. +Trimming high-throughput phylogenetic sequencing data can be useful, or +even necessary, for certain types of analyses. However, it is important +that the original data always be available for reference and +reproducibility; and that the methods used for trimming be transparent +to others, so they can perform the same trimming or filtering steps on +the same or related data. To facilitate this, *phyloseq* contains many +ways to trim/filter the data from a phylogenetic sequencing project. +Because matching indices for taxa and samples is strictly enforced, +subsetting one of the data components automatically subsets the +corresponding indices from the others. Variables holding trimmed +versions of your original data can be declared, and further trimmed, +without losing track of the original data. + +In general, most trimming should be accomplished using the S4 methods +`prune_taxa()` or `prune_samples()`. ## Simple filtering example @@ -420,7 +605,9 @@ In general, most trimming should be accomplished using the S4 methods `prune_tax topN <- 20 ``` -For example, lets make a new object that only holds the most abundant `r topN` taxa in the experiment. To accomplish this, we will use the `prune_taxa()` function. +For example, lets make a new object that only holds the most abundant +`r topN` taxa in the experiment. To accomplish this, we will use the +`prune_taxa()` function. ```{r} data(GlobalPatterns) @@ -428,7 +615,9 @@ most_abundant_taxa <- sort(taxa_sums(GlobalPatterns), TRUE)[1:topN] ex2 <- prune_taxa(names(most_abundant_taxa), GlobalPatterns) ``` -Now we can ask the question, "what taxonomic Family are these OTUs?" (Subsetting still returns a `taxonomyTable` object, which is summarized. We will need to convert to a vector) +Now we can ask the question, "what taxonomic Family are these OTUs?" +(Subsetting still returns a `taxonomyTable` object, which is summarized. +We will need to convert to a vector) ```{r} topFamilies <- tax_table(ex2)[, "Family"] @@ -437,13 +626,30 @@ as(topFamilies, "vector") ## Arbitrarily complex abundance filtering -The previous example was a relatively simple filtering in which we kept only the most abundant `r topN` in the whole experiment. But what if we wanted to keep the most abundant `r topN` taxa of each sample? And of those, keep only the taxa that are also found in at least one-third of our samples? What if we wanted to keep only those taxa that met some across-sample criteria? +The previous example was a relatively simple filtering in which we kept +only the most abundant `r topN` in the whole experiment. But what if we +wanted to keep the most abundant `r topN` taxa of each sample? And of +those, keep only the taxa that are also found in at least one-third of +our samples? What if we wanted to keep only those taxa that met some +across-sample criteria? ### genefilter_sample(): Filter by Within-Sample Criteria -For this more complicated filtering *phyloseq* contains a function, `genefilter_sample`, that takes as an argument a *phyloseq* object, as well as a list of one or more filtering functions that will be applied to each sample in the abundance matrix (`otu_table`), as well as an integer argument, `A`, that specifies for how many samples the filtering function must return `TRUE` for a particular taxa to avoid removal from the object. A supporting function `filterfun_sample` is also included in *phyloseq* to facilitate creating a properly formatted function (enclosure) if more than one function is going to be applied simultaneously. `genefilter_sample` returns a logical vector suitable for sending directly to `prune_taxa` for the actual trimming. +For this more complicated filtering *phyloseq* contains a function, +`genefilter_sample`, that takes as an argument a *phyloseq* object, as +well as a list of one or more filtering functions that will be applied +to each sample in the abundance matrix (`otu_table`), as well as an +integer argument, `A`, that specifies for how many samples the filtering +function must return `TRUE` for a particular taxa to avoid removal from +the object. A supporting function `filterfun_sample` is also included in +*phyloseq* to facilitate creating a properly formatted function +(enclosure) if more than one function is going to be applied +simultaneously. `genefilter_sample` returns a logical vector suitable +for sending directly to `prune_taxa` for the actual trimming. + +Here is an example on a completely fabricated `otu_table` called +`testOTU`. -Here is an example on a completely fabricated `otu_table` called `testOTU`. ```{r, eval=FALSE} testOTU <- otu_table(matrix(sample(1:50, 25, replace=TRUE), 5, 5), taxa_are_rows=FALSE) f1<- filterfun_sample(topk(2)) @@ -453,7 +659,12 @@ prune_taxa(wh1, testOTU) prune_taxa(wh2, testOTU) ``` -Here is a second example using the included dataset, `GlobalPatterns`. The most abundant taxa are kept only if they are in the most abundant 10\% of taxa in at least half of the samples in dataset `GlobalPatterns`. Note that it is not necessary to subset `GlobalPatterns` in order to do this filtering. The S4 method `prune_taxa` subsets each of the relavent component objects, and returns the complex object back. +Here is a second example using the included dataset, `GlobalPatterns`. +The most abundant taxa are kept only if they are in the most abundant +10% of taxa in at least half of the samples in dataset `GlobalPatterns`. +Note that it is not necessary to subset `GlobalPatterns` in order to do +this filtering. The S4 method `prune_taxa` subsets each of the relavent +component objects, and returns the complex object back. ```{r} data(GlobalPatterns) @@ -467,7 +678,13 @@ ex2 <- prune_taxa(wh1, GlobalPatterns) print(ex2) ``` -If instead of the most abundant fraction of taxa, you are interested in the most abundant fraction of individuals (aka sequences, observations), then the `topf` function is appropriate. For steep rank-abundance curves, `topf` will seem to be much more conservative (trim more taxa) because it is based on the cumulative sum of relative abundance. It does not guarantee that a certain number or fraction of total taxa (richness) will be retained. +If instead of the most abundant fraction of taxa, you are interested in +the most abundant fraction of individuals (aka sequences, observations), +then the `topf` function is appropriate. For steep rank-abundance +curves, `topf` will seem to be much more conservative (trim more taxa) +because it is based on the cumulative sum of relative abundance. It does +not guarantee that a certain number or fraction of total taxa (richness) +will be retained. ```{r, eval=FALSE} data(GlobalPatterns) @@ -479,14 +696,24 @@ prune_taxa(wh1, GlobalPatterns) ### filter_taxa(): Filter by Across-Sample Criteria -The `filter_taxa` function is directly analogous to the `genefilter` function for microarray filtering, but is used for filtering OTUs from phyloseq objects. It applies an arbitrary set of functions -- as a function list, for instance, created by `genefilter::filterfun` -- as across-sample criteria, one OTU at a time. It can be thought of as an extension of the genefilter-package (from the Bioconductor repository) for phyloseq objects. It takes as input a phyloseq object, and returns a logical vector indicating whether or not each OTU passed the criteria. Alternatively, if the `prune` option is set to `r FALSE`, it returns the already-trimmed version of the phyloseq object. +The `filter_taxa` function is directly analogous to the `genefilter` +function for microarray filtering, but is used for filtering OTUs from +phyloseq objects. It applies an arbitrary set of functions -- as a +function list, for instance, created by `genefilter::filterfun` -- as +across-sample criteria, one OTU at a time. It can be thought of as an +extension of the genefilter-package (from the Bioconductor repository) +for phyloseq objects. It takes as input a phyloseq object, and returns a +logical vector indicating whether or not each OTU passed the criteria. +Alternatively, if the `prune` option is set to `r FALSE`, it returns the +already-trimmed version of the phyloseq object. -Inspect the following example. Note that the functions `genefilter` and `kOverA` are from the genefilter package. +Inspect the following example. Note that the functions `genefilter` and +`kOverA` are from the genefilter package. ```{r} data("enterotype") library("genefilter") -flist<- filterfun(kOverA(5, 2e-05)) +flist <- filterfun(kOverA(5, 2e-05)) ent.logi <- filter_taxa(enterotype, flist) ent.trim <- filter_taxa(enterotype, flist, TRUE) identical(ent.trim, prune_taxa(ent.logi, enterotype)) @@ -496,14 +723,22 @@ filter_taxa(enterotype, flist, TRUE) ## subset_samples(): Subset by Sample Variables -It is possible to subset the samples in a *phyloseq* object based on the sample variables using the `subset_samples()` function. For example to subset `GlobalPatterns` such that only certain environments are retained, the following line is needed (the related tables are subsetted automatically as well): +It is possible to subset the samples in a *phyloseq* object based on the +sample variables using the `subset_samples()` function. For example to +subset `GlobalPatterns` such that only certain environments are +retained, the following line is needed (the related tables are subsetted +automatically as well): ```{r} ex3 <- subset_samples(GlobalPatterns, SampleType%in%c("Freshwater", "Ocean", "Freshwater (creek)")) ex3 ``` -For this example only a categorical variable is shown, but in principle a continuous variable could be specified and a logical expression provided just as for the `subset` function. In fact, because `sample_data` component objects are an extension of the data.frame class, they can also be subsetted with the `subset` function: +For this example only a categorical variable is shown, but in principle +a continuous variable could be specified and a logical expression +provided just as for the `subset` function. In fact, because +`sample_data` component objects are an extension of the data.frame +class, they can also be subsetted with the `subset` function: ```{r} subset(sample_data(GlobalPatterns), SampleType%in%c("Freshwater", "Ocean", "Freshwater (creek)")) @@ -511,7 +746,10 @@ subset(sample_data(GlobalPatterns), SampleType%in%c("Freshwater", "Ocean", "Fres ## subset_taxa(): subset by taxonomic categories -It is possible to subset by specific taxonomic category using the `subset_taxa()` function. For example, if we wanted to subset `GlobalPatterns` so that it only contains data regarding the phylum *Firmicutes*: +It is possible to subset by specific taxonomic category using the +`subset_taxa()` function. For example, if we wanted to subset +`GlobalPatterns` so that it only contains data regarding the phylum +*Firmicutes*: ```{r} ex4 <- subset_taxa(GlobalPatterns, Phylum=="Firmicutes") @@ -520,62 +758,107 @@ ex4 ## random subsample abundance data -Can also randomly subset, for example a random subset of 100 taxa from the full dataset. +Can also randomly subset, for example a random subset of 100 taxa from +the full dataset. ```{r} randomSpecies100 <- sample(taxa_names(GlobalPatterns), 100, replace=FALSE) ex5 <- prune_taxa(randomSpecies100, GlobalPatterns) ``` - # Transform abundance data -Sample-wise transformation can be achieved with the `transform_sample_counts()` function. It requires two arguments, (1) the *phyloseq* object that you want to transform, and the function that you want to use to perform the transformation. Any arbitrary function can be provided as the second argument, as long as it returns a numeric vector with the same length as its input. In the following trivial example, we create a second object, `ex2`, that has been "transformed" by the identity function such that it is actually identical to `GlobalPatterns`. +Sample-wise transformation can be achieved with the +`transform_sample_counts()` function. It requires two arguments, (1) the +*phyloseq* object that you want to transform, and the function that you +want to use to perform the transformation. Any arbitrary function can be +provided as the second argument, as long as it returns a numeric vector +with the same length as its input. In the following trivial example, we +create a second object, `ex2`, that has been "transformed" by the +identity function such that it is actually identical to +`GlobalPatterns`. ```{r, eval=FALSE} data(GlobalPatterns) ex2 <- transform_sample_counts(GlobalPatterns, I) ``` -For certain kinds of analyis we may want to transform the abundance data. For example, for RDA we want to transform abundance counts to within-sample ranks, and to further include a threshold beyond which all taxa receive the same rank value. The ranking for each sample is performed independently, so that the rank of a particular taxa within a particular sample is not influenced by that sample's total quantity of sequencing relative to the other samples in the project. +For certain kinds of analyis we may want to transform the abundance +data. For example, for RDA we want to transform abundance counts to +within-sample ranks, and to further include a threshold beyond which all +taxa receive the same rank value. The ranking for each sample is +performed independently, so that the rank of a particular taxa within a +particular sample is not influenced by that sample's total quantity of +sequencing relative to the other samples in the project. -The following example shows how to perform such a thresholded-rank transformation of the abundance table in the complex *phyloseq* object `GlobalPatterns` with an arbitrary threshold of 500. +The following example shows how to perform such a thresholded-rank +transformation of the abundance table in the complex *phyloseq* object +`GlobalPatterns` with an arbitrary threshold of 500. ```{r} -ex4<- transform_sample_counts(GlobalPatterns, threshrankfun(500)) +ex4 <- transform_sample_counts(GlobalPatterns, threshrankfun(500)) ``` - # Phylogenetic smoothing -## tax_glom() +## tax_glom() -Suppose we are skeptical about the importance of OTU-level distinctions in our dataset. For this scenario, *phyloseq* includes a taxonomic-agglommeration method,`tax_glom()`, which merges taxa of the same taxonomic category for a user-specified taxonomic level. In the following code, we merge all taxa of the same Genus, and store that new object as `ex6`. +Suppose we are skeptical about the importance of OTU-level distinctions +in our dataset. For this scenario, *phyloseq* includes a +taxonomic-agglommeration method,`tax_glom()`, which merges taxa of the +same taxonomic category for a user-specified taxonomic level. In the +following code, we merge all taxa of the same Genus, and store that new +object as `ex6`. ```{r, eval=FALSE} -ex6 <- tax_glom(GlobalPatterns, taxlevel="Genus") +ex6 <- tax_glom(GlobalPatterns, taxrank = "Genus") ``` -## tip_glom() - -Similarly, our original example object (`GlobalPatterns`) also contains a phlyogenetic tree corresponding to each OTU, which we could also use as a means to merge taxa in our dataset that are closely related. In this case, we specify a threshold patristic distance. Taxa more closely related than this threshold are merged. This is especially useful when a dataset has many taxa that lack a taxonomic assignment at the level you want to investigate, a problem when using `tax_glom()`. Note that for datasets with a large number of taxa, `tax_glom` will be noticeably faster than `tip_glom`. Also, keep in mind that `tip_glom` requires that its first argument be an object that contains a tree, while `tax_glom` instead requires a `taxonomyTable` (See [phyloseq classes](#sec:app-classes)). +## tip_glom() + +Similarly, our original example object (`GlobalPatterns`) also contains +a phlyogenetic tree corresponding to each OTU, which we could also use +as a means to merge taxa in our dataset that are closely related. In +this case, we specify a threshold patristic distance. Taxa more closely +related than this threshold are merged. This is especially useful when a +dataset has many taxa that lack a taxonomic assignment at the level you +want to investigate, a problem when using `tax_glom()`. Note that for +datasets with a large number of taxa, `tax_glom` will be noticeably +faster than `tip_glom`. Also, keep in mind that `tip_glom` requires that +its first argument be an object that contains a tree, while `tax_glom` +instead requires a `taxonomyTable` (See [phyloseq +classes](#sec:app-classes)). ```{r, eval=FALSE} ex7 <- tip_glom(GlobalPatterns, speciationMinLength = 0.05) ``` -Command output not provided here to save time during compilation of the vignette. The user is encouraged to try this out on your dataset, or even this example, if interested. It may take a while to run on the full, untrimmed data. - +Command output not provided here to save time during compilation of the +vignette. The user is encouraged to try this out on your dataset, or +even this example, if interested. It may take a while to run on the +full, untrimmed data. # Installation ## Installation -Please check [the phyloseq installation tutorial](http://joey711.github.com/phyloseq/install) for help with installation. This is likely to be the first place news and updated information about installation will be posted, as well. Also check out the rest of [the phyloseq homepage on GitHub](http://joey711.github.io/phyloseq/), as this is the best place to post issues, bug reports, feature requests, contribute code, etc. +Please check [the phyloseq installation +tutorial](http://joey711.github.com/phyloseq/install) for help with +installation. This is likely to be the first place news and updated +information about installation will be posted, as well. Also check out +the rest of [the phyloseq homepage on +GitHub](http://joey711.github.io/phyloseq/), as this is the best place +to post issues, bug reports, feature requests, contribute code, etc. ## Installing Parallel Backend -For running parallel implementation of functions/methods in *phyloseq* (e.g. `UniFrac(GlobalPatterns, parallel=TRUE)`), you will need also to install a function for registering a parallel "backend". Only one working parallel backend is needed, but there are several options, and the best one will depend on the details of your particular system. The "doParallel" package is a good place to start. Any one of the following lines from an `R` session will install a backend package. +For running parallel implementation of functions/methods in *phyloseq* +(e.g. `UniFrac(GlobalPatterns, parallel=TRUE)`), you will need also to +install a function for registering a parallel "backend". Only one +working parallel backend is needed, but there are several options, and +the best one will depend on the details of your particular system. The +"doParallel" package is a good place to start. Any one of the following +lines from an `R` session will install a backend package. ```{r, eval=FALSE} install.packages("doParallel") @@ -584,17 +867,25 @@ install.packages("doSNOW") install.packages("doMPI") ``` - # References - -Robert C Gentleman, Vincent J. Carey, Douglas M. Bates, et al. **Bioconductor: Open software development for computational biology and bioinformatics.** *Genome Biology* 5:R80, 2004. - - -J Gregory Caporaso, Justin Kuczynski, Jesse Stombaugh, Kyle Bittinger, Frederic D Bushman **QIIME allows analysis of high-throughput community sequencing data.** *Nature Methods* 7(5):335-336, 2010. - - -P D Schloss, S L Westcott, T Ryabin, J R Hall, M Hartmann, et al. **Introducing mothur: Open-Source, Platform-Independent, Community-Supported Software for Describing and Comparing Microbial Communities.** *Applied and Environmental Microbiology* 75(23):7537-7541, 2009. - - -J R Cole, Q Wang, E Cardenas, J Fish, B Chai et al. **The Ribosomal Database Project: improved alignments and new tools for rRNA analysis.** *Nucleic Acids Research* 37(Database issue):D141-5, 2009. + Robert C Gentleman, Vincent J. Carey, +Douglas M. Bates, et al. **Bioconductor: Open software development for +computational biology and bioinformatics.** *Genome Biology* 5:R80, +2004. + + J Gregory Caporaso, Justin Kuczynski, Jesse +Stombaugh, Kyle Bittinger, Frederic D Bushman **QIIME allows analysis of +high-throughput community sequencing data.** *Nature Methods* +7(5):335-336, 2010. + + P D Schloss, S L Westcott, T Ryabin, J +R Hall, M Hartmann, et al. **Introducing mothur: Open-Source, +Platform-Independent, Community-Supported Software for Describing and +Comparing Microbial Communities.** *Applied and Environmental +Microbiology* 75(23):7537-7541, 2009. + + J R Cole, Q Wang, E Cardenas, J Fish, B Chai et +al. **The Ribosomal Database Project: improved alignments and new tools +for rRNA analysis.** *Nucleic Acids Research* 37(Database issue):D141-5, +2009. From eeb0edd94acdc960ba38b33570b9bb5ddb29e2bf Mon Sep 17 00:00:00 2001 From: Salix <31168746+salix-d@users.noreply.github.com> Date: Sun, 23 Oct 2022 14:43:30 -0400 Subject: [PATCH 5/9] created while building pkg --- vignettes/phyloseq-FAQ.R | 31 ++++ vignettes/phyloseq-analysis.R | 218 ++++++++++++++++++++++++++++ vignettes/phyloseq-basics.R | 112 ++++++++++++++ vignettes/phyloseq-mixture-models.R | 74 ++++++++++ 4 files changed, 435 insertions(+) create mode 100644 vignettes/phyloseq-FAQ.R create mode 100644 vignettes/phyloseq-analysis.R create mode 100644 vignettes/phyloseq-basics.R create mode 100644 vignettes/phyloseq-mixture-models.R diff --git a/vignettes/phyloseq-FAQ.R b/vignettes/phyloseq-FAQ.R new file mode 100644 index 00000000..87524771 --- /dev/null +++ b/vignettes/phyloseq-FAQ.R @@ -0,0 +1,31 @@ +## ---- warning=FALSE, message=FALSE-------------------------------------------- +library("phyloseq"); packageVersion("phyloseq") +library("ggplot2"); packageVersion("ggplot2") +theme_set(theme_bw()) + +## ----------------------------------------------------------------------------- +data(esophagus) +plot_tree(esophagus) + +## ----------------------------------------------------------------------------- +p1 = plot_tree(esophagus, color = "Sample") +p1 +p1 + + ggtitle("This is my title.") + + annotate("text", 0.25, 3, + color = "orange", + label = "my annotation") + +## ----------------------------------------------------------------------------- +data("esophagus") +mdf = psmelt(esophagus) +# Simple bar plot. See plot_bar() for more. +ggplot(mdf, aes(x = Sample, + y = Abundance)) + + geom_bar(stat = "identity", position = "stack", color = "black") +# Simple heat map. See plot_heatmap() for more. +ggplot(mdf, aes(x = Sample, + y = OTU, + fill = Abundance)) + + geom_raster() + diff --git a/vignettes/phyloseq-analysis.R b/vignettes/phyloseq-analysis.R new file mode 100644 index 00000000..6e8ced84 --- /dev/null +++ b/vignettes/phyloseq-analysis.R @@ -0,0 +1,218 @@ +## ----dontrun-basics-vignette, eval=FALSE-------------------------------------- +# vignette("phyloseq-basics") + +## ----load-packages, message=FALSE, warning=FALSE------------------------------ +library("phyloseq") +library("ggplot2") + +## ----ggplot2-themes----------------------------------------------------------- +theme_set(theme_bw()) + +## ----------------------------------------------------------------------------- +data(GlobalPatterns) + +## ----------------------------------------------------------------------------- +# prune OTUs that are not present in at least one sample +GP <- prune_taxa(taxa_sums(GlobalPatterns) > 0, GlobalPatterns) +# Define a human-associated versus non-human categorical variable: +human <- get_variable(GP, "SampleType") %in% c("Feces", "Mock", "Skin", "Tongue") +# Add new human variable to sample data: +sample_data(GP)$human <- factor(human) + +## ----richness_estimates0, fig.width=13, fig.height=7-------------------------- +alpha_meas = c("Observed", "Chao1", "ACE", "Shannon", "Simpson", "InvSimpson") +(p <- plot_richness(GP, "human", "SampleType", measures=alpha_meas)) + +## ----richness_estimates, fig.width=13,height=7-------------------------------- +p + geom_boxplot(data=p$data, aes(x=human, y=value, color=NULL), alpha=0.1) + +## ----------------------------------------------------------------------------- +GP.chl <- subset_taxa(GP, Phylum=="Chlamydiae") + +## ----GP-chl-tree, fig.width=15, fig.height=7, message=FALSE, warning=FALSE---- +plot_tree(GP.chl, color="SampleType", shape="Family", label.tips="Genus", size="Abundance") + +## ----------------------------------------------------------------------------- +data(enterotype) + +## ----EntAbundPlot, fig.height=6, fig.width=8---------------------------------- +par(mar = c(10, 4, 4, 2) + 0.1) # make more room on bottom margin +N <- 30 +barplot(sort(taxa_sums(enterotype), TRUE)[1:N]/nsamples(enterotype), las=2) + +## ----------------------------------------------------------------------------- +rank_names(enterotype) + +## ----------------------------------------------------------------------------- +TopNOTUs <- names(sort(taxa_sums(enterotype), TRUE)[1:10]) +ent10 <- prune_taxa(TopNOTUs, enterotype) +print(ent10) + +## ----------------------------------------------------------------------------- +sample_variables(ent10) + +## ----entbarplot0, fig.height=6, fig.width=10---------------------------------- +plot_bar(ent10, "SeqTech", fill="Enterotype", facet_grid=~Genus) + +## ----GPheatmap---------------------------------------------------------------- +data("GlobalPatterns") +gpac <- subset_taxa(GlobalPatterns, Phylum=="Crenarchaeota") +(p <- plot_heatmap(gpac, "NMDS", "bray", "SampleType", "Family")) + +## ----GPheatmap-rename-axes---------------------------------------------------- +p$scales$scales[[1]]$name <- "My X-Axis" +p$scales$scales[[2]]$name <- "My Y-Axis" +print(p) + +## ----plot_sample_network, fig.width=11, fig.height=7, message=FALSE, warning=FALSE---- +data(enterotype) +plot_net(enterotype, maxdist=0.4, color="SeqTech", shape="Enterotype") + +## ----eval=FALSE--------------------------------------------------------------- +# my.physeq <- import("Biom", BIOMfilename="myBiomFile.biom") +# my.ord <- ordinate(my.physeq) +# plot_ordination(my.physeq, my.ord, color="myFavoriteVarible") + +## ----help-import, eval=FALSE-------------------------------------------------- +# help(import) +# help(ordinate) +# help(distance) +# help(plot_ordination) + +## ----GP-data-load------------------------------------------------------------- +data(GlobalPatterns) + +## ---- eval=FALSE-------------------------------------------------------------- +# GPUF <- UniFrac(GlobalPatterns) + +## ----load-precomputed-UF------------------------------------------------------ +load(system.file("doc", "Unweighted_UniFrac.RData", package="phyloseq")) + +## ----------------------------------------------------------------------------- +GloPa.pcoa = ordinate(GlobalPatterns, method="PCoA", distance=GPUF) + +## ----PCoAScree, fig.width=6, fig.height=4------------------------------------- +plot_scree(GloPa.pcoa, "Scree plot for Global Patterns, UniFrac/PCoA") + +## ----GPfig5ax1213------------------------------------------------------------- +(p12 <- plot_ordination(GlobalPatterns, GloPa.pcoa, "samples", color="SampleType") + + geom_point(size=5) + geom_path() + scale_colour_hue(guide = "none") ) +(p13 <- plot_ordination(GlobalPatterns, GloPa.pcoa, "samples", axes=c(1, 3), + color="SampleType") + geom_line() + geom_point(size=5) ) + +## ----GP_UF_NMDS0-------------------------------------------------------------- +# (Re)load UniFrac distance matrix and GlobalPatterns data +data(GlobalPatterns) +load(system.file("doc", "Unweighted_UniFrac.RData", package="phyloseq")) +# perform NMDS, set to 2 axes +GP.NMDS <- ordinate(GlobalPatterns, "NMDS", GPUF) +(p <- plot_ordination(GlobalPatterns, GP.NMDS, "samples", color="SampleType") + + geom_line() + geom_point(size=5) ) + +## ----GPCAscree0, fig=FALSE---------------------------------------------------- +data(GlobalPatterns) +# Take a subset of the GP dataset, top 200 species +topsp <- names(sort(taxa_sums(GlobalPatterns), TRUE)[1:200]) +GP <- prune_taxa(topsp, GlobalPatterns) +# Subset further to top 5 phyla, among the top 200 OTUs. +top5ph <- sort(tapply(taxa_sums(GP), tax_table(GP)[, "Phylum"], sum), decreasing=TRUE)[1:5] +GP <- subset_taxa(GP, Phylum %in% names(top5ph)) +# Re-add human variable to sample data: +sample_data(GP)$human <- factor(human) + +## ----GPCAscree, fig.width=8, fig.height=5------------------------------------- +# Now perform a unconstrained correspondence analysis +gpca <- ordinate(GP, "CCA") +# Scree plot +plot_scree(gpca, "Scree Plot for Global Patterns Correspondence Analysis") + +## ----GPCA1234----------------------------------------------------------------- +(p12 <- plot_ordination(GP, gpca, "samples", color="SampleType") + + geom_line() + geom_point(size=5) ) +(p34 <- plot_ordination(GP, gpca, "samples", axes=c(3, 4), color="SampleType") + + geom_line() + geom_point(size=5) ) + +## ----GPCAspecplot0------------------------------------------------------------ +p1 <- plot_ordination(GP, gpca, "species", color="Phylum") +(p1 <- ggplot(p1$data, p1$mapping) + geom_point(size=5, alpha=0.5) + + facet_wrap(~Phylum) + scale_colour_hue(guide = "none") ) + +## ----GPCAspecplotTopo0-------------------------------------------------------- +(p3 <- ggplot(p1$data, p1$mapping) + geom_density2d() + + facet_wrap(~Phylum) + scale_colour_hue(guide = "none") ) + +## ----GPCAjitter0-------------------------------------------------------------- +library("reshape2") +# Melt the species-data.frame, DF, to facet each CA axis separately +mdf <- melt(p1$data[, c("CA1", "CA2", "Phylum", "Family", "Genus")], + id=c("Phylum", "Family", "Genus") ) +# Select some special outliers for labelling +LF <- subset(mdf, variable=="CA2" & value < -1.0) +# build plot: boxplot summaries of each CA-axis, with labels +p <- ggplot(mdf, aes(Phylum, value, color=Phylum)) + + geom_boxplot() + + facet_wrap(~variable, 2) + + scale_colour_hue(guide = "none") + + theme_bw() + + theme( axis.text.x = element_text(angle = -90, vjust = 0.5) ) +# Add the text label layer, and render ggplot graphic +(p <- p + geom_text(data=subset(LF, !is.na(Family)), + mapping = aes(Phylum, value+0.1, color=Phylum, label=Family), + vjust=0, + size=2)) + +## ----GPtaxaplot0-------------------------------------------------------------- +plot_bar(GP, x="human", fill="SampleType", facet_grid= ~ Phylum) + +## ----GPdpcoa01---------------------------------------------------------------- +# Perform ordination +GP.dpcoa <- ordinate(GP, "DPCoA") +# Generate default ordination bi-plot +pdpcoa <- + plot_ordination( + physeq = GP, + ordination = GP.dpcoa, + type="biplot", + color="SampleType", + shape="Phylum") +# Adjust the shape scale manually +# to make taxa hollow and samples filled (advanced) +shape.fac <- pdpcoa$data$Phylum +man.shapes <- c(19, 21:25) +names(man.shapes) <- c("Samples", levels(shape.fac)[levels(shape.fac)!="Samples"]) +p2dpcoa <- pdpcoa + scale_shape_manual(values=man.shapes) +p2dpcoa + +## ----GPdpcoa02---------------------------------------------------------------- +# Show just Samples or just Taxa +plot_ordination(GP, GP.dpcoa, type="taxa", shape="Phylum") +plot_ordination(GP, GP.dpcoa, type="samples", color="SampleType") +# Split +plot_ordination(GP, GP.dpcoa, type="split", + color="SampleType", shape="Phylum") + + ggplot2::scale_colour_discrete() + +## ----distancefun-------------------------------------------------------------- +data(esophagus) +distance(esophagus, "bray") +distance(esophagus, "wunifrac") # weighted UniFrac +distance(esophagus, "jaccard") # vegdist jaccard +distance(esophagus, "g") # betadiver method option "g" + +## ----eval=FALSE, echo=TRUE---------------------------------------------------- +# data(esophagus) +# distance(esophagus, "wUniFrac") +# distance(esophagus, "uUniFrac") + +## ----------------------------------------------------------------------------- +# (Re)load UniFrac distance matrix and GlobalPatterns data +data(GlobalPatterns) +load(system.file("doc", "Unweighted_UniFrac.RData", package="phyloseq")) +# Manually define color-shading vector based on sample type. +colorScale <- rainbow(length(levels(get_variable(GlobalPatterns, "SampleType")))) +cols <- colorScale[get_variable(GlobalPatterns, "SampleType")] +GP.tip.labels <- as(get_variable(GlobalPatterns, "SampleType"), "character") +# This is the actual hierarchical clustering call, specifying average-link clustering +GP.hclust <- hclust(GPUF, method="average") +plot(GP.hclust, col=cols) + diff --git a/vignettes/phyloseq-basics.R b/vignettes/phyloseq-basics.R new file mode 100644 index 00000000..53966a37 --- /dev/null +++ b/vignettes/phyloseq-basics.R @@ -0,0 +1,112 @@ +## ---- eval=FALSE-------------------------------------------------------------- +# vignette("phyloseq_analysis") + +## ----load-packages, message=FALSE, warning=FALSE------------------------------ +library("phyloseq") + +## ---- eval=FALSE-------------------------------------------------------------- +# myOTU1 <- import_RDP_cluster("path/to/my/filename.clust") + +## ---- eval=FALSE-------------------------------------------------------------- +# data(GlobalPatterns) +# data(esophagus) +# data(enterotype) +# data(soilrep) + +## ----------------------------------------------------------------------------- +data(GlobalPatterns) +GlobalPatterns + +## ---- eval=FALSE-------------------------------------------------------------- +# otu1 <- otu_table(raw_abundance_matrix, taxa_are_rows=FALSE) +# sam1 <- sample_data(raw_sample_data.frame) +# tax1 <- tax_table(raw_taxonomy_matrix) +# tre1 <- read_tree(my_tree_file) + +## ---- eval=FALSE-------------------------------------------------------------- +# ex1b <- phyloseq(my_otu_table, my_sample_data, my_taxonomyTable, my_tree) + +## ---- eval=FALSE-------------------------------------------------------------- +# ex1c <- phyloseq(my_otu_table, my_sample_data) + +## ----echo=FALSE--------------------------------------------------------------- +topN <- 20 + +## ----------------------------------------------------------------------------- +data(GlobalPatterns) +most_abundant_taxa <- sort(taxa_sums(GlobalPatterns), TRUE)[1:topN] +ex2 <- prune_taxa(names(most_abundant_taxa), GlobalPatterns) + +## ----------------------------------------------------------------------------- +topFamilies <- tax_table(ex2)[, "Family"] +as(topFamilies, "vector") + +## ---- eval=FALSE-------------------------------------------------------------- +# testOTU <- otu_table(matrix(sample(1:50, 25, replace=TRUE), 5, 5), taxa_are_rows=FALSE) +# f1<- filterfun_sample(topk(2)) +# wh1 <- genefilter_sample(testOTU, f1, A=2) +# wh2 <- c(T, T, T, F, F) +# prune_taxa(wh1, testOTU) +# prune_taxa(wh2, testOTU) + +## ----------------------------------------------------------------------------- +data(GlobalPatterns) +f1<- filterfun_sample(topp(0.1)) +wh1 <- genefilter_sample(GlobalPatterns, f1, A=(1/2*nsamples(GlobalPatterns))) +sum(wh1) +ex2 <- prune_taxa(wh1, GlobalPatterns) + +## ----------------------------------------------------------------------------- +print(ex2) + +## ---- eval=FALSE-------------------------------------------------------------- +# data(GlobalPatterns) +# f1<- filterfun_sample(topf(0.9)) +# wh1 <- genefilter_sample(GlobalPatterns, f1, A=(1/3*nsamples(GlobalPatterns))) +# sum(wh1) +# prune_taxa(wh1, GlobalPatterns) + +## ----------------------------------------------------------------------------- +data("enterotype") +library("genefilter") +flist <- filterfun(kOverA(5, 2e-05)) +ent.logi <- filter_taxa(enterotype, flist) +ent.trim <- filter_taxa(enterotype, flist, TRUE) +identical(ent.trim, prune_taxa(ent.logi, enterotype)) +identical(sum(ent.logi), ntaxa(ent.trim)) +filter_taxa(enterotype, flist, TRUE) + +## ----------------------------------------------------------------------------- +ex3 <- subset_samples(GlobalPatterns, SampleType%in%c("Freshwater", "Ocean", "Freshwater (creek)")) +ex3 + +## ----------------------------------------------------------------------------- +subset(sample_data(GlobalPatterns), SampleType%in%c("Freshwater", "Ocean", "Freshwater (creek)")) + +## ----------------------------------------------------------------------------- +ex4 <- subset_taxa(GlobalPatterns, Phylum=="Firmicutes") +ex4 + +## ----------------------------------------------------------------------------- +randomSpecies100 <- sample(taxa_names(GlobalPatterns), 100, replace=FALSE) +ex5 <- prune_taxa(randomSpecies100, GlobalPatterns) + +## ---- eval=FALSE-------------------------------------------------------------- +# data(GlobalPatterns) +# ex2 <- transform_sample_counts(GlobalPatterns, I) + +## ----------------------------------------------------------------------------- +ex4 <- transform_sample_counts(GlobalPatterns, threshrankfun(500)) + +## ---- eval=FALSE-------------------------------------------------------------- +# ex6 <- tax_glom(GlobalPatterns, taxrank = "Genus") + +## ---- eval=FALSE-------------------------------------------------------------- +# ex7 <- tip_glom(GlobalPatterns, speciationMinLength = 0.05) + +## ---- eval=FALSE-------------------------------------------------------------- +# install.packages("doParallel") +# install.packages("doMC") +# install.packages("doSNOW") +# install.packages("doMPI") + diff --git a/vignettes/phyloseq-mixture-models.R b/vignettes/phyloseq-mixture-models.R new file mode 100644 index 00000000..8b01888d --- /dev/null +++ b/vignettes/phyloseq-mixture-models.R @@ -0,0 +1,74 @@ +## ----load-phyloseq, message=FALSE, warning=FALSE------------------------------ +library("phyloseq"); packageVersion("phyloseq") + +## ----filepath----------------------------------------------------------------- +filepath = system.file("extdata", "study_1457_split_library_seqs_and_mapping.zip", package="phyloseq") +kostic = microbio_me_qiime(filepath) + +## ----example-path-local, eval=FALSE------------------------------------------- +# filepath = "~/Downloads/study_1457_split_library_seqs_and_mapping.zip" +# kostic = microbio_me_qiime(filepath) + +## ----example-path-remote, eval=FALSE------------------------------------------ +# kostic = microbio_me_qiime(1457) + +## ----show-variables----------------------------------------------------------- +kostic +head(sample_data(kostic)$DIAGNOSIS, 10) + +## ----deseq2, message=FALSE, warning=FALSE------------------------------------- +library("DESeq2"); packageVersion("DESeq2") + +## ----rm-bad-samples----------------------------------------------------------- +kostic <- subset_samples(kostic, DIAGNOSIS != "None") +kostic <- prune_samples(sample_sums(kostic) > 500, kostic) +kostic + +## ----run-deseq2--------------------------------------------------------------- +diagdds = phyloseq_to_deseq2(kostic, ~ DIAGNOSIS) +# calculate geometric means prior to estimate size factors +gm_mean = function(x, na.rm=TRUE){ + exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x)) +} +geoMeans = apply(counts(diagdds), 1, gm_mean) +diagdds = estimateSizeFactors(diagdds, geoMeans = geoMeans) +diagdds = DESeq(diagdds, fitType="local") + +## ----grab-results-process-table----------------------------------------------- +res = results(diagdds) +res = res[order(res$padj, na.last=NA), ] +alpha = 0.01 +sigtab = res[(res$padj < alpha), ] +sigtab = cbind(as(sigtab, "data.frame"), as(tax_table(kostic)[rownames(sigtab), ], "matrix")) +head(sigtab) + +## ----table-prelim------------------------------------------------------------- +posigtab = sigtab[sigtab[, "log2FoldChange"] > 0, ] +posigtab = posigtab[, c("baseMean", "log2FoldChange", "lfcSE", "padj", "Phylum", "Class", "Family", "Genus")] + +## ----make-markdown-table, echo=FALSE, results='asis'-------------------------- +# Make a markdown table +posigtab = data.frame(OTU=rownames(posigtab), posigtab) +cat(paste(colnames(posigtab), collapse=" | "), fill=TRUE) +cat(paste(rep("---", times=ncol(posigtab)), collapse=" | "), fill=TRUE) +dummy = apply(posigtab, 1, function(x){ + cat(paste(x, collapse=" | "), fill=TRUE) +}) + +## ----bar-plot----------------------------------------------------------------- +library("ggplot2") +theme_set(theme_bw()) +sigtabgen = subset(sigtab, !is.na(Genus)) +# Phylum order +x = tapply(sigtabgen$log2FoldChange, sigtabgen$Phylum, function(x) max(x)) +x = sort(x, TRUE) +sigtabgen$Phylum = factor(as.character(sigtabgen$Phylum), levels=names(x)) +# Genus order +x = tapply(sigtabgen$log2FoldChange, sigtabgen$Genus, function(x) max(x)) +x = sort(x, TRUE) +sigtabgen$Genus = factor(as.character(sigtabgen$Genus), levels=names(x)) +ggplot(sigtabgen, aes(y=Genus, x=log2FoldChange, color=Phylum)) + + geom_vline(xintercept = 0.0, color = "gray", size = 0.5) + + geom_point(size=6) + + theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5)) + From e8a4b92b4bf684655dd58ce99bf63d6b10d095da Mon Sep 17 00:00:00 2001 From: Salix <31168746+salix-d@users.noreply.github.com> Date: Sun, 23 Oct 2022 15:05:50 -0400 Subject: [PATCH 6/9] fix err --- R/transform_filter-methods.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/transform_filter-methods.R b/R/transform_filter-methods.R index 52a123af..81977e05 100644 --- a/R/transform_filter-methods.R +++ b/R/transform_filter-methods.R @@ -188,7 +188,7 @@ rarefy_even_depth <- newotu <- apply(physeq@otu_table, taxa_are_rows(physeq) + 1, rarefaction_subsample, sample.size = sample.size, replace = replace) # Add OTU names to the row indices - dimnames(newotu) <- dimnames(physeq@otu_table)#.taxa_names + taxa_names(newotu) <- taxa_names(physeq@otu_table) # replace the otu_table physeq@otu_table <- otu_table(newotu, taxa_are_rows(physeq)) if (trimOTUs) { From 8da5660dc1877ac13afc58e3e0ac8e43b8c3e2c8 Mon Sep 17 00:00:00 2001 From: Salix <31168746+salix-d@users.noreply.github.com> Date: Sun, 23 Oct 2022 15:24:21 -0400 Subject: [PATCH 7/9] refix err --- R/transform_filter-methods.R | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/R/transform_filter-methods.R b/R/transform_filter-methods.R index 81977e05..37e93414 100644 --- a/R/transform_filter-methods.R +++ b/R/transform_filter-methods.R @@ -182,15 +182,18 @@ rarefy_even_depth <- # Now done with notifying user of pruning, actually prune. physeq = prune_samples(!rmsamples, physeq) } - # apply through each sample, and replace - # instead of transposing the table : - # if taxa are rows use margin 2 (cols) else use margin 1 (rows) - newotu <- apply(physeq@otu_table, taxa_are_rows(physeq) + 1, rarefaction_subsample, - sample.size = sample.size, replace = replace) - # Add OTU names to the row indices - taxa_names(newotu) <- taxa_names(physeq@otu_table) - # replace the otu_table - physeq@otu_table <- otu_table(newotu, taxa_are_rows(physeq)) + # apply through each sample, and replace + # instead of transposing the table twice : + # if taxa are rows use margin 2 (cols) else use margin 1 (rows) + newotu <- apply(physeq@otu_table, taxa_are_rows(physeq) + 1, rarefaction_subsample, + sample.size = sample.size, replace = replace) + newotu <- otu_table(newotu, TRUE) + # Add OTU names to the row indices + taxa_names(newotu) <- taxa_names(physeq@otu_table) + # transpose otu_table if taxa were columns + if(!taxa_are_rows(physeq)) newotu <- t(newotu) + # replace the otu_table + physeq@otu_table <- newotu if (trimOTUs) { # Check for and remove empty OTUs # 1. Notify user of empty OTUs being cut. From 5efdac67297de980621189b5f9aadcb6322564d8 Mon Sep 17 00:00:00 2001 From: Salix <31168746+salix-d@users.noreply.github.com> Date: Sun, 23 Oct 2022 15:33:05 -0400 Subject: [PATCH 8/9] Update transform_filter-methods.R merge message to match other messages --- R/transform_filter-methods.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/transform_filter-methods.R b/R/transform_filter-methods.R index 37e93414..d9e63708 100644 --- a/R/transform_filter-methods.R +++ b/R/transform_filter-methods.R @@ -174,10 +174,10 @@ rarefy_even_depth <- if (verbose) { n.rmsamples <- sum(rmsamples) message(n.rmsamples, " samples removed", - "because they contained fewer reads than `sample.size`.") - message("Up to first five removed samples are: \n") - message(paste(names(rmsamples[rmsamples])[1:min(5, n.rmsamples)], "\t")) - message("...") + "because they contained fewer reads than `sample.size`.", + "Up to first five removed samples are: \n", + paste(names(rmsamples[rmsamples])[1:min(5, n.rmsamples)], "\t"), + "...") } # Now done with notifying user of pruning, actually prune. physeq = prune_samples(!rmsamples, physeq) From 236c215a5ed08928f76aaf7d29af13dfe5340d82 Mon Sep 17 00:00:00 2001 From: Salix <31168746+salix-d@users.noreply.github.com> Date: Tue, 25 Oct 2022 22:42:10 -0400 Subject: [PATCH 9/9] fix the check of rngseed the way I modified it `FALSE wasn't allowed anymore, so fixed that --- R/transform_filter-methods.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/transform_filter-methods.R b/R/transform_filter-methods.R index d9e63708..3b688f08 100644 --- a/R/transform_filter-methods.R +++ b/R/transform_filter-methods.R @@ -127,9 +127,9 @@ rarefy_even_depth <- trimOTUs = TRUE, verbose = TRUE) { - if ( !(is.integer(rngseed) || is.numeric(rngseed)) || length(rngseed) > 1) { - stop("`rngseed` must be a single integer value or FALSE") - } + if ( !(is.integer(rngseed) || is.numeric(rngseed) || is.logical(rngseed)) || length(rngseed) > 1) { + stop("`rngseed` must be a single integer or logical value") + } if ( !(is.logical(rngseed) && !rngseed) ) { # Now call the set.seed using the value expected in phyloseq