Skip to content

Commit

Permalink
update sample ordering and filtering nans and infs
Browse files Browse the repository at this point in the history
  • Loading branch information
shackett committed Sep 6, 2024
1 parent 1a49881 commit d062021
Show file tree
Hide file tree
Showing 5 changed files with 173 additions and 17 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ export(tomic_sort_status)
export(tomic_to)
export(tomic_to_matrix)
export(triple_to_tidy)
export(update_sample_factors)
export(update_tidy_omic)
export(update_tomic)
import(ggplot2)
Expand Down
65 changes: 49 additions & 16 deletions R/dim_reduction.R
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,8 @@ add_pcs <- function(
#' then drop features}
#' \item{impute}{Impute missing values}
#' }
#' @param missing_value_types Types of values to treat as missing. One or more
#' of "NA", "NaN" and "Inf"
#' @inheritParams create_tidy_omic
#'
#' @returns A \code{tomic} object where missing values have been accounted
Expand All @@ -143,18 +145,24 @@ add_pcs <- function(
#'
#' @export
remove_missing_values <- function(
tomic,
value_var = NULL,
missing_val_method = "drop_samples",
verbose = TRUE
) {
tomic,
value_var = NULL,
missing_val_method = "drop_samples",
missing_value_types = c("NA", "NaN", "Inf"),
verbose = FALSE
) {

checkmate::assertClass(tomic, "tomic")
checkmate::assertChoice(
missing_val_method,
c("drop_features", "drop_samples")
)
checkmate::assertCharacter(missing_value_types, min.len = 1)
checkmate::assertLogical(verbose, len = 1)

VALID_MISSING_VALUE_TYPES <- c("NA", "NaN", "Inf")
purrr::walk(missing_value_types, checkmate::assertChoice, VALID_MISSING_VALUE_TYPES)

triple_omic <- tomic_to(tomic, "triple_omic")

design <- tomic$design
Expand All @@ -167,7 +175,8 @@ remove_missing_values <- function(
# find missing values of value_var
found_missing_values <- find_triple_omic_missing_values(
triple_omic,
value_var
value_var,
missing_value_types
)
observed_measurements <- found_missing_values$observed_measurements
missing_values <- found_missing_values$missing_values
Expand Down Expand Up @@ -248,11 +257,11 @@ remove_missing_values <- function(
#'
#' @export
impute_missing_values <- function(
tomic,
impute_var_name = "imputed",
value_var = NULL,
...
){
tomic,
impute_var_name = "imputed",
value_var = NULL,
...
){

if (!("impute" %in% rownames(utils::installed.packages()))) {
stop("Install \"impute\" using remotes::install_bioc(\"impute\") to use this function")
Expand Down Expand Up @@ -296,7 +305,7 @@ impute_missing_values <- function(
omic_matrix <- triple_omic$measurements %>%
reshape2::acast(formula = cast_formula, value.var = value_var)

# imput data
# impute
imputed_measurements <- impute::impute.knn(
omic_matrix,
...
Expand Down Expand Up @@ -397,15 +406,39 @@ value_var_handler <- function(value_var = NULL, design) {
return(value_var)
}

find_triple_omic_missing_values <- function(triple_omic, value_var) {
find_triple_omic_missing_values <- function(
triple_omic,
value_var,
missing_values_types = c("NA", "NaN", "Inf")
) {

all_expected_obs <- tidyr::expand_grid(
triple_omic$features[triple_omic$design$feature_pk],
triple_omic$samples[triple_omic$design$sample_pk]
)

observed_measurements <- triple_omic$measurements %>%
# drop missing values
dplyr::filter_at(value_var, function(x) !is.na(x))
observed_measurements <- triple_omic$measurements

if ("NA" %in% missing_values_types) {
observed_measurements <- observed_measurements %>%
dplyr::filter(
!is.na(!!rlang::sym(value_var))
)
}

if ("NaN" %in% missing_values_types) {
observed_measurements <- observed_measurements %>%
dplyr::filter(
!is.nan(!!rlang::sym(value_var))
)
}

if ("Inf" %in% missing_values_types) {
observed_measurements <- observed_measurements %>%
dplyr::filter(
is.finite(!!rlang::sym(value_var))
)
}

missing_values <- all_expected_obs %>%
dplyr::anti_join(
Expand Down
92 changes: 92 additions & 0 deletions R/mutates.R
Original file line number Diff line number Diff line change
Expand Up @@ -495,3 +495,95 @@ tomic_sort_status <- function(tomic) {

return(status)
}

#' Update Sample Factors
#'
#' Update sample metadata to order categorical variables based on a
#' specified factor order.
#'
#' @inheritParams tomic_to
#' @param factor_levels a character vector specifying the ordering of factor levels.
#'
#' @returns a tomic object with updated sample metadata
#'
#' @examples
#' update_sample_factors(
#' brauer_2008_tidy, list(nutrient = c("G", "N", "P", "S", "L", "U"))
#' )
#'
#' @export
update_sample_factors <- function (tomic, factor_levels) {

checkmate::assertClass(tomic, "tomic")
checkmate::assertNamed(factor_levels)
checkmate::assertList(factor_levels)

samples <- romic::get_tomic_table(tomic, "samples")
purrr::walk(names(factor_levels), checkmate::assertChoice, colnames(samples))

# update all categorical variables with specified factor orders
for (fct in names(factor_levels)) {
samples[[fct]] <- set_factor_levels(samples[[fct]], factor_levels[[fct]], fct)
}

out <- romic::update_tomic(tomic, samples)

return(out)
}

set_factor_levels <- function(samples_vec, fct_levels, fct_label = "?") {

# validate factor orders
if (!("character" %in% class(fct_levels))) {
cli::cli_abort(
"The factor levels for {fct_label} were {.val {class(fct_levels)}}.
This should be a character vector."
)
}

duplicated_levels <- unique(fct_levels[duplicated(fct_levels)])
if (length(duplicated_levels) > 0) {
cli::cli_abort(
"{length(duplicated_levels)} factor levels {?was/were} duplicated in the `factor_levels` specification for
{.val {fct_label}}: {duplicated_levels}"
)
}

if ("character" %in% class(samples_vec)) {

extra_sample_vars <- setdiff(samples_vec, fct_levels)
if (length(extra_sample_vars)) {
cli::cli_alert_warning(
"{.val {extra_sample_vars}} {?was/were} present in the sample metadata's {.field {fct_label}} field but did not have a corresponding factor level in the {.arg factor_levels} list. They will be added to the end of the specified factor levels"
)

fct_levels <- c(fct_levels, extra_sample_vars)
}

missing_sample_vars <- setdiff(fct_levels, samples_vec)
if (length(missing_sample_vars)) {
cli::cli_alert_warning(
"{.val {missing_sample_vars}} {?was/were} present in {.arg factor_levels} for {.field {fct_label}} but did not have a corresponding entry in the sample metadata."
)
}

if (any(is.na(samples_vec))) {
cli::cli_alert_warning(
"The {.field {fct_label}} field in the sample metadata contains {sum(is.na(samples_vec))} NA values. These entries will be replaced with an {.val unspecified} level.")

samples_vec[is.na(samples_vec)] <- "unspecified"
fct_levels <- c(fct_levels, "unspecified")
}

samples_fct_vec <- factor(
samples_vec,
levels = fct_levels
)
} else {
cli::cli_abort(
"The factor levels for fct were {.val {class(fct)}} and cannot be converted
to factors using the specified factor orders.")
}

return(samples_fct_vec)
}
6 changes: 5 additions & 1 deletion man/remove_missing_values.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

26 changes: 26 additions & 0 deletions man/update_sample_factors.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit d062021

Please sign in to comment.