From 8e01366e14728c5c18e92ade0194e7c130c1929f Mon Sep 17 00:00:00 2001 From: Iakov Davydov <671660+idavydov@users.noreply.github.com> Date: Thu, 13 Jul 2023 10:39:23 +0200 Subject: [PATCH 01/32] optimize_design and assign_ return a copy of bc, bc has a tibble with the trace --- R/assignment.R | 18 +- R/batch_container.R | 140 +++++++++--- R/optimize.R | 92 +++++--- R/score_autoscaling.R | 22 +- R/score_plates.R | 17 +- R/trace.R | 254 ++------------------- man/examples/assignment.R | 4 +- tests/testthat/test-compute-score-vector.R | 21 ++ 8 files changed, 243 insertions(+), 325 deletions(-) create mode 100644 tests/testthat/test-compute-score-vector.R diff --git a/R/assignment.R b/R/assignment.R index f52f0572..f2fcd044 100644 --- a/R/assignment.R +++ b/R/assignment.R @@ -4,16 +4,16 @@ #' @param samples data.frame with samples. #' @param batch_container Instance of BatchContainer class #' -#' @return Returns `BatchContainer`, invisibly. +#' @return Returns a new `BatchContainer`. #' @example man/examples/assignment.R assign_random <- function(batch_container, samples = NULL) { - assign_in_order(batch_container, samples) + batch_container <- assign_in_order(batch_container, samples) batch_container$move_samples( location_assignment = sample(batch_container$assignment) ) - invisible(batch_container) + batch_container } #' Distributes samples in order. @@ -25,9 +25,10 @@ assign_random <- function(batch_container, samples = NULL) { #' @param samples data.frame with samples. #' @param batch_container Instance of BatchContainer class #' -#' @return Returns `BatchContainer`, invisibly. +#' @return Returns a new `BatchContainer`. #' @example man/examples/assignment.R assign_in_order <- function(batch_container, samples = NULL) { + batch_container <- batch_container$copy() if (is.null(samples)) { assertthat::assert_that(batch_container$has_samples, msg = "batch-container is empty and no samples provided" @@ -46,7 +47,7 @@ assign_in_order <- function(batch_container, samples = NULL) { rep(NA_integer_, n_locations - n_samples) )) - invisible(batch_container) + batch_container } #' Shuffling proposal function with constraints. @@ -113,7 +114,7 @@ shuffle_with_constraints <- function(src = TRUE, dst = TRUE) { #' the function will check if samples in `batch_container` are identical to the ones in the #' `samples` argument. #' -#' @return Returns `BatchContainer`, invisibly. +#' @return Returns a new `BatchContainer`. #' #' @examples #' bc <- BatchContainer$new( @@ -133,11 +134,12 @@ shuffle_with_constraints <- function(src = TRUE, dst = TRUE) { #' 2, "a", 3, 5, "TRT", #' ) #' # assign samples from the sample sheet -#' assign_from_table(bc, sample_sheet) +#' bc <- assign_from_table(bc, sample_sheet) #' #' bc$get_samples(remove_empty_locations = TRUE) #' assign_from_table <- function(batch_container, samples) { + batch_container <- batch_container$copy() # sample sheet has all the batch variable assertthat::assert_that(is.data.frame(samples) && nrow(samples) > 0, msg = "samples should be non-empty data.frame" @@ -177,5 +179,5 @@ assign_from_table <- function(batch_container, samples) { batch_container$move_samples(location_assignment = samples_with_id$.sample_id) - invisible(batch_container) + batch_container } diff --git a/R/batch_container.R b/R/batch_container.R index 19c0b763..18efc1b4 100644 --- a/R/batch_container.R +++ b/R/batch_container.R @@ -328,23 +328,51 @@ BatchContainer <- R6::R6Class("BatchContainer", #' @description #' Score current sample assignment, - #' @return Returns a vector of all scoring functions values. - score = function() { - assertthat::assert_that(!is.null(private$scoring_funcs), - msg = "Scoring function needs to be assigned" + #' @param scoring names list of scoring functions. Each functin should + #' return a numeric vector. + #' @return Returns a named vector of all scoring functions values. + score = function(scoring) { + assertthat::assert_that( + !missing(scoring), + !is.null(scoring), + msg = "Scoring function needs to be provided" ) - assertthat::assert_that(is.list(private$scoring_funcs), - length(private$scoring_funcs) >= 1, + assertthat::assert_that(is.list(scoring), + length(scoring) >= 1, msg = "Scroring function should be a non-empty list" ) - assertthat::assert_that(!is.null(private$samples_table), + assertthat::assert_that(!is.null(names(scoring)), + msg = "scoring should be a named list" + ) + assertthat::assert_that(self$has_samples, msg = "No samples in the batch container, cannot compute score" ) - - res <- purrr::map_dbl(private$scoring_funcs, ~ .x(self)) - assertthat::assert_that(length(res) == length(private$scoring_funcs)) - - assertthat::assert_that(is.numeric(res), msg = "Scoring function should return a number") + res <- purrr::imap( + scoring, + \(f, i) { + v <- f(self) + assertthat::assert_that( + is.numeric(v), + length(v) >= 1, + msg = "scoring function should return a numeric vector of positive length" + ) + if (length(v) > 1) { + if (is.null(names(v))) { + names(v) <- seq_along(v) + } + names(v) <- stringr::str_c(i, names(v)) + } else { + names(v) <- i + } + v + } + ) |> + purrr::flatten_dbl() + assertthat::assert_that(length(res) >= length(scoring)) + assertthat::assert_that( + !any(names(res) == "step"), + msg = "score name cannot be 'step'" + ) return(res) }, @@ -368,7 +396,7 @@ BatchContainer <- R6::R6Class("BatchContainer", bc$samples_attr <- private$samples_attributes } - bc$scoring_f <- self$scoring_f + bc$trace <- self$trace bc }, @@ -398,6 +426,75 @@ BatchContainer <- R6::R6Class("BatchContainer", cat() cat("\n") invisible(self) + }, + + #' @description + #' Optimization trace, a [tibble::tibble()] + trace = tibble::tibble( + optimization_index = numeric(), + call = list(), + start_assignment_vec = list(), + end_assignment_vec = list(), + scores = list(), + aggregated_scores = list(), + seed = list(), + elapsed = as.difftime(character(0)) + ), + + #' @description + #' Return a tibble with scores from the last optimization. + #' + #' @param include_aggregated shall aggregated scores be included + #' @return a [tibble::tibble()] with scores + last_optimization_tibble = function(include_aggregated = FALSE) { + assertthat::assert_that( + tibble::is_tibble(self$trace), + nrow(self$trace) >= 1, + msg = "trace should be available" + ) + assertthat::assert_that(assertthat::is.flag(include_aggregated)) + d <- tail(self$trace, 1)$scores[[1]] %>% + tidyr::pivot_longer(-.data$step, + names_to = "score", + values_to = "value") + d$aggregated <- FALSE + if (include_aggregated) { + d_agg <- tail(self$trace, 1)$aggregated_scores[[1]] + if (!is.null(d_agg)) { + d_agg <- tidyr::pivot_longer( + d_agg, + -.data$step, + names_to = "score", + values_to = "value" + ) + d_agg$score <- paste0("agg.", d_agg$score) + d_agg$aggregated <- TRUE + d <- dplyr::bind_rows( + d, + d_agg + ) + } + } else { + d_agg <- NULL + } + d + }, + + #' @description + #' Plot trace + plot_trace = function(include_aggregated = FALSE, ...) { + d <- self$last_optimization_tibble(include_aggregated) + p <- ggplot2::ggplot(d) + + ggplot2::aes(.data$step, .data$value, group = .data$score, color = .data$score) + + ggplot2::geom_line() + + ggplot2::geom_point() + if (include_aggregated && any(d$aggregated)) { + p <- p + + ggplot2::facet_wrap(~ score, scales = "free_y", ncol = 1) + } else { + p <- p + + ggplot2::facet_wrap(~ aggregated, scales = "free_y", ncol = 1) + } } ), private = list( @@ -445,22 +542,7 @@ BatchContainer <- R6::R6Class("BatchContainer", #' Upon assignment a single function will be automatically converted to a list #' In the later case each function is called. scoring_f = function(value) { - if (missing(value)) { - private$scoring_funcs - } else { - if (is.null(value)) { - private$scoring_funcs <- NULL - } else if (is.function(value)) { - private$scoring_funcs <- list(value) - } else { - assertthat::assert_that(is.list(value), length(value) >= 1) - assertthat::assert_that( - all(purrr::map_lgl(self$scoring_f, is.function)), - msg = "All elements of scoring_f should be functions" - ) - private$scoring_funcs <- value - } - } + stop("scoring_f is deprecated, pass it to optimize_design() directly instead") }, #' @field has_samples diff --git a/R/optimize.R b/R/optimize.R index 2db24aa5..c8951a41 100644 --- a/R/optimize.R +++ b/R/optimize.R @@ -123,6 +123,7 @@ update_batchcontainer <- function(bc, shuffle_params) { #' @param batch_container An instance of `BatchContainer`. #' @param samples A `data.frame` with sample information. #' Should be `NULL` if the `BatchContainer` already has samples in it. +#' @param scoring Scoring function or a named [list()] of scoring functions. #' @param n_shuffle Vector of length 1 or larger, defining how many random sample #' swaps should be performed in each iteration. If length(n_shuffle)==1, #' this sets no limit to the number of iterations. Otherwise, the optimization @@ -170,10 +171,14 @@ update_batchcontainer <- function(bc, shuffle_params) { #' bc <- BatchContainer$new( #' dimensions = c("plate" = 2, "column" = 5, "row" = 6) #' ) -#' bc$scoring_f <- osat_score_generator("plate", "Sex") -#' optimize_design(bc, invivo_study_samples, max_iter = 100) +#' bc <- optimize_design(bc, invivo_study_samples, +#' scoring = osat_score_generator("plate", "Sex"), +#' max_iter = 100 +#' ) #' plot_plate(bc$get_samples(), .col = Sex) -optimize_design <- function(batch_container, samples = NULL, n_shuffle = NULL, +optimize_design <- function(batch_container, samples = NULL, + scoring = NULL, + n_shuffle = NULL, shuffle_proposal_func = NULL, acceptance_func = accept_strict_improvement, aggregate_scores_func = identity, @@ -182,10 +187,19 @@ optimize_design <- function(batch_container, samples = NULL, n_shuffle = NULL, sample_attributes_fixed = FALSE, max_iter = 1e4, min_delta = NA, quiet = FALSE) { start_time <- Sys.time() + cl <- match.call() + + # create a copy, so that we do not modify the BatchContainer + batch_container <- batch_container$copy() + trace <- tibble::tibble( + optimization_index = max(batch_container$trace$optimization_index, 0) + 1, + call = list(cl), + start_assignment_vec = list(batch_container$assignment) + ) # based on https://stat.ethz.ch/pipermail/r-help/2007-September/141717.html if (!exists(".Random.seed")) stats::runif(1) - save_random_seed <- .Random.seed + trace$seed <- list(.Random.seed) if (is.null(samples)) { assertthat::assert_that(batch_container$has_samples, @@ -197,11 +211,22 @@ optimize_design <- function(batch_container, samples = NULL, n_shuffle = NULL, } - # Check presence of scoring function and that it's a list of functions - assertthat::assert_that(!is.null(batch_container$scoring_f), msg = "no scoring function set for BatchContainer") - assertthat::assert_that(is.list(batch_container$scoring_f), msg = "scoring function is expected to be a list") - assertthat::assert_that(all(purrr::map_lgl(batch_container$scoring_f, is.function)), msg = "All scoring functions have to be function definitions") - + assertthat::assert_that( + !is.null(scoring), + msg = "Scoring should be provided when calling optimize_design()" + ) + if (is.function(scoring)) { + scoring <- list(scoring) + } else { + assertthat::assert_that(is.list(scoring), length(scoring) >= 1) + assertthat::assert_that( + all(purrr::map_lgl(scoring, is.function)), + msg = "All elements of scoring should be functions" + ) + } + if (is.null(names(scoring))) { + names(scoring) <- stringr::str_c("score_", seq_along(scoring)) + } # Get assigned samples and locations from the batch container samp <- batch_container$get_samples(include_id = TRUE, assignment = TRUE, remove_empty_locations = FALSE) @@ -257,13 +282,14 @@ optimize_design <- function(batch_container, samples = NULL, n_shuffle = NULL, best_shuffle <- list(src = NULL, dst = NULL, location_assignment = batch_container$assignment, samples_attr = NULL) } - initial_score <- batch_container$score() # Evaluate this just once in order not to break current tests + # Evaluate this just once in order not to break current tests + initial_score <- batch_container$score(scoring) score_dim <- length(initial_score) # Check score variances (should be all >0) if (check_score_variance) { bc_copy <- batch_container$copy() - score_vars <- random_score_variances(batch_container$copy(), random_perm = 100, sample_attributes_fixed) + score_vars <- random_score_variances(batch_container$copy(), scoring = scoring, random_perm = 100, sample_attributes_fixed) low_var_scores <- score_vars < 1e-10 if (!quiet) { message( @@ -298,6 +324,7 @@ optimize_design <- function(batch_container, samples = NULL, n_shuffle = NULL, ) } autoscale_func <- mk_autoscale_function(batch_container$copy(), + scoring = scoring, random_perm = autoscaling_permutations, use_boxcox = autoscale_useboxcox, sample_attributes_fixed @@ -312,20 +339,23 @@ optimize_design <- function(batch_container, samples = NULL, n_shuffle = NULL, prev_agg <- NULL - trace <- OptimizationTrace$new( - max_iter + 1, # + 1 to accommodate initial score - length(batch_container$scoring_f), - names(batch_container$scoring_f) + scores_mat <- matrix( + nrow = max_iter + 1, # + 1 to accommodate initial score + ncol = length(best_score), + dimnames = list(NULL, names(best_score)) ) + scores_mat[1,] <- best_score if (identical(aggregate_scores_func, identity)) { - # Do not store aggregated scores if unnecessary - trace$set_scores(1, best_score, NULL) + aggregated_scores_mat <- NULL } else { - trace$set_scores(1, best_score, best_agg) + aggregated_scores_mat <- matrix( + nrow = max_iter + 1, # + 1 to accommodate initial score + ncol = length(best_agg), + dimnames = list(NULL, names(best_agg)) + ) + aggregated_scores_mat[1,] <- best_agg } - # to do: make work with >1-dim agg, line should read as - # trace$set_scores(1, best_score, best_agg) if (!quiet) report_scores(best_score, best_agg, iteration = 0) @@ -338,7 +368,7 @@ optimize_design <- function(batch_container, samples = NULL, n_shuffle = NULL, using_attributes <- TRUE } - new_score <- autoscale_func(batch_container$score()) + new_score <- autoscale_func(batch_container$score(scoring)) assertthat::assert_that(!any(is.na(new_score)), msg = stringr::str_c("NA apprearing during scoring in iteration ", iteration)) new_agg <- aggregate_scores_func(new_score) @@ -358,11 +388,9 @@ optimize_design <- function(batch_container, samples = NULL, n_shuffle = NULL, } iteration <- iteration + 1 - if (identical(aggregate_scores_func, identity)) { - # Do not store aggregated scores if unnecessary - trace$set_scores(iteration, best_score, NULL) - } else { - trace$set_scores(iteration, best_score, best_agg) + scores_mat[iteration,] <- best_score + if (!is.null(aggregated_scores_mat)) { + aggregated_scores_mat[iteration,] <- best_agg } # Test stopping criteria @@ -384,8 +412,14 @@ optimize_design <- function(batch_container, samples = NULL, n_shuffle = NULL, # In the end, always make sure that final state of bc is the one with the best score update_batchcontainer(batch_container, best_shuffle) - trace$shrink(iteration) - trace$seed <- save_random_seed + # shrink + trace$scores <- shrink_mat(scores_mat, iteration) + trace$aggregated_scores <- shrink_mat(aggregated_scores_mat, iteration) trace$elapsed <- Sys.time() - start_time - trace + trace$end_assignment_vec = list(bc$assignment) + batch_container$trace <- dplyr::bind_rows( + batch_container$trace, + trace + ) + batch_container } diff --git a/R/score_autoscaling.R b/R/score_autoscaling.R index bdcf1dfc..8104d48c 100644 --- a/R/score_autoscaling.R +++ b/R/score_autoscaling.R @@ -1,20 +1,22 @@ #' Sample scores from a number of completely random sample permutations #' #' @param batch_container An instance of [BatchContainer]. +#' @param scoring A named [list()] of scoring function. Each function should +#' return a vector of non-zero length. #' @param random_perm Number of random sample permutations to be done. #' @param sample_attributes_fixed Logical; if `FALSE`, simulate a shuffle function that alters sample attributes at each iteration. #' #' @return A score matrix with n (# of permutations) rows and m (dimensionality of score) columns. #' #' @keywords internal -sample_random_scores <- function(batch_container, random_perm, sample_attributes_fixed) { - random_scores <- matrix(NA_real_, nrow = random_perm, ncol = length(batch_container$score())) +sample_random_scores <- function(batch_container, scoring, random_perm, sample_attributes_fixed) { + random_scores <- matrix(NA_real_, nrow = random_perm, ncol = length(batch_container$score(scoring))) for (i in seq_len(random_perm)) { batch_container$move_samples(location_assignment = complete_random_shuffling(batch_container)) if (!sample_attributes_fixed && batch_container$has_samples_attr) { batch_container$samples_attr <- batch_container$samples_attr[sample(nrow(batch_container$samples_attr)), ] } - random_scores[i, ] <- batch_container$score() + random_scores[i, ] <- batch_container$score(scoring) } random_scores @@ -24,6 +26,8 @@ sample_random_scores <- function(batch_container, random_perm, sample_attributes #' Create a function that transforms a current (multi-dimensional) score into a boxcox normalized one #' #' @param batch_container An instance of [BatchContainer]. +#' @param scoring A named [list()] of scoring function. Each function should +#' return a vector of non-zero length. #' @param random_perm Number of random sample permutations for the estimation of the scaling params. #' @param use_boxcox Logical; if TRUE and the `bestNormalize` package is available, boxcox transformations will be used to #' normalize individual scores. If not possible, scores will just be transformed to a zero mean and unit standard deviation. @@ -31,9 +35,9 @@ sample_random_scores <- function(batch_container, random_perm, sample_attributes #' #' @return The transformation function for a new score vector #' @keywords internal -mk_autoscale_function <- function(batch_container, random_perm, use_boxcox = TRUE, sample_attributes_fixed = FALSE) { - random_scores <- sample_random_scores(batch_container, random_perm, sample_attributes_fixed) - score_dim <- length(batch_container$score()) +mk_autoscale_function <- function(batch_container, scoring, random_perm, use_boxcox = TRUE, sample_attributes_fixed = FALSE) { + random_scores <- sample_random_scores(batch_container, scoring, random_perm, sample_attributes_fixed) + score_dim <- length(batch_container$score(scoring)) # Return function using boxcox transform if bestNormalize package is available if (use_boxcox && requireNamespace("bestNormalize", quietly = T)) { @@ -69,13 +73,15 @@ mk_autoscale_function <- function(batch_container, random_perm, use_boxcox = TRU #' Estimate the variance of individual scores by a series of completely random sample permutations #' #' @param batch_container An instance of `BatchContainer`. +#' @param scoring A named [list()] of scoring function. Each function should +#' return a vector of non-zero length. #' @param random_perm Number of random sample permutations to be done. #' @param sample_attributes_fixed Logical; if FALSE, simulate a shuffle function that alters sample attributes at each iteration. #' #' @return Vector of length m (=dimensionality of score) with estimated variances of each subscore #' #' @keywords internal -random_score_variances <- function(batch_container, random_perm, sample_attributes_fixed) { - random_scores <- sample_random_scores(batch_container, random_perm, sample_attributes_fixed) +random_score_variances <- function(batch_container, scoring, random_perm, sample_attributes_fixed) { + random_scores <- sample_random_scores(batch_container, scoring, random_perm, sample_attributes_fixed) purrr::map_dbl(asplit(random_scores, 2), stats::var, na.rm = T) } diff --git a/R/score_plates.R b/R/score_plates.R index 7cf1c50c..793bc24f 100644 --- a/R/score_plates.R +++ b/R/score_plates.R @@ -229,8 +229,6 @@ optimize_multi_plate_design <- function(batch_container, across_plates_variables msg = "All columns in 'within_plate_variable' argument have to be found in batch container samples." ) - traces <- list() - skip_osat <- is.null(across_plates_variables) || is.null(plate) || dplyr::n_distinct(bc$get_locations()[[plate]]) < 2 if (skip_osat && !quiet) message("\nNo balancing of variables across plates required...") @@ -239,25 +237,22 @@ optimize_multi_plate_design <- function(batch_container, across_plates_variables scoring_funcs <- purrr::map(across_plates_variables, ~ osat_score_generator(batch_vars = plate, feature_vars = .x)) %>% unlist() names(scoring_funcs) <- across_plates_variables - bc$scoring_f <- scoring_funcs if (!quiet) message("\nAssigning samples to plates...") - trace1 <- optimize_design(bc, + bc <- optimize_design(bc, + scoring = scoring_funcs, max_iter = max_iter, n_shuffle = n_shuffle, acceptance_func = accept_leftmost_improvement, quiet = TRUE ) - traces <- c(traces, osat_across_plates = trace1) } if (!is.null(within_plate_variables)) { - within_traces <- list() plate_levels <- unique(bc$get_locations()[[plate]]) scoring_funcs <- purrr::map(within_plate_variables, ~ mk_plate_scoring_functions(bc, plate = plate, row = row, column = column, group = .x)) %>% unlist() names(scoring_funcs) <- paste(rep(within_plate_variables, each = length(plate_levels)), names(scoring_funcs)) - bc$scoring_f <- scoring_funcs if (!quiet) { @@ -270,7 +265,8 @@ optimize_multi_plate_design <- function(batch_container, across_plates_variables for (curr_plate in plate_levels) { if (!quiet && length(plate_levels) > 1) cat(curr_plate, "... ") - trace2 <- optimize_design(bc, + bc <- optimize_design(bc, + scoring = scoring_funcs, max_iter = max_iter, quiet = TRUE, shuffle_proposal_func = mk_subgroup_shuffling_function( @@ -279,16 +275,13 @@ optimize_multi_plate_design <- function(batch_container, across_plates_variables ), acceptance_func = accept_leftmost_improvement ) - within_traces <- c(within_traces, trace2) } if (!quiet) cat("\n") - names(within_traces) <- paste0("within_plate_", plate_levels) - traces <- c(traces, within_traces) } if (skip_osat && is.null(within_plate_variables) && !quiet) { message("\nBoth across plates and within plate optimization skipped ('within_plate_variables' is empty).\nBatch container unchanged.\n") } - invisible(traces) + bc } diff --git a/R/trace.R b/R/trace.R index b4a88dad..7abc89ac 100644 --- a/R/trace.R +++ b/R/trace.R @@ -1,240 +1,3 @@ -#' @title -#' OptimizationTrace represents optimization trace. -#' -#' @description -#' Usually it is created by [optimize_design()]. -#' -#' @export -OptimizationTrace <- R6::R6Class("OptimizationTrace", - public = list( - #' @field scores - #' Contains a matrix of scores. The matrix size is usually - #' `c(iterations + 1, length(bc$scoring_f))` - scores = NULL, - - #' @field aggregated_scores - #' Contains a matrix of scores after aggregation. - #' The matrix size is usually `c(iterations + 1, length(aggregated))`, - #' where `length(aggregated)` is the length of aggregated scores vector. - #' Can be `NULL` if aggregated scores are not used. - aggregated_scores = NULL, - - #' @field seed - #' Saved value of [.Random.seed]. - seed = NULL, - - #' @field elapsed - #' Running time of the optimization. - elapsed = NULL, - - #' @field last_step - #' Last iteration step for which the score was set. - last_step = 0, - - #' @description - #' Create a new `OptimizationTrace` object. - #' - #' @param n_steps - #' Number of values to save. Usually `n_steps == iterations + 1`. - #' @param n_scores - #' Number of scoring functions. - #' @param score_names - #' Names of scoring functions. - #' - #' @examples - #' tr <- OptimizationTrace$new(10, 2, c("score1", "score2")) - initialize = function(n_steps, n_scores, score_names) { - self$scores <- matrix(NA_real_, nrow = n_steps, ncol = n_scores) - if (!is.null(score_names)) { - dimnames(self$scores) <- list(NULL, score_names) - } - }, - - #' @description - #' Set scores for i-th step. - #' - #' @param i - #' Step number. - #' @param scores - #' Scores, a vector or a value if no auxiliary functions are used. - #' @param aggregated_scores - #' Vector of aggregated scores. Can be NULL. - #' - #' @return `OptimizationTrace` invisibly. - #' - #' @examples - #' tr$set_scores(1, c(0.5, 0.5), NULL) - #' tr$set_scores(2, c(0.5, 0.5), NULL) - set_scores = function(i, scores, aggregated_scores) { - assertthat::assert_that( - assertthat::is.count(i), - is.vector(scores), - is.null(aggregated_scores) || is.vector(aggregated_scores) - ) - # initialize aggregated scores, in case they're empty - self$scores[i, ] <- scores - if (!is.null(aggregated_scores)) { - if (is.null(self$aggregated_scores)) { - self$aggregated_scores <- matrix( - NA_real_, - nrow = nrow(self$scores), - ncol = length(aggregated_scores) - ) - } - assertthat::assert_that( - length(aggregated_scores) == ncol(self$aggregated_scores) - ) - self$aggregated_scores[i, ] <- aggregated_scores - } - self$last_step <- i - invisible(self) - }, - - #' @description - #' Shrink scores by keeping only first `last_step` scores. - #' - #' @param last_step - #' Last step to keep. - #' - #' @return `OptimizationTrace` invisibly. - #' @examples - #' tr$shrink(2) - shrink = function(last_step = self$last_step) { - self$scores <- head(self$scores, last_step) - if (!is.null(self$aggregated_scores)) { - self$aggregated_scores <- head(self$aggregated_scores, last_step) - } - invisible(self) - }, - - #' @description - #' Return individual (not aggregated!) scores by keeping only first `last_step` scores. - #' - #' @param last_step - #' Last step to keep. - #' - #' @return `OptimizationTrace` invisibly. - #' @examples - #' tr$get_scores() - get_scores = function(last_step = self$last_step) { - head(self$scores, last_step) - }, - - #' @description - #' Print `OptimizationTrace`. - #' - #' @param ... - #' Unused. - #' - #' @return `OptimizationTrace` invisibly. - #' @examples - #' print(tr) - print = function(...) { - start_score <- self$scores[1, ] %>% - round(3) %>% - stringr::str_c(collapse = ",") - final_score <- tail(self$scores, 1) %>% - round(3) %>% - stringr::str_c(collapse = ",") - cat(stringr::str_glue("Optimization trace ({self$n_steps} score values, elapsed {format(self$elapsed)}).\n\n")) - cat(" Starting score: ", start_score, "\n", sep = "") - cat(" Final score : ", final_score, "\n", sep = "") - invisible(self) - }, - - #' @description - #' Convert to a [data.frame]. - #' - #' @param include_aggregated Include aggregated scores. Otherwise only - #' raw scores are exported. - #' - #' @return [data.frame] - #' @examples - #' tr$as_tibble() - as_tibble = function(include_aggregated = TRUE) { - scores <- make_colnames(self$scores, "score") %>% - tibble::as_tibble() %>% - dplyr::mutate(step = dplyr::row_number()) %>% - tidyr::pivot_longer( - c(-step), - names_to = "score", - values_to = "value" - ) %>% - dplyr::mutate(score = factor(score)) - if (include_aggregated) { - agg_scores <- self$aggregated_scores - } else { - agg_scores <- NULL - } - if (!is.null(agg_scores) && include_aggregated) { - colnames(agg_scores) <- stringr::str_c( - "agg.", seq_len(ncol(agg_scores)) - ) - agg_scores <- agg_scores %>% - tibble::as_tibble() %>% - dplyr::mutate(step = dplyr::row_number()) %>% - tidyr::pivot_longer( - c(-step), - names_to = "score", - values_to = "value" - ) %>% - dplyr::mutate(score = factor(score)) - } - dplyr::bind_rows( - score = scores, - aggregated = agg_scores, - .id = "type" - ) %>% - dplyr::mutate(type = factor(type, levels = c("score", "aggregated"))) - }, - - #' @description - #' Plot `OptimizationTrace`. Only the main score at the moment. - #' - #' @param include_aggregated Include aggregated scores. Otherwise only - #' raw scores are plotted. - #' @param ... - #' Not used. - #' - #' @examples - #' tr <- OptimizationTrace$new(10, 3, letters[1:3]) - #' for (i in seq_len(10)) { - #' tr$set_scores(i, rnorm(3)*(1:3), rnorm(3)*(1:3)) - #' } - #' - #' # plot only the main scores - #' plot(tr) - #' # plot main and aggregated scores - #' plot(tr, include_aggregated=TRUE) - plot = function(include_aggregated = FALSE, ...) { - p <- self$as_tibble(include_aggregated = include_aggregated) %>% - ggplot2::ggplot() + - ggplot2::aes(x = step, y = value, group = score, color = score) + - ggplot2::geom_point() + - ggplot2::geom_line() - - if (include_aggregated) { - p + - ggplot2::facet_wrap(~type, scales = "free_y", ncol = 1) - } else { - p + - ggplot2::facet_wrap(~score, scales = "free_y", ncol = 1) - } - } - ), - active = list( - #' @field n_steps - #' Returns number of steps in the `OptimizationTrace`. - n_steps = function(value) { - if (missing(value)) { - nrow(self$scores) - } else { - stop("Cannot set n_steps (read-only).") - } - } - ) -) - #' Make [matrix] column names unique. #' #' @param prefix Prefix to add if column names are empty. @@ -251,3 +14,20 @@ make_colnames <- function(m, prefix = "X") { colnames(m) <- make.names(colnames(m), unique = TRUE) m } + +#' Shrinks a matrix with scores and adds an iteration column. +#' +#' @param m input matrix +#' @param last_iteration last iteration +#' +#' @return a [tibble::tibble()] wrapped in a list +#' @keywords internal +shrink_mat <- function(m, last_iteration) { + if (is.null(m)) + return(m) + dplyr::bind_cols( + tibble::tibble(step=seq_len(last_iteration)), + as.data.frame(head(m, last_iteration)) + ) %>% + list() +} diff --git a/man/examples/assignment.R b/man/examples/assignment.R index e6604410..bc1b2a9f 100644 --- a/man/examples/assignment.R +++ b/man/examples/assignment.R @@ -6,9 +6,9 @@ bc set.seed(42) # assigns samples randomly -assign_random(bc, samples) +bc <- assign_random(bc, samples) bc$get_samples() # assigns samples in order -assign_in_order(bc) +bc <- assign_in_order(bc) bc$get_samples() diff --git a/tests/testthat/test-compute-score-vector.R b/tests/testthat/test-compute-score-vector.R new file mode 100644 index 00000000..205cc8ac --- /dev/null +++ b/tests/testthat/test-compute-score-vector.R @@ -0,0 +1,21 @@ +test_that("bc$score() produces correct vector names", { + bc <- BatchContainer$new( + dimensions = c(row = 3, column = 3) + ) + samp <- data.frame(sid = 1) + bc <- assign_in_order(bc, samp) + expect_equal( + bc$score( + list( + a = function(...) c(1, 2), + b = function(...) c(1), + c = function(...) c(x=1, y=2), + d = function(...) c(1) + ) + ), + setNames( + c(1, 2, 1, 1, 2, 1), + c("a1", "a2", "b", "cx", "cy", "d") + ) + ) +}) From dde80ea683a77f3e8cb5c74cc66c59e8cf5b0e58 Mon Sep 17 00:00:00 2001 From: Iakov Davydov <671660+idavydov@users.noreply.github.com> Date: Thu, 13 Jul 2023 11:28:10 +0200 Subject: [PATCH 02/32] plot_plate and scores_tibble can support multiple optimization --- R/batch_container.R | 78 +++++++++++++++++++++++++++++---------------- 1 file changed, 50 insertions(+), 28 deletions(-) diff --git a/R/batch_container.R b/R/batch_container.R index 18efc1b4..946acef9 100644 --- a/R/batch_container.R +++ b/R/batch_container.R @@ -442,59 +442,81 @@ BatchContainer <- R6::R6Class("BatchContainer", ), #' @description - #' Return a tibble with scores from the last optimization. + #' Return a tibble with scores from an optimization. #' - #' @param include_aggregated shall aggregated scores be included + #' @param index optimization index, all by default + #' @param include_aggregated include aggregated scores #' @return a [tibble::tibble()] with scores - last_optimization_tibble = function(include_aggregated = FALSE) { + scores_tibble = function(index = NULL, include_aggregated = FALSE) { assertthat::assert_that( tibble::is_tibble(self$trace), nrow(self$trace) >= 1, msg = "trace should be available" ) assertthat::assert_that(assertthat::is.flag(include_aggregated)) - d <- tail(self$trace, 1)$scores[[1]] %>% - tidyr::pivot_longer(-.data$step, + if (is.null(index)) { + index <- self$trace$optimization_index + } + assertthat::assert_that( + rlang::is_integerish(index), + msg = "index should be an integer" + ) + d <- self$trace %>% + dplyr::filter(.data$optimization_index %in% index) %>% + dplyr::select(.data$optimization_index, .data$scores) %>% + tidyr::unnest(.data$scores) %>% + tidyr::pivot_longer(c(-.data$optimization_index, -.data$step), names_to = "score", - values_to = "value") - d$aggregated <- FALSE + values_to = "value") %>% + dplyr::mutate(aggregated = FALSE) if (include_aggregated) { - d_agg <- tail(self$trace, 1)$aggregated_scores[[1]] - if (!is.null(d_agg)) { - d_agg <- tidyr::pivot_longer( - d_agg, - -.data$step, - names_to = "score", - values_to = "value" - ) - d_agg$score <- paste0("agg.", d_agg$score) - d_agg$aggregated <- TRUE - d <- dplyr::bind_rows( - d, - d_agg - ) + d_agg <- self$trace %>% + dplyr::filter(.data$optimization_index %in% index) %>% + dplyr::select(.data$optimization_index, .data$aggregated_scores) %>% + tidyr::unnest(.data$aggregated_scores) + + if ("step" %in% colnames(d_agg)) { + # if no aggregated scores are provided (aggregated_scores=NULL), + # there will be no step column after unnesting + d_agg <- d_agg %>% + tidyr::pivot_longer(c(-.data$optimization_index, -.data$step), + names_to = "score", + values_to = "value") %>% + dplyr::mutate( + aggregated = TRUE, + score = paste0("agg.", .data$score) + ) + d <- dplyr::bind_rows( + d, + d_agg + ) } - } else { - d_agg <- NULL } d }, #' @description #' Plot trace - plot_trace = function(include_aggregated = FALSE, ...) { - d <- self$last_optimization_tibble(include_aggregated) + #' @param index optimization index, all by default + #' @param include_aggregated include aggregated scores + #' @return a [ggplot2::ggplot()] object + plot_trace = function(index = NULL, include_aggregated = FALSE, ...) { + d <- self$scores(index, include_aggregated) p <- ggplot2::ggplot(d) + ggplot2::aes(.data$step, .data$value, group = .data$score, color = .data$score) + ggplot2::geom_line() + ggplot2::geom_point() - if (include_aggregated && any(d$aggregated)) { + if (length(unique(d$optimization_index)) > 1) { p <- p + - ggplot2::facet_wrap(~ score, scales = "free_y", ncol = 1) - } else { + ggplot2::facet_wrap(~ optimization_index, scales = "free") + } else if (include_aggregated && any(d$aggregated)) { p <- p + ggplot2::facet_wrap(~ aggregated, scales = "free_y", ncol = 1) + } else { + p <- p + + ggplot2::facet_wrap(~ score, scales = "free_y", ncol = 1) } + p } ), private = list( From 996339fd9e5415ba68d638258cd947afebe3fd19 Mon Sep 17 00:00:00 2001 From: Iakov Davydov <671660+idavydov@users.noreply.github.com> Date: Thu, 13 Jul 2023 11:28:40 +0200 Subject: [PATCH 03/32] update NCS22 and basic examples vignettes to use read-only bc --- vignettes/NCS22_talk.Rmd | 50 +++++++++++++++++------------------- vignettes/basic_examples.Rmd | 37 +++++++++++++------------- 2 files changed, 41 insertions(+), 46 deletions(-) diff --git a/vignettes/NCS22_talk.Rmd b/vignettes/NCS22_talk.Rmd index 07746f40..7580c73f 100644 --- a/vignettes/NCS22_talk.Rmd +++ b/vignettes/NCS22_talk.Rmd @@ -57,16 +57,8 @@ set.seed(17) # gives `bad` random assignment bc <- BatchContainer$new( dimensions = list("batch" = 3, "location" = 11), -) - -bc$scoring_f <- list( - group = osat_score_generator(batch_vars = "batch", - feature_vars = "Group"), - sex = osat_score_generator(batch_vars = "batch", - feature_vars = "Sex") -) - -assign_random(bc, subject_data) +) %>% + assign_random(subject_data) ``` Gone wrong: Random distribution of 31 grouped subjects into 3 batches @@ -119,14 +111,8 @@ set.seed(17) # gives `bad` random assignment ```{r} bc <- BatchContainer$new( dimensions = list("batch" = 3, "location" = 11), -) -bc$scoring_f <- list( - group = osat_score_generator(batch_vars = "batch", - feature_vars = "Group"), - sex = osat_score_generator(batch_vars = "batch", - feature_vars = "Sex") -) -assign_random(bc, subject_data) +) %>% + assign_random(subject_data) ``` @@ -172,10 +158,20 @@ bind_rows(head(bc$get_samples(), 3) %>% * sex (lower priority) ```{r, warning=FALSE} -trace <- optimize_design( +bc <- optimize_design( bc, + scoring = list( + group = osat_score_generator( + batch_vars = "batch", + feature_vars = "Group" + ), + sex = osat_score_generator( + batch_vars = "batch", + feature_vars = "Sex" + ) + ), n_shuffle = 1, - acceptance_func = + acceptance_func = ~ accept_leftmost_improvement(..., tolerance = 0.01), max_iter = 150, quiet = TRUE @@ -193,7 +189,7 @@ cowplot::plot_grid( bc$get_samples() %>% ggplot(aes(x = batch, fill = Sex)) + geom_bar() + labs(y = "subject count"), - trace$plot(include_aggregated = TRUE) + bc$plot_trace(include_aggregated = TRUE) ), ncol = 3 ) @@ -243,9 +239,9 @@ set.seed(1) #1 #2 bc <- BatchContainer$new( dimensions = list("plate" = 3, "row" = 4, "col" = 6), -) -assign_random(bc, dat) -#assign_in_order(bc, dat) +) %>% + assign_random(dat) +# assign_in_order(dat) ``` ```{r, fig.width= 5, fig.height=4.5, eval=FALSE} @@ -276,7 +272,7 @@ cowplot::plot_grid( ## Spatial arrangement ```{r, warning=FALSE, message=FALSE} -traces <- optimize_multi_plate_design( +bc <- optimize_multi_plate_design( bc, across_plates_variables = c("Group", "Sex"), within_plate_variables = c("Group"), @@ -299,8 +295,8 @@ cowplot::plot_grid( ) ``` -```{r fig.width=3, fig.height=3, echo=FALSE} -purrr::imap(traces, ~ .x$plot(include_aggregated=TRUE) + labs(title = .y))[1:2] +```{r fig.width=5, fig.height=4, echo=FALSE} +bc$plot_trace() ``` diff --git a/vignettes/basic_examples.Rmd b/vignettes/basic_examples.Rmd index 0f53c09f..66144d0f 100644 --- a/vignettes/basic_examples.Rmd +++ b/vignettes/basic_examples.Rmd @@ -102,7 +102,7 @@ bc$get_locations() %>% head() Use random assignment function to place samples to plate locations ```{r} -assign_random(bc, samples) +bc <- assign_random(bc, samples) bc$get_samples() bc$get_samples(remove_empty_locations = TRUE) @@ -128,7 +128,9 @@ plot_plate(bc$get_samples(remove_empty_locations = TRUE), To move individual samples or manually assigning all locations we can use the `batchContainer$move_samples()` method -To swap two or more samples use +To swap two or more samples use: + +**Warning**: This will change your BatchContainer in-place. ```{r, fig.width=6, fig.height=3.5} bc$move_samples(src = c(1L, 2L), dst = c(2L, 1L)) @@ -140,6 +142,9 @@ plot_plate(bc$get_samples(remove_empty_locations = TRUE), ``` To assign all samples in one go, use the option `location_assignment`. + +**Warning**: This will change your BatchContainer in-place. + The example below orders samples by ID and adds the empty locations afterwards ```{r, fig.width=6, fig.height=3.5} bc$move_samples( @@ -155,29 +160,23 @@ plot_plate(bc$get_samples(remove_empty_locations = TRUE, include_id = TRUE), ) ``` -## Scoring a layout - -To evaluate how good a layout is, we need a scoring function. -This we also assign to the batch container. - -This function will assess how well treatment -and dose are balanced across the two plates. - -```{r} -bc$scoring_f <- osat_score_generator( - batch_vars = "plate", - feature_vars = c("treatment", "dose") -) -``` - ## Run an optimization The optimization procedure is invoked with e.g. `optimize_design`. Here we use a simple shuffling schedule: swap 10 samples for 100 times, then swap 2 samples for 400 times. +To evaluate how good a layout is, we need a scoring function. + +This function will assess how well treatment +and dose are balanced across the two plates. + ```{r} -trace <- optimize_design(bc, +bc <- optimize_design(bc, + scoring = osat_score_generator( + batch_vars = "plate", + feature_vars = c("treatment", "dose") + ), # shuffling schedule n_shuffle = c(rep(10, 200), rep(2, 400)) ) @@ -186,7 +185,7 @@ trace <- optimize_design(bc, Development of the score can be viewed with ```{r, fig.width=3.5, fig.height=3} -trace$plot() +bc$plot_trace() ``` The layout after plate batching looks the following From c710b02388735e7122883bb9f6c091b665feeedc Mon Sep 17 00:00:00 2001 From: Iakov Davydov <671660+idavydov@users.noreply.github.com> Date: Thu, 13 Jul 2023 13:45:59 +0200 Subject: [PATCH 04/32] move scoring preprocessing/validation to $score() for consistency the behaviour of optimize_design and bc$score() should be identical now --- R/batch_container.R | 16 ++++++++++++++-- R/optimize.R | 12 ------------ 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/R/batch_container.R b/R/batch_container.R index 946acef9..4ebac8c0 100644 --- a/R/batch_container.R +++ b/R/batch_container.R @@ -328,8 +328,8 @@ BatchContainer <- R6::R6Class("BatchContainer", #' @description #' Score current sample assignment, - #' @param scoring names list of scoring functions. Each functin should - #' return a numeric vector. + #' @param scoring a function or a names list of scoring functions. + #' Each function should return a numeric vector. #' @return Returns a named vector of all scoring functions values. score = function(scoring) { assertthat::assert_that( @@ -337,6 +337,18 @@ BatchContainer <- R6::R6Class("BatchContainer", !is.null(scoring), msg = "Scoring function needs to be provided" ) + if (is.function(scoring)) { + scoring <- list(scoring) + } else { + assertthat::assert_that(is.list(scoring), length(scoring) >= 1) + assertthat::assert_that( + all(purrr::map_lgl(scoring, is.function)), + msg = "All elements of scoring should be functions" + ) + } + if (is.null(names(scoring))) { + names(scoring) <- stringr::str_c("score_", seq_along(scoring)) + } assertthat::assert_that(is.list(scoring), length(scoring) >= 1, msg = "Scroring function should be a non-empty list" diff --git a/R/optimize.R b/R/optimize.R index c8951a41..be82f3eb 100644 --- a/R/optimize.R +++ b/R/optimize.R @@ -215,18 +215,6 @@ optimize_design <- function(batch_container, samples = NULL, !is.null(scoring), msg = "Scoring should be provided when calling optimize_design()" ) - if (is.function(scoring)) { - scoring <- list(scoring) - } else { - assertthat::assert_that(is.list(scoring), length(scoring) >= 1) - assertthat::assert_that( - all(purrr::map_lgl(scoring, is.function)), - msg = "All elements of scoring should be functions" - ) - } - if (is.null(names(scoring))) { - names(scoring) <- stringr::str_c("score_", seq_along(scoring)) - } # Get assigned samples and locations from the batch container samp <- batch_container$get_samples(include_id = TRUE, assignment = TRUE, remove_empty_locations = FALSE) From 678fba3e95f98485ae9386d4df1b03cc22ccab76 Mon Sep 17 00:00:00 2001 From: Iakov Davydov <671660+idavydov@users.noreply.github.com> Date: Thu, 13 Jul 2023 13:46:29 +0200 Subject: [PATCH 05/32] rename scores_tibble to scores_table --- R/batch_container.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/batch_container.R b/R/batch_container.R index 4ebac8c0..7824ab24 100644 --- a/R/batch_container.R +++ b/R/batch_container.R @@ -454,12 +454,12 @@ BatchContainer <- R6::R6Class("BatchContainer", ), #' @description - #' Return a tibble with scores from an optimization. + #' Return a table with scores from an optimization. #' #' @param index optimization index, all by default #' @param include_aggregated include aggregated scores #' @return a [tibble::tibble()] with scores - scores_tibble = function(index = NULL, include_aggregated = FALSE) { + scores_table = function(index = NULL, include_aggregated = FALSE) { assertthat::assert_that( tibble::is_tibble(self$trace), nrow(self$trace) >= 1, From b2437567474ec96922b73cef85653f1402f87712 Mon Sep 17 00:00:00 2001 From: Iakov Davydov <671660+idavydov@users.noreply.github.com> Date: Thu, 13 Jul 2023 13:47:20 +0200 Subject: [PATCH 06/32] when facet-plotting aggregated scores show nicer labes (T/F -> agg/score) --- R/batch_container.R | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/R/batch_container.R b/R/batch_container.R index 7824ab24..3f3cc915 100644 --- a/R/batch_container.R +++ b/R/batch_container.R @@ -513,7 +513,10 @@ BatchContainer <- R6::R6Class("BatchContainer", #' @param include_aggregated include aggregated scores #' @return a [ggplot2::ggplot()] object plot_trace = function(index = NULL, include_aggregated = FALSE, ...) { - d <- self$scores(index, include_aggregated) + d <- self$scores_table(index, include_aggregated) %>% + dplyr::mutate( + agg_title = dplyr::if_else(.data$aggregated, "aggregated", "score") + ) p <- ggplot2::ggplot(d) + ggplot2::aes(.data$step, .data$value, group = .data$score, color = .data$score) + ggplot2::geom_line() + @@ -523,7 +526,7 @@ BatchContainer <- R6::R6Class("BatchContainer", ggplot2::facet_wrap(~ optimization_index, scales = "free") } else if (include_aggregated && any(d$aggregated)) { p <- p + - ggplot2::facet_wrap(~ aggregated, scales = "free_y", ncol = 1) + ggplot2::facet_wrap(~ agg_title, scales = "free_y", ncol = 1) } else { p <- p + ggplot2::facet_wrap(~ score, scales = "free_y", ncol = 1) From 1dd126d3c37d8d39657f6a620b0d4c8938d8fdcb Mon Sep 17 00:00:00 2001 From: Iakov Davydov <671660+idavydov@users.noreply.github.com> Date: Thu, 13 Jul 2023 13:48:12 +0200 Subject: [PATCH 07/32] adapt custom_shuffle and optimizer vignettes to the new bc behaviour --- vignettes/custom_shuffle.Rmd | 26 +++--- vignettes/optimizer_examples.Rmd | 154 ++++++++++++++----------------- 2 files changed, 80 insertions(+), 100 deletions(-) diff --git a/vignettes/custom_shuffle.Rmd b/vignettes/custom_shuffle.Rmd index 6d835657..1dec234f 100644 --- a/vignettes/custom_shuffle.Rmd +++ b/vignettes/custom_shuffle.Rmd @@ -55,8 +55,8 @@ We start by assigning samples randomly. set.seed(42) bc <- BatchContainer$new( dimensions = c("cage" = 11, "position" = 5) -) -assign_random(bc, samples) +) %>% + assign_random(samples) bc ``` @@ -96,15 +96,14 @@ and `sex` interactions are considered in the scoring function. We only use 10 it shuffling is limited to locations with males and enforces change of cage on every iteration. ```{r} -bc$scoring_f <- osat_score_generator( - "cage", - "sex" -) - set.seed(10) -res <- optimize_design( +bc <- optimize_design( bc, + scoring = osat_score_generator( + "cage", + "sex" + ), shuffle_proposal_func = shuffle_with_constraints( sex == "M", cage != .src$cage @@ -112,7 +111,7 @@ res <- optimize_design( max_iter = 10 ) -plot(res) +bc$plot_trace() ``` We expect the distribution of males become even, while other variables are not significantly affected. @@ -129,7 +128,7 @@ locations. We also ensure that on every iteration the cage number is changed; we `position` dimension does affect actual animal allocation. ```{r} -bc$scoring_f <- function(bc) { +scoring_f <- function(bc) { samples <- bc$get_samples(include_id = TRUE, as_tibble = FALSE) avg_w <- samples[, mean(weight, na.rm = TRUE)] avg_w_per_cage <- samples[!is.na(weight), mean(weight), by = cage]$V1 @@ -141,7 +140,8 @@ bc$scoring_f <- function(bc) { } set.seed(12) -res <- optimize_design(bc, +bc <- optimize_design(bc, + scoring = scoring_f, shuffle_proposal = shuffle_with_constraints( sex == "F", cage != .src$cage & (is.na(sex) | sex != "M") @@ -149,8 +149,8 @@ res <- optimize_design(bc, n_shuffle = c(rep(10, 20), rep(5, 20), rep(3, 20), rep(1, 140)), max_iter = 200 ) -plot(res) -bc$score() +bc$plot_trace() +scoring_f(bc) ``` Now we have a much more even distribution of weights and treatment/control balance. diff --git a/vignettes/optimizer_examples.Rmd b/vignettes/optimizer_examples.Rmd index c707c416..be3bf7de 100644 --- a/vignettes/optimizer_examples.Rmd +++ b/vignettes/optimizer_examples.Rmd @@ -17,6 +17,7 @@ knitr::opts_chunk$set( ```{r setup} library(designit) +library(magrittr) ``` @@ -60,12 +61,9 @@ bc <- BatchContainer$new( run = 2, position = 5 ), exclude = tibble::tibble(batch = 4, run = c(1, 2), position = c(5, 5)) -) - -# Add samples to container -assign_in_order(bc, samples = multi_trt_day_samples) -# Set scoring function -bc$scoring_f <- osat_score_generator(c("batch"), c("Treatment", "Time")) +) %>% + # Add samples to container + assign_in_order(samples = multi_trt_day_samples) bc ``` @@ -85,15 +83,15 @@ Optimization finishes after the list of permutations is exhausted. ```{r} n_shuffle <- rep(c(32, 10, 5, 2, 1), c(20, 40, 40, 50, 50)) +scoring_f <- osat_score_generator(c("batch"), c("Treatment", "Time")) -bc1 <- bc$copy() - -trace1 <- optimize_design( - bc1, +bc1 <- optimize_design( + bc, + scoring = scoring_f, n_shuffle = n_shuffle # will implicitly generate a shuffling function according to the provided schedule ) -trace1$elapsed +bc1$trace$elapsed ``` ## Optimization trace @@ -101,14 +99,27 @@ trace1$elapsed Custom plot with some colours: ```{r, fig.width=5, fig.height= 4} -ggplot2::qplot(x = seq_along(trace1$scores), y = trace1$scores, color = factor(n_shuffle)[1:length(trace1$scores)], geom = "point") + - ggplot2::labs(title = "Score 1 tracing", subtitle = stringr::str_glue("Final score = {bc1$score()}"), x = "Iteration", y = "Score", color = "n_shuffle") +bc1$scores_table() %>% + dplyr::mutate( + n_shuffle = c(NA, n_shuffle) + ) %>% + ggplot2::ggplot( + ggplot2::aes(step, value, color = factor(n_shuffle)) + ) + + ggplot2::geom_point() + + ggplot2::labs( + title = "Score 1 tracing", + subtitle = stringr::str_glue("Final score = {bc1$score(scoring_f)}"), + x = "Iteration", + y = "Score", + color = "n_shuffle" + ) ``` Using the internal method... ```{r, fig.width=5, fig.height= 4} -trace1$plot() +bc1$plot_trace() ``` We may safely apply the batch container methods get_samples() and score() also @@ -116,7 +127,7 @@ after using the new optimization code. ## Final batch layout ```{r, fig.width=6, fig.height=5} -bc1$score() +bc1$score(scoring_f) bc1$get_samples(assignment = TRUE) %>% dplyr::filter(!is.na(Treatment)) %>% @@ -136,8 +147,9 @@ immediately on the same batch container. ```{r} n_shuffle <- rep(c(5, 2, 1), c(30, 30, 30)) -optimize_design( +bc1 <- optimize_design( bc1, + scoring = scoring_f, n_shuffle = n_shuffle ) ``` @@ -154,10 +166,9 @@ reaching a specific minimum delta threshold (score improvement from one selected solution to the next). ```{r} -bc2 <- bc$copy() - -optimize_design( - bc2, +bc2 <- optimize_design( + bc, + scoring = scoring_f, n_shuffle = 3, # will implicitly generate a shuffling function that will do 3 swaps at each iteration max_iter = 2000, min_delta = 0.1 @@ -168,7 +179,7 @@ optimize_design( # Optimization with multi-variate scoring function Instead of passing a single scoring function, a list of multiple scoring -functions can be assigned to a batch container, each of which to return a scalar +functions can be passed to the optimizer, each of which to return a scalar value on evaluation. By default, a strict improvement rule is applied for classifying a potential @@ -182,16 +193,15 @@ The second scoring function used here is by the way rather redundant and just serves for illustration. ```{r} -bc3 <- bc$copy() - -bc3$scoring_f <- list( +multi_scoring_f <- list( osat_score_generator(c("batch"), c("Treatment", "Time")), osat_score_generator(c("batch"), c("Treatment")) ) -trace <- optimize_design( - bc3, +bc3 <- optimize_design( + bc, + scoring = multi_scoring_f, n_shuffle = 3, max_iter = 200, min_delta = 0.1 @@ -216,16 +226,9 @@ We may also want to decrease the delta_min parameter to match the new numerical range. ```{r} -bc3_as <- bc$copy() - -bc3_as$scoring_f <- list( - osat_score_generator(c("batch"), c("Treatment", "Time")), - osat_score_generator(c("batch"), c("Treatment")) -) - - -trace <- optimize_design( - bc3_as, +bc3_as <- optimize_design( + bc, + scoring = multi_scoring_f, n_shuffle = 3, max_iter = 200, min_delta = 0.01, @@ -245,16 +248,9 @@ simply set the aggregated score to whichever of the individual scores is larger ```{r} -bc4 <- bc$copy() - -bc4$scoring_f <- list( - osat_score_generator(c("batch"), c("Treatment", "Time")), - osat_score_generator(c("batch"), c("Treatment")) -) - - -optimize_design( - bc4, +bc4 <- optimize_design( + bc, + scoring = multi_scoring_f, n_shuffle = 3, aggregate_scores_func = worst_score, max_iter = 200, @@ -271,15 +267,9 @@ For illustration, we omit the `n_shuffle` parameter here, which will lead by default to pairwise sample swaps being done on each iteration. ```{r, eval = FALSE} -bc5 <- bc$copy() - -bc5$scoring_f <- list( - osat_score_generator(c("batch"), c("Treatment", "Time")), - osat_score_generator(c("batch"), c("Treatment")) -) - -optimize_design( - bc5, +bc5 <- optimize_design( + bc, + scoring = multi_scoring_f, aggregate_scores_func = sum_scores, max_iter = 200, autoscale_scores = TRUE, @@ -295,15 +285,9 @@ Note that we don't use the auto-scaling in this case as the L2-norm based optimi not the minimal (negative) value that would be desired in that case. ```{r, eval = FALSE} -bc5_2 <- bc$copy() - -bc5_2$scoring_f <- list( - osat_score_generator(c("batch"), c("Treatment", "Time")), - osat_score_generator(c("batch"), c("Treatment")) -) - -optimize_design( - bc5_2, +bc5_2 <- optimize_design( + bc, + scoring = multi_scoring_f, aggregate_scores_func = L2s_norm, max_iter = 200, ) @@ -322,10 +306,9 @@ across all available positions in the batch container. Note that this is usually not a good strategy for converging to a solution. ```{r} -bc6 <- bc$copy() - -optimize_design( - bc6, +bc6 <- optimize_design( + bc, + scoring = scoring_f, shuffle_proposal_func = complete_random_shuffling, max_iter = 200 ) @@ -349,10 +332,9 @@ to be optimized. Choose an appropriate aggregation function if you happen to have multiple scores initially. ```{r} -bc7 <- bc$copy() - -trace7 <- optimize_design( - bc7, +bc7 <- optimize_design( + bc, + scoring = scoring_f, n_shuffle = 1, acceptance_func = mk_simanneal_acceptance_func(), max_iter = 200 @@ -362,23 +344,22 @@ The trace may show a non strictly monotonic behavior now, reflecting the SA protocol at work. ```{r, fig.width=5, fig.height= 4} -trace7$plot() +bc7$plot_trace() ``` Better results and quicker convergence may be achieved by playing with the starting temperature (T0) and cooling speed (alpha) in a specific case. ```{r} -bc8 <- bc$copy() - -trace8 <- optimize_design( - bc8, +bc8 <- optimize_design( + bc, + scoring = scoring_f, n_shuffle = 1, acceptance_func = mk_simanneal_acceptance_func(mk_simanneal_temp_func(T0 = 100, alpha = 2)), max_iter = 150 ) -trace8$plot() +bc8$plot_trace() ``` # Full blown example @@ -387,16 +368,15 @@ The following example puts together all possible options to illustrate the flexibility of the optimization. ```{r} -bc$scoring_f <- list( - osat_score_generator(c("batch"), c("Treatment", "Time")), - osat_score_generator(c("batch"), c("Treatment")), - osat_score_generator(c("batch"), c("Time")) -) - n_shuffle <- rep(c(3, 2, 1), c(20, 20, 200)) -trace <- optimize_design( +bc9 <- optimize_design( bc, + scoring = list( + osat_score_generator(c("batch"), c("Treatment", "Time")), + osat_score_generator(c("batch"), c("Treatment")), + osat_score_generator(c("batch"), c("Time")) + ), n_shuffle = n_shuffle, aggregate_scores_func = sum_scores, acceptance_func = mk_simanneal_acceptance_func(mk_simanneal_temp_func(T0 = 500, alpha = 1)), @@ -405,9 +385,9 @@ trace <- optimize_design( autoscale_scores = T ) -trace$plot() +bc9$plot_trace() -bc$get_samples(assignment = TRUE) %>% +bc9$get_samples(assignment = TRUE) %>% dplyr::mutate(batch = factor(batch)) %>% ggplot2::ggplot(ggplot2::aes(x = batch, fill = Treatment, alpha = factor(Time))) + ggplot2::geom_bar() From dcfce24f1d3979dc02d9a57f3e4b5034b04d6b59 Mon Sep 17 00:00:00 2001 From: Iakov Davydov <671660+idavydov@users.noreply.github.com> Date: Thu, 13 Jul 2023 13:53:44 +0200 Subject: [PATCH 08/32] improve BatchContainer roxygen docs --- R/batch_container.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/batch_container.R b/R/batch_container.R index 3f3cc915..3f5e3775 100644 --- a/R/batch_container.R +++ b/R/batch_container.R @@ -440,8 +440,7 @@ BatchContainer <- R6::R6Class("BatchContainer", invisible(self) }, - #' @description - #' Optimization trace, a [tibble::tibble()] + #' @field trace Optimization trace, a [tibble::tibble()] trace = tibble::tibble( optimization_index = numeric(), call = list(), @@ -511,6 +510,7 @@ BatchContainer <- R6::R6Class("BatchContainer", #' Plot trace #' @param index optimization index, all by default #' @param include_aggregated include aggregated scores + #' @param ... not used. #' @return a [ggplot2::ggplot()] object plot_trace = function(index = NULL, include_aggregated = FALSE, ...) { d <- self$scores_table(index, include_aggregated) %>% From 69c29f8d3b98fb8e9d79a50fe1cc8c47baee1806 Mon Sep 17 00:00:00 2001 From: Iakov Davydov <671660+idavydov@users.noreply.github.com> Date: Thu, 13 Jul 2023 14:16:16 +0200 Subject: [PATCH 09/32] remove warning from a vignette --- vignettes/NCS22_talk.Rmd | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vignettes/NCS22_talk.Rmd b/vignettes/NCS22_talk.Rmd index 7580c73f..19c59010 100644 --- a/vignettes/NCS22_talk.Rmd +++ b/vignettes/NCS22_talk.Rmd @@ -138,13 +138,13 @@ bc$get_samples() ```{r, echo=FALSE} bind_rows(head(bc$get_samples(), 3) %>% - mutate(across(, as.character)), + mutate(across(everything(), as.character)), tibble(batch = "...", location = " ...", SubjectID = "...", Group = "...", Sex = "..."), tail(bc$get_samples(), 3) %>% - mutate(across(, as.character))) %>% + mutate(across(everything(), as.character))) %>% gt::gt() %>% gt::tab_options(table.font.size = 11, data_row.padding = 0.1) ``` @@ -198,13 +198,13 @@ cowplot::plot_grid( ```{r, echo=FALSE} bind_rows(head(bc$get_samples(), 3) %>% - mutate(across(, as.character)), + mutate(across(everything(), as.character)), tibble(batch = "...", location = " ...", SubjectID = "...", Group = "...", Sex = "..."), tail(bc$get_samples(), 3) %>% - mutate(across(, as.character))) %>% + mutate(across(everything(), as.character))) %>% gt::gt() %>% gt::tab_options(table.font.size = 11, data_row.padding = 0.1) ``` From fa7a88faa038321908f7e3d6ba1419f3ae4f90a7 Mon Sep 17 00:00:00 2001 From: Iakov Davydov <671660+idavydov@users.noreply.github.com> Date: Thu, 13 Jul 2023 14:17:01 +0200 Subject: [PATCH 10/32] _doc --- DESCRIPTION | 2 +- NAMESPACE | 1 - man/BatchContainer.Rd | 75 +++++++- man/OptimizationTrace.Rd | 329 ---------------------------------- man/assign_from_table.Rd | 4 +- man/assign_in_order.Rd | 6 +- man/assign_random.Rd | 6 +- man/mk_autoscale_function.Rd | 4 + man/optimize_design.Rd | 9 +- man/random_score_variances.Rd | 10 +- man/sample_random_scores.Rd | 10 +- man/shrink_mat.Rd | 20 +++ 12 files changed, 127 insertions(+), 349 deletions(-) delete mode 100644 man/OptimizationTrace.Rd create mode 100644 man/shrink_mat.Rd diff --git a/DESCRIPTION b/DESCRIPTION index b38c59f7..36c7f444 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -74,7 +74,7 @@ Suggests: Encoding: UTF-8 LazyData: true Roxygen: list(markdown = TRUE) -RoxygenNote: 7.2.1 +RoxygenNote: 7.2.3 VignetteBuilder: knitr biocViews: Remotes: diff --git a/NAMESPACE b/NAMESPACE index 2cc2cf16..e8a9d03c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,7 +6,6 @@ export(BatchContainer) export(BatchContainerDimension) export(L1_norm) export(L2s_norm) -export(OptimizationTrace) export(accept_leftmost_improvement) export(as_label) export(as_name) diff --git a/man/BatchContainer.Rd b/man/BatchContainer.Rd index db4a1e6b..e5c32ac8 100644 --- a/man/BatchContainer.Rd +++ b/man/BatchContainer.Rd @@ -27,6 +27,13 @@ bc <- BatchContainer$new( bc } +\section{Public fields}{ +\if{html}{\out{
an&zQSH@tdnZsn-R^5150
zSJP{``2bd{4${?6P4%QN!nP#3+At^4J3AxXd$}6F9ya7^CZy5H?|8SfhIU`c)tGKp
z`MZ?9x|gvqAGWj@SoW;>*7Q6(+5mbq)Gr&w_4DlR+a0OYN6KZ+m*LdiAphG=rRR-*
zbWhuhww;WL0>A;-!R#PMe3z+32FiDfYl5%wHhqC2B2kH;NE7)2R7N4R(
zw%eMh=9G~M0`xhQjadV2a1DEw&+j+l?!Q5H5}om8FJ;Tq>2<1Ok@kHyia~w*_Dw@W
z187=ubYZqOMPARI?SqapH#Zk7$zYZY1r&5THG8-*vA5N