-
Notifications
You must be signed in to change notification settings - Fork 12
/
average-spc.R
65 lines (56 loc) · 2.9 KB
/
average-spc.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#' @title Average spectra in list-column by entries in grouping column
#' @description Average spectra in list-column of spectra tibble (`spc_tbl`) by
#' groups given in group column.
#' @param spc_tbl Tibble data frame containing at least the grouping column
#' given in argument `by` and input spectra given in list-column `column_in`.
#' @param by Character vector of length 1L or name/symbol that specifies the
#' column by which groups of spectra are averaged. Default is `"sample_id"`.
#' @param column_in Character vector of length 1L or or name/symbol that
#' specifies the list-column that contains the inputs spectra to be averaged.
#' Default is `"spc_rs"`, which are resampled spectra (i.e., resulting after
#' preceding `resample_spc()` step).
#' @return Spectra tibble data frame (class `"tbl_df"`, `"tbl"`, `"data.frame"`)
#' with a new list-column of column name `"spc_mean"` at the last position,
#' containing mean spectra with identical row replicates within the same
#' `by`-group.
#' @details For memory efficiency and subsequent modeling, consider slicing the
#' extra row copies of `spc_mean` resulting from `average_spc()` for example by
#' * `split(x = spc_tbl, f = spc_tbl$<by>) %>% lapply(., function(x) x x[1, ]) %>% do.call(., rbind)`
#' * `dplyr::group_by(spc_tbl, <by>) %>% dplyr::slice(1L)`
#' @import stats
#' @importFrom data.table data.table rbindlist setkey setDT := .SD
#' @importFrom rlang ensym quo_name
#' @export
average_spc <- function(spc_tbl, by = "sample_id", column_in = "spc_rs") {
# Avoid R CMD check note: `"...no visible binding for global variable..."`
spc_rs <- sample_id <- id <- NULL
# Quote the symbol or the string supplied by the second and third argument
column_in <- rlang::enquo(column_in)
by <- rlang::enquo(by)
# Combine rows of all resampled spectra in one data.table
spc <- data.table::rbindlist(dplyr::pull(spc_tbl, !!column_in))
# Add `id` group column to input spectra
spc[, id := dplyr::pull(spc_tbl, !!by)] # spc_tbl[, by][[by]]
# Average spectra, use `id` column as index for grouping
data.table::setkey(spc, id)
spc_mean <- spc[, lapply(.SD, mean), by = id]
# Create new vector of group ID values from column `id`
group_id_mean <- spc_mean[, id]
# Delete sample_id column in data.table
spc_mean_noid <- spc_mean[, id := NULL]
# Create list of averaged spectra, one spectrum is one data.table
# Use best performing alternative:
# https://github.com/jennybc/row-oriented-workflows/blob/master/iterate-over-rows.md
spc_mean_list <- stats::setNames(
map(purrr::transpose(spc_mean_noid), data.table::as.data.table),
group_id_mean
)
# Convert averaged spectra and sample_id to tibble
spc_mean_tbl <- tibble::tibble(
!!by := group_id_mean, spc_mean = spc_mean_list
)
# Join mean spectra tibble spc_tbl_mean to spc_tbl
spc_tbl_out <- dplyr::left_join(spc_tbl, spc_mean_tbl,
by = rlang::quo_name(by))
return(spc_tbl_out)
}