-
Notifications
You must be signed in to change notification settings - Fork 28
/
Copy pathnormalize.R
141 lines (120 loc) · 4.43 KB
/
normalize.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#' Normalize observation variables.
#'
#' \code{normalize} normalizes observation variables based on the specified normalization method.
#'
#' @param population tbl with grouping (metadata) and observation variables.
#' @param variables character vector specifying observation variables.
#' @param strata character vector specifying grouping variables for grouping prior to normalization.
#' @param operation optional character string specifying method for normalization. This must be one of the strings \code{"standardize"} (default), \code{"robustize"}.
#' @param sample tbl containing sample that is used by normalization methods to estimate parameters. \code{sample} has same structure as \code{population}. Typically, \code{sample} corresponds to controls in the experiment.
#' @param ... arguments passed to normalization operation
#'
#' @return normalized data of the same class as \code{population}.
#'
#' @importFrom magrittr %>%
#' @importFrom magrittr %<>%
#' @importFrom rlang :=
#' @importFrom stats cor mad median sd setNames
#'
#' @examples
#' suppressMessages(suppressWarnings(library(magrittr)))
#' population <- tibble::data_frame(
#' Metadata_group = c("control", "control", "control", "control",
#' "experiment", "experiment", "experiment", "experiment"),
#' Metadata_batch = c("a", "a", "b", "b", "a", "a", "b", "b"),
#' AreaShape_Area = c(10, 12, 15, 16, 8, 8, 7, 7)
#' )
#' variables <- c('AreaShape_Area')
#' strata <- c('Metadata_batch')
#' sample <- population %>% dplyr::filter(Metadata_group == 'control')
#' cytominer::normalize(population, variables, strata, sample, operation = "standardize")
#'
#' @export
normalize <- function(population, variables, strata, sample,
operation = "standardize", ...) {
scale <- function(data, location, dispersion, variables) {
if (is.data.frame(data)) {
futile.logger::flog.debug(paste0(
"\t\tUsing base::scale (data is ",
paste(class(data), collapse = ","),
")"
))
dplyr::bind_cols(
data %>% dplyr::select_(~-dplyr::one_of(variables)),
data %>%
dplyr::select_(.dots = variables) %>%
as.matrix() %>%
base::scale(
center = as.matrix(location),
scale = as.matrix(dispersion)
) %>%
tibble::as_data_frame()
)
} else {
futile.logger::flog.debug(paste0(
"\t\tNot using base::scale (data is ",
paste(class(data), collapse = ","),
")"
))
for (variable in variables) {
x <- rlang::sym(variable)
m <- location[[variable]]
s <- dispersion[[variable]]
data %<>%
dplyr::mutate(!! x := ((!! x) - m) / s)
}
data
}
}
sample_is_df <- is.data.frame(sample)
if (operation == "robustize") {
location <- dplyr::funs(median(., na.rm = TRUE))
dispersion <- dplyr::funs(mad(., na.rm = TRUE))
} else if (operation == "standardize") {
location <- dplyr::funs(mean(., na.rm = TRUE))
dispersion <- dplyr::funs(sd(., na.rm = TRUE))
} else {
error <- paste0("undefined operation `", operation, "'")
futile.logger::flog.error(msg = error)
stop(error)
}
futile.logger::flog.debug("Creating temp table for sample")
sample %<>% dplyr::compute()
futile.logger::flog.debug("Created temp table for sample")
groups <-
sample %>%
dplyr::select_(.dots = strata) %>%
dplyr::distinct() %>%
dplyr::collect()
Reduce(
dplyr::union_all,
Map(
f = function(group) {
futile.logger::flog.debug(group)
futile.logger::flog.debug("\tstratum")
stratum <-
sample %>%
dplyr::inner_join(y = group, by = names(group), copy = TRUE) %>%
dplyr::compute()
futile.logger::flog.debug("\tlocation")
location <-
stratum %>%
dplyr::summarise_at(.funs = location, .vars = variables) %>%
dplyr::collect()
futile.logger::flog.debug("\tdispersion")
dispersion <-
stratum %>%
dplyr::summarise_at(.funs = dispersion, .vars = variables) %>%
dplyr::collect()
futile.logger::flog.debug("\tscale")
scaled <-
population %>%
dplyr::inner_join(y = group, by = names(group), copy = TRUE) %>%
scale(location, dispersion, variables)
futile.logger::flog.debug("\tscaled")
scaled
},
split(x = groups, f = seq(nrow(groups)))
)
)
}