-
-
Notifications
You must be signed in to change notification settings - Fork 15
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #101 from nathaneastwood/feat_add_pivot_functions
feat: Implement pivot_*() functions
- Loading branch information
Showing
9 changed files
with
713 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
Package: poorman | ||
Type: Package | ||
Title: A Poor Man's Dependency Free Recreation of 'dplyr' | ||
Version: 0.2.5.6 | ||
Version: 0.2.5.7 | ||
Authors@R: person("Nathan", "Eastwood", "", "[email protected]", | ||
role = c("aut", "cre")) | ||
Maintainer: Nathan Eastwood <[email protected]> | ||
|
@@ -18,7 +18,7 @@ Suggests: | |
tinytest | ||
License: MIT + file LICENSE | ||
Encoding: UTF-8 | ||
RoxygenNote: 7.1.1 | ||
RoxygenNote: 7.2.0 | ||
Roxygen: list(markdown = TRUE) | ||
VignetteBuilder: knitr | ||
Language: en-GB |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,138 @@ | ||
#' Pivot data from wide to long | ||
#' | ||
#' `pivot_longer()` "lengthens" data, increasing the number of rows and decreasing the number of columns. The inverse | ||
#' transformation is [pivot_wider()]. | ||
#' | ||
#' @param data `data.frame`. The data to pivot. | ||
#' @param cols <[`poor-select`][select_helpers]>. Columns to pivot into longer format. | ||
#' @param names_to `character(n)`. The name of the new column(s) that will contain the column names. | ||
#' @param names_prefix `character(1)`. A regular expression used to remove matching text from the start of each variable | ||
#' name. | ||
#' @param names_sep,names_pattern `character(1)`. If `names_to` contains multiple values, this argument controls how the | ||
#' column name is broken up. `names_pattern` takes a regular expression containing matching groups (`()`). | ||
#' @param values_to `character(n)`. The name of the new column(s) that will contain the values of the pivoted variables. | ||
#' @param values_drop_na `logical(1)`. If `TRUE`, will drop rows that contain only `NA` in the `values_to` column. This | ||
#' effectively converts explicit missing values to implicit missing values, and should generally be used only when | ||
#' missing values in data were created by its structure. | ||
#' @param ... Additional arguments passed on to methods. | ||
#' | ||
#' @return A `data.frame`. | ||
#' | ||
#' @examples | ||
#' wide_data <- data.frame(replicate(5, rnorm(10))) | ||
#' # Customizing the names | ||
#' pivot_longer( | ||
#' data = wide_data, | ||
#' cols = c(1, 2), | ||
#' names_to = "Column", | ||
#' values_to = "Numbers" | ||
#' ) | ||
#' | ||
#' @export | ||
pivot_longer <- function( | ||
data, | ||
cols, | ||
names_to = "name", | ||
names_prefix = NULL, | ||
names_sep = NULL, | ||
names_pattern = NULL, | ||
values_to = "value", | ||
values_drop_na = FALSE, | ||
... | ||
) { | ||
|
||
if (missing(cols)) { | ||
stop("`cols` must select at least one column.") | ||
} | ||
|
||
cols <- names(eval_select_pos(data, substitute(cols))) | ||
|
||
if (any(names_to %in% setdiff(names(data), cols))) { | ||
stop( | ||
paste0( | ||
"Some values of the columns specified in 'names_to' are already present | ||
as column names. Either use another value in `names_to` or rename the | ||
following columns: ", | ||
paste(names_to[which(names_to %in% setdiff(names(data), cols))], sep = ", ") | ||
), | ||
call. = FALSE) | ||
} | ||
|
||
# Sanity checks ---------------- | ||
|
||
# nothing to select? | ||
if (length(cols) == 0L) { | ||
stop("No columns found for reshaping data.", call. = FALSE) | ||
} | ||
|
||
# Reshaping --------------------- | ||
# Create Index column as needed by reshape | ||
data[["_Row"]] <- as.numeric(rownames(data)) | ||
|
||
# Create a new index for cases with length(names_to) > 1 | ||
names_to_2 <- paste(names_to, collapse = "_") | ||
|
||
# Reshape | ||
long <- stats::reshape( | ||
data, | ||
varying = cols, | ||
idvar = "_Row", | ||
v.names = values_to, | ||
timevar = names_to_2, | ||
direction = "long" | ||
) | ||
|
||
# Cleaning -------------------------- | ||
# Sort the dataframe (to match pivot_longer's output) | ||
long <- long[do.call(order, long[, c("_Row", names_to_2)]), ] | ||
|
||
long[["_Row"]] <- NULL | ||
|
||
# Re-insert col names as levels | ||
long[[names_to_2]] <- cols[long[[names_to_2]]] | ||
|
||
# if several variable in names_to, split the names either with names_sep or with names_pattern | ||
if (length(names_to) > 1) { | ||
for (i in seq_along(names_to)) { | ||
if (is.null(names_pattern)) { | ||
new_vals <- unlist(lapply( | ||
strsplit(unique(long[[names_to_2]]), names_sep, fixed = TRUE), | ||
function(x) x[i] | ||
)) | ||
long[[names_to[i]]] <- new_vals | ||
} else { | ||
colPattern <- regmatches( | ||
x = unique(long[[names_to_2]]), | ||
m = regexec(names_pattern, unique(long[[names_to_2]])) | ||
) | ||
colPattern <- as.data.frame(do.call(rbind, colPattern))[, c(1, i + 1)] | ||
names(colPattern) <- c(names_to_2, names_to[i]) | ||
long <- left_join(x = long, y = colPattern, by = names_to_2) | ||
} | ||
} | ||
long[[names_to_2]] <- NULL | ||
} | ||
|
||
# reorder | ||
long <- relocate(.data = long, values_to, .after = -1) | ||
|
||
# remove names prefix if specified | ||
if (!is.null(names_prefix)) { | ||
if (length(names_to) > 1) { | ||
stop("`names_prefix` only works when `names_to` is of length 1.", call. = FALSE) | ||
} | ||
long[[names_to]] <- gsub(paste0("^", names_prefix), "", long[[names_to]]) | ||
} | ||
|
||
if (values_drop_na) { | ||
long <- long[!is.na(long[, values_to]), ] | ||
} | ||
|
||
# Reset row names | ||
rownames(long) <- NULL | ||
|
||
# Remove reshape attributes | ||
attributes(long)$reshapeLong <- NULL | ||
|
||
long | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,217 @@ | ||
#' Pivot data from long to wide | ||
#' | ||
#' `pivot_wider()` "widens" data, increasing the number of columns and decreasing the number of rows. The inverse | ||
#' transformation is [pivot_longer()]. | ||
#' | ||
#' @param data `data.frame`. The data to pivot. | ||
#' @param id_cols `character(1)`. The name of the column that identifies the rows. If `NULL`, it will use all the unique | ||
#' rows. | ||
#' @param names_from `character(n)`. The name of the column(s) that contains the levels to be used as future column | ||
#' names. | ||
#' @param names_prefix `character(1)`. String added to the start of every variable name. This is particularly useful if | ||
#' `names_from` is a numeric vector and you want to create syntactic variable names. | ||
#' @param names_sep `character(1)`. If `names_from` or `values_from` contains multiple variables, this will be used to | ||
#' join their values together into a single string to use as a column name. | ||
#' @param names_glue `character(1)`. Instead of `names_sep` and `names_prefix`, you can supply a | ||
#' [glue specification](https://glue.tidyverse.org/index.html) that uses the `names_from` columns to create custom | ||
#' column names. Note that the only delimiters supported by `names_glue` are curly brackets, `{` and `}`. | ||
#' @param values_from `character(n)`. The name of the column that contains the values to be used as future variable | ||
#' values. | ||
#' @param values_fill `numeric(n)`. Optionally, a (scalar) value that will be used to replace missing values in the new | ||
#' columns created. | ||
#' @param ... Not used for now. | ||
#' | ||
#' @return If a tibble was provided as input, `pivot_wider()` also returns a | ||
#' tibble. Otherwise, it returns a data frame. | ||
#' | ||
#' @examples | ||
#' data_long <- read.table(header = TRUE, text = " | ||
#' subject sex condition measurement | ||
#' 1 M control 7.9 | ||
#' 1 M cond1 12.3 | ||
#' 1 M cond2 10.7 | ||
#' 2 F control 6.3 | ||
#' 2 F cond1 10.6 | ||
#' 2 F cond2 11.1 | ||
#' 3 F control 9.5 | ||
#' 3 F cond1 13.1 | ||
#' 3 F cond2 13.8 | ||
#' 4 M control 11.5 | ||
#' 4 M cond1 13.4 | ||
#' 4 M cond2 12.9") | ||
#' | ||
#' | ||
#' pivot_wider( | ||
#' data_long, | ||
#' id_cols = "subject", | ||
#' names_from = "condition", | ||
#' values_from = "measurement" | ||
#' ) | ||
#' | ||
#' pivot_wider( | ||
#' data_long, | ||
#' id_cols = "subject", | ||
#' names_from = "condition", | ||
#' values_from = "measurement", | ||
#' names_prefix = "Var.", | ||
#' names_sep = "." | ||
#' ) | ||
#' | ||
#' production <- expand.grid( | ||
#' product = c("A", "B"), | ||
#' country = c("AI", "EI"), | ||
#' year = 2000:2014 | ||
#' ) | ||
#' production <- filter(production, (product == "A" & country == "AI") | product == "B") | ||
#' | ||
#' production$production <- rnorm(nrow(production)) | ||
#' | ||
#' pivot_wider( | ||
#' production, | ||
#' names_from = c("product", "country"), | ||
#' values_from = "production", | ||
#' names_glue = "prod_{product}_{country}" | ||
#' ) | ||
#' | ||
#' @export | ||
pivot_wider <- function( | ||
data, | ||
id_cols = NULL, | ||
values_from = "Value", | ||
names_from = "Name", | ||
names_sep = "_", | ||
names_prefix = "", | ||
names_glue = NULL, | ||
values_fill = NULL, | ||
... | ||
) { | ||
|
||
old_names <- names(data) | ||
|
||
# Preserve attributes | ||
variable_attr <- lapply(data, attributes) | ||
|
||
# Create an id for stats::reshape | ||
if (is.null(id_cols)) { | ||
data[["_Rows"]] <- apply( | ||
X = data[, !names(data) %in% c(values_from, names_from), drop = FALSE], | ||
MARGIN = 1, | ||
FUN = paste, | ||
collapse = "_" | ||
) | ||
id_cols <- "_Rows" | ||
} | ||
|
||
# create pattern of column names - stats::reshape renames columns that concatenates "v.names" + values - we only want | ||
# values | ||
current_colnames <- colnames(data) | ||
current_colnames <- current_colnames[current_colnames != "_Rows"] | ||
if (is.null(names_glue)) { | ||
future_colnames <- unique(apply(data, 1, function(x) paste(x[c(names_from)], collapse = names_sep))) | ||
} else { | ||
vars <- regmatches(names_glue, gregexpr("\\{\\K[^{}]+(?=\\})", names_glue, perl = TRUE))[[1]] | ||
tmp_data <- unique(data[, vars]) | ||
future_colnames <- unique(apply(tmp_data, 1, function(x) { | ||
tmp_vars <- list() | ||
for (i in seq_along(vars)) { | ||
tmp_vars[[i]] <- x[vars[i]] | ||
} | ||
|
||
tmp_colname <- gsub("\\{\\K[^{}]+(?=\\})", "", names_glue, perl = TRUE) | ||
tmp_colname <- gsub("\\{\\}", "%s", tmp_colname) | ||
do.call(sprintf, c(fmt = tmp_colname, tmp_vars)) | ||
})) | ||
} | ||
|
||
# stop if some column names would be duplicated (follow tidyr workflow) | ||
if (any(future_colnames %in% current_colnames)) { | ||
stop( | ||
paste0( | ||
"Some values of the columns specified in 'names_from' are already present | ||
as column names. Either use `name_prefix` or rename the following columns: ", | ||
paste(current_colnames[which(current_colnames %in% future_colnames)], sep = ", ") | ||
), | ||
call. = FALSE | ||
) | ||
} | ||
|
||
# stats::reshape works strangely when several variables are in idvar/timevar so we unite all ids in a single temporary | ||
# column that will be used by stats::reshape | ||
data$new_time <- apply(data, 1, function(x) paste(x[names_from], collapse = "_")) | ||
data[, names_from] <- NULL | ||
|
||
wide <- stats::reshape( | ||
data, | ||
v.names = values_from, | ||
idvar = id_cols, | ||
timevar = "new_time", | ||
sep = names_sep, | ||
direction = "wide" | ||
) | ||
|
||
# Clean | ||
if ("_Rows" %in% names(wide)) wide[["_Rows"]] <- NULL | ||
rownames(wide) <- NULL | ||
|
||
if (length(values_from) == 1) { | ||
to_rename <- which(startsWith(names(wide), paste0(values_from, names_sep))) | ||
names(wide)[to_rename] <- future_colnames | ||
} | ||
|
||
# Order columns as in tidyr | ||
if (length(values_from) > 1) { | ||
for (i in values_from) { | ||
tmp1 <- wide[, which(!startsWith(names(wide), i))] | ||
tmp2 <- wide[, which(startsWith(names(wide), i))] | ||
wide <- cbind(tmp1, tmp2) | ||
# doesn't work | ||
# wide <- relocate(wide, starts_with(i), .after = -1) | ||
} | ||
} | ||
|
||
new_cols <- setdiff(names(wide), old_names) | ||
names(wide)[which(names(wide) %in% new_cols)] <- paste0(names_prefix, new_cols) | ||
|
||
# Fill missing values | ||
if (!is.null(values_fill)) { | ||
if (length(values_fill) == 1) { | ||
if (is.numeric(wide[[new_cols[1]]])) { | ||
if (!is.numeric(values_fill)) { | ||
stop(paste0("`values_fill` must be of type numeric."), call. = FALSE) | ||
} else { | ||
for (i in new_cols) { | ||
wide[[i]] <- replace_na(wide[[i]], replace = values_fill) | ||
} | ||
} | ||
} else if (is.character(wide[[new_cols[1]]])) { | ||
if (!is.character(values_fill)) { | ||
stop(paste0("`values_fill` must be of type character."), call. = FALSE) | ||
} else { | ||
for (i in new_cols) { | ||
wide[[i]] <- replace_na(wide[[i]], replace = values_fill) | ||
} | ||
} | ||
} else if (is.factor(wide[[new_cols[1]]])) { | ||
if (!is.factor(values_fill)) { | ||
stop(paste0("`values_fill` must be of type factor."), call. = FALSE) | ||
} else { | ||
for (i in new_cols) { | ||
wide[[i]] <- replace_na(wide[[i]], replace = values_fill) | ||
} | ||
} | ||
} | ||
} else { | ||
stop("`values_fill` must be of length 1.", call. = FALSE) | ||
} | ||
} | ||
|
||
# Remove reshape attributes | ||
attributes(wide)$reshapeWide <- NULL | ||
|
||
# add back attributes where possible | ||
for (i in colnames(wide)) { | ||
attributes(wide[[i]]) <- variable_attr[[i]] | ||
} | ||
|
||
wide | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.