Merge pull request #101 from nathaneastwood/feat_add_pivot_functions

feat: Implement pivot_*() functions
nathaneastwood · Jul 31, 2022 · ba96eb7 · ba96eb7
2 parents db58e5a + 291ace6
commit ba96eb7
Show file tree

Hide file tree

Showing 9 changed files with 713 additions and 2 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: poorman
 Type: Package
 Title: A Poor Man's Dependency Free Recreation of 'dplyr'
-Version: 0.2.5.6
+Version: 0.2.5.7
 Authors@R: person("Nathan", "Eastwood", "", "[email protected]",
     role = c("aut", "cre"))
 Maintainer: Nathan Eastwood <[email protected]>
@@ -18,7 +18,7 @@ Suggests:
     tinytest
 License: MIT + file LICENSE
 Encoding: UTF-8
-RoxygenNote: 7.1.1
+RoxygenNote: 7.2.0
 Roxygen: list(markdown = TRUE)
 VignetteBuilder: knitr
 Language: en-GB
diff --git a/NAMESPACE b/NAMESPACE
@@ -118,6 +118,8 @@ export(ntile)
 export(num_range)
 export(peek_vars)
 export(percent_rank)
+export(pivot_longer)
+export(pivot_wider)
 export(pull)
 export(recode)
 export(recode_factor)

diff --git a/R/pivot_longer.R b/R/pivot_longer.R
@@ -0,0 +1,138 @@
+#' Pivot data from wide to long
+#'
+#' `pivot_longer()` "lengthens" data, increasing the number of rows and decreasing the number of columns. The inverse
+#' transformation is [pivot_wider()].
+#'
+#' @param data `data.frame`. The data to pivot.
+#' @param cols <[`poor-select`][select_helpers]>. Columns to pivot into longer format.
+#' @param names_to `character(n)`. The name of the new column(s) that will contain the column names.
+#' @param names_prefix `character(1)`. A regular expression used to remove matching text from the start of each variable
+#' name.
+#' @param names_sep,names_pattern `character(1)`. If `names_to` contains multiple values, this argument controls how the
+#' column name is broken up. `names_pattern` takes a regular expression containing matching groups (`()`⁠).
+#' @param values_to `character(n)`. The name of the new column(s) that will contain the values of the pivoted variables.
+#' @param values_drop_na `logical(1)`. If `TRUE`, will drop rows that contain only `NA` in the `values_to` column. This
+#' effectively converts explicit missing values to implicit missing values, and should generally be used only when
+#' missing values in data were created by its structure.
+#' @param ... Additional arguments passed on to methods.
+#'
+#' @return A `data.frame`.
+#'
+#' @examples
+#' wide_data <- data.frame(replicate(5, rnorm(10)))
+#' # Customizing the names
+#' pivot_longer(
+#'   data = wide_data,
+#'   cols = c(1, 2),
+#'   names_to = "Column",
+#'   values_to = "Numbers"
+#' )
+#'
+#' @export
+pivot_longer <- function(
+  data,
+  cols,
+  names_to = "name",
+  names_prefix = NULL,
+  names_sep = NULL,
+  names_pattern = NULL,
+  values_to = "value",
+  values_drop_na = FALSE,
+  ...
+) {
+
+  if (missing(cols)) {
+    stop("`cols` must select at least one column.")
+  }
+
+  cols <- names(eval_select_pos(data, substitute(cols)))
+
+  if (any(names_to %in% setdiff(names(data), cols))) {
+    stop(
+      paste0(
+        "Some values of the columns specified in 'names_to' are already present
+        as column names. Either use another value in `names_to` or rename the
+        following columns: ",
+        paste(names_to[which(names_to %in% setdiff(names(data), cols))], sep = ", ")
+      ),
+      call. = FALSE)
+  }
+
+  # Sanity checks ----------------
+
+  # nothing to select?
+  if (length(cols) == 0L) {
+    stop("No columns found for reshaping data.", call. = FALSE)
+  }
+
+  # Reshaping ---------------------
+  # Create Index column as needed by reshape
+  data[["_Row"]] <- as.numeric(rownames(data))
+
+  # Create a new index for cases with length(names_to) > 1
+  names_to_2 <- paste(names_to, collapse = "_")
+
+  # Reshape
+  long <- stats::reshape(
+    data,
+    varying = cols,
+    idvar = "_Row",
+    v.names = values_to,
+    timevar = names_to_2,
+    direction = "long"
+  )
+
+  # Cleaning --------------------------
+  # Sort the dataframe (to match pivot_longer's output)
+  long <- long[do.call(order, long[, c("_Row", names_to_2)]), ]
+
+  long[["_Row"]] <- NULL
+
+  # Re-insert col names as levels
+  long[[names_to_2]] <- cols[long[[names_to_2]]]
+
+  # if several variable in names_to, split the names either with names_sep or with names_pattern
+  if (length(names_to) > 1) {
+    for (i in seq_along(names_to)) {
+      if (is.null(names_pattern)) {
+        new_vals <- unlist(lapply(
+          strsplit(unique(long[[names_to_2]]), names_sep, fixed = TRUE),
+          function(x) x[i]
+        ))
+        long[[names_to[i]]] <- new_vals
+      } else {
+        colPattern <- regmatches(
+          x = unique(long[[names_to_2]]),
+          m = regexec(names_pattern, unique(long[[names_to_2]]))
+        )
+        colPattern <- as.data.frame(do.call(rbind, colPattern))[, c(1, i + 1)]
+        names(colPattern) <- c(names_to_2, names_to[i])
+        long <- left_join(x = long, y = colPattern, by = names_to_2)
+      }
+    }
+    long[[names_to_2]] <- NULL
+  }
+
+  # reorder
+  long <- relocate(.data = long, values_to, .after = -1)
+
+  # remove names prefix if specified
+  if (!is.null(names_prefix)) {
+    if (length(names_to) > 1) {
+      stop("`names_prefix` only works when `names_to` is of length 1.", call. = FALSE)
+    }
+    long[[names_to]] <- gsub(paste0("^", names_prefix), "", long[[names_to]])
+  }
+
+  if (values_drop_na) {
+    long <- long[!is.na(long[, values_to]), ]
+  }
+
+  # Reset row names
+  rownames(long) <- NULL
+
+  # Remove reshape attributes
+  attributes(long)$reshapeLong <- NULL
+
+  long
+}
diff --git a/R/pivot_wider.R b/R/pivot_wider.R
@@ -0,0 +1,217 @@
+#' Pivot data from long to wide
+#'
+#' `pivot_wider()` "widens" data, increasing the number of columns and decreasing the number of rows. The inverse
+#' transformation is [pivot_longer()].
+#'
+#' @param data `data.frame`. The data to pivot.
+#' @param id_cols `character(1)`. The name of the column that identifies the rows. If `NULL`, it will use all the unique
+#' rows.
+#' @param names_from `character(n)`. The name of the column(s) that contains the levels to be used as future column
+#' names.
+#' @param names_prefix `character(1)`. String added to the start of every variable name. This is particularly useful if
+#' `names_from` is a numeric vector and you want to create syntactic variable names.
+#' @param names_sep `character(1)`. If `names_from` or `values_from` contains multiple variables, this will be used to
+#' join their values together into a single string to use as a column name.
+#' @param names_glue `character(1)`. Instead of `names_sep` and `names_prefix`, you can supply a
+#' [glue specification](https://glue.tidyverse.org/index.html) that uses the `names_from` columns to create custom
+#' column names. Note that the only delimiters supported by `names_glue` are curly brackets, `{` and `}`.
+#' @param values_from `character(n)`. The name of the column that contains the values to be used as future variable
+#' values.
+#' @param values_fill `numeric(n)`. Optionally, a (scalar) value that will be used to replace missing values in the new
+#' columns created.
+#' @param ... Not used for now.
+#'
+#' @return If a tibble was provided as input, `pivot_wider()` also returns a
+#' tibble. Otherwise, it returns a data frame.
+#'
+#' @examples
+#' data_long <- read.table(header = TRUE, text = "
+#'  subject sex condition measurement
+#'        1   M   control         7.9
+#'        1   M     cond1        12.3
+#'        1   M     cond2        10.7
+#'        2   F   control         6.3
+#'        2   F     cond1        10.6
+#'        2   F     cond2        11.1
+#'        3   F   control         9.5
+#'        3   F     cond1        13.1
+#'        3   F     cond2        13.8
+#'        4   M   control        11.5
+#'        4   M     cond1        13.4
+#'        4   M     cond2        12.9")
+#'
+#'
+#' pivot_wider(
+#'   data_long,
+#'   id_cols = "subject",
+#'   names_from = "condition",
+#'   values_from = "measurement"
+#' )
+#'
+#' pivot_wider(
+#'   data_long,
+#'   id_cols = "subject",
+#'   names_from = "condition",
+#'   values_from = "measurement",
+#'   names_prefix = "Var.",
+#'   names_sep = "."
+#' )
+#'
+#' production <- expand.grid(
+#'   product = c("A", "B"),
+#'   country = c("AI", "EI"),
+#'   year = 2000:2014
+#' )
+#' production <- filter(production, (product == "A" & country == "AI") | product == "B")
+#'
+#' production$production <- rnorm(nrow(production))
+#'
+#' pivot_wider(
+#'   production,
+#'   names_from = c("product", "country"),
+#'   values_from = "production",
+#'   names_glue = "prod_{product}_{country}"
+#' )
+#'
+#' @export
+pivot_wider <- function(
+  data,
+  id_cols = NULL,
+  values_from = "Value",
+  names_from = "Name",
+  names_sep = "_",
+  names_prefix = "",
+  names_glue = NULL,
+  values_fill = NULL,
+  ...
+) {
+
+  old_names <- names(data)
+
+  # Preserve attributes
+  variable_attr <- lapply(data, attributes)
+
+  # Create an id for stats::reshape
+  if (is.null(id_cols)) {
+    data[["_Rows"]] <- apply(
+      X = data[, !names(data) %in% c(values_from, names_from), drop = FALSE],
+      MARGIN = 1,
+      FUN = paste,
+      collapse = "_"
+    )
+    id_cols <- "_Rows"
+  }
+
+  # create pattern of column names - stats::reshape renames columns that concatenates "v.names" + values - we only want
+  # values
+  current_colnames <- colnames(data)
+  current_colnames <- current_colnames[current_colnames != "_Rows"]
+  if (is.null(names_glue)) {
+    future_colnames <- unique(apply(data, 1, function(x) paste(x[c(names_from)], collapse = names_sep)))
+  } else {
+    vars <- regmatches(names_glue, gregexpr("\\{\\K[^{}]+(?=\\})", names_glue, perl = TRUE))[[1]]
+    tmp_data <- unique(data[, vars])
+    future_colnames <- unique(apply(tmp_data, 1, function(x) {
+      tmp_vars <- list()
+      for (i in seq_along(vars)) {
+        tmp_vars[[i]] <- x[vars[i]]
+      }
+
+      tmp_colname <- gsub("\\{\\K[^{}]+(?=\\})", "", names_glue, perl = TRUE)
+      tmp_colname <- gsub("\\{\\}", "%s", tmp_colname)
+      do.call(sprintf, c(fmt = tmp_colname, tmp_vars))
+    }))
+  }
+
+  # stop if some column names would be duplicated (follow tidyr workflow)
+  if (any(future_colnames %in% current_colnames)) {
+    stop(
+      paste0(
+        "Some values of the columns specified in 'names_from' are already present
+        as column names. Either use `name_prefix` or rename the following columns: ",
+        paste(current_colnames[which(current_colnames %in% future_colnames)], sep = ", ")
+      ),
+      call. = FALSE
+    )
+  }
+
+  # stats::reshape works strangely when several variables are in idvar/timevar so we unite all ids in a single temporary
+  # column that will be used by stats::reshape
+  data$new_time <- apply(data, 1, function(x) paste(x[names_from], collapse = "_"))
+  data[, names_from] <- NULL
+
+  wide <- stats::reshape(
+    data,
+    v.names = values_from,
+    idvar = id_cols,
+    timevar = "new_time",
+    sep = names_sep,
+    direction = "wide"
+  )
+
+  # Clean
+  if ("_Rows" %in% names(wide)) wide[["_Rows"]] <- NULL
+  rownames(wide) <- NULL
+
+  if (length(values_from) == 1) {
+    to_rename <- which(startsWith(names(wide), paste0(values_from, names_sep)))
+    names(wide)[to_rename] <- future_colnames
+  }
+
+  # Order columns as in tidyr
+  if (length(values_from) > 1) {
+    for (i in values_from) {
+      tmp1 <- wide[, which(!startsWith(names(wide), i))]
+      tmp2 <- wide[, which(startsWith(names(wide), i))]
+      wide <- cbind(tmp1, tmp2)
+      # doesn't work
+      # wide <- relocate(wide, starts_with(i), .after = -1)
+    }
+  }
+
+  new_cols <- setdiff(names(wide), old_names)
+  names(wide)[which(names(wide) %in% new_cols)] <- paste0(names_prefix, new_cols)
+
+  # Fill missing values
+  if (!is.null(values_fill)) {
+    if (length(values_fill) == 1) {
+      if (is.numeric(wide[[new_cols[1]]])) {
+        if (!is.numeric(values_fill)) {
+          stop(paste0("`values_fill` must be of type numeric."), call. = FALSE)
+        } else {
+          for (i in new_cols) {
+            wide[[i]] <- replace_na(wide[[i]], replace = values_fill)
+          }
+        }
+      } else if (is.character(wide[[new_cols[1]]])) {
+        if (!is.character(values_fill)) {
+          stop(paste0("`values_fill` must be of type character."), call. = FALSE)
+        } else {
+          for (i in new_cols) {
+            wide[[i]] <- replace_na(wide[[i]], replace = values_fill)
+          }
+        }
+      } else if (is.factor(wide[[new_cols[1]]])) {
+        if (!is.factor(values_fill)) {
+          stop(paste0("`values_fill` must be of type factor."), call. = FALSE)
+        } else {
+          for (i in new_cols) {
+            wide[[i]] <- replace_na(wide[[i]], replace = values_fill)
+          }
+        }
+      }
+    } else {
+      stop("`values_fill` must be of length 1.", call. = FALSE)
+    }
+  }
+
+  # Remove reshape attributes
+  attributes(wide)$reshapeWide <- NULL
+
+  # add back attributes where possible
+  for (i in colnames(wide)) {
+    attributes(wide[[i]]) <- variable_attr[[i]]
+  }
+
+  wide
+}
diff --git a/ci/check_rhub.R b/ci/check_rhub.R
@@ -10,6 +10,7 @@ if (!is.element(platform, c("cran", rhub::platforms()[[1L]]))) {
 }
 
 install.packages("knitr")
+install.packages("rmarkdown")
 if (platform == "cran") {
   system("apt-get update && apt-get -y install libxml2-dev")
   install.packages("xml2")