easystats · etiennebacher · Oct 10, 2022 · Oct 3, 2022 · Oct 4, 2022 · Oct 4, 2022
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: datawizard
 Title: Easy Data Wrangling and Statistical Transformations
-Version: 0.6.2
+Version: 0.6.2.1
 Authors@R: c(
     person("Indrajeet", "Patil", , "[email protected]", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0003-1995-6531", Twitter = "@patilindrajeets")),

diff --git a/NAMESPACE b/NAMESPACE
@@ -220,6 +220,8 @@ export(kurtosis)
 export(normalize)
 export(object_has_names)
 export(object_has_rownames)
+export(old_data_to_long)
+export(old_data_to_wide)
 export(print_html)
 export(print_md)
 export(ranktransform)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,10 @@
+# datawizard (development version)
+
+MAJOR CHANGES
+
+* `data_to_long()` and `data_to_wide()` have had significant performance improvements,
+  sometimes as high as a ten-fold speedup.
+
 # datawizard 0.6.2
 
 BREAKING CHANGES

diff --git a/R/data_to_long.R b/R/data_to_long.R
@@ -0,0 +1,282 @@
+#' Reshape (pivot) data from wide to long
+#'
+#' This function "lengthens" data, increasing the number of rows and decreasing
+#' the number of columns. This is a dependency-free base-R equivalent of
+#' `tidyr::pivot_longer()`.
+#'
+#' @param data A data frame to pivot.
+#' @param names_to The name of the new column that will contain the column
+#'   names.
+#' @param names_prefix A regular expression used to remove matching text from
+#' the start of each variable name.
+#' @param names_sep,names_pattern If `names_to` contains multiple values, these
+#' arguments control how the column name is broken up. `names_sep` is the separator
+#' character to split on; `names_pattern` is a regular expression with groups, i.e. "()".
+#' @param values_to The name of the new column that will contain the values of
+#'   the pivoted variables.
+#' @param values_drop_na If `TRUE`, will drop rows that contain only `NA` in the
+#'   `values_to` column. This effectively converts explicit missing values to
+#'   implicit missing values, and should generally be used only when missing values
+#'   in data were created by its structure.
+#' @param rows_to The name of the column that will contain the row names or row
+#'   numbers from the original data. If `NULL`, will be removed.
+#' @param ... Currently not used.
+#' @inheritParams find_columns
+#' @param cols Identical to `select`. This argument is here to ensure compatibility
+#'   with `tidyr::pivot_longer()`. If both `select` and `cols` are provided, `cols`
+#'   is used.
+#' @param colnames_to Deprecated. Use `names_to` instead.
+#'
+#' @return If a tibble was provided as input, `reshape_longer()` also returns a
+#' tibble. Otherwise, it returns a data frame.
+#'
+#' @examples
+#' \donttest{
+#' wide_data <- data.frame(replicate(5, rnorm(10)))
+#'
+#' # Default behaviour (equivalent to tidyr::pivot_longer(wide_data, cols = 1:5))
+#' data_to_long(wide_data)
+#'
+#' # Customizing the names
+#' data_to_long(wide_data,
+#'   select = c(1, 2),
+#'   names_to = "Column",
+#'   values_to = "Numbers",
+#'   rows_to = "Row"
+#' )
+#'
+#' # Full example
+#' # ------------------
+#' if (require("psych")) {
+#'   data <- psych::bfi # Wide format with one row per participant's personality test
+#'
+#'   # Pivot long format
+#'   data_to_long(data,
+#'     select = regex("\\d"), # Select all columns that contain a digit
+#'     names_to = "Item",
+#'     values_to = "Score",
+#'     rows_to = "Participant"
+#'   )
+#'
+#'   if (require("tidyr")) {
+#'     reshape_longer(
+#'       tidyr::who,
+#'       select = new_sp_m014:newrel_f65,
+#'       names_to = c("diagnosis", "gender", "age"),
+#'       names_pattern = "new_?(.*)_(.)(.*)",
+#'       values_to = "count"
+#'     )
+#'   }
+#' }
+#' }
+#'
+#' @inherit data_rename seealso
+#' @export
+
+data_to_long <- function(
+    data,
+    select = "all",
+    names_to = "name",
+    names_prefix = NULL,
+    names_sep = NULL,
+    names_pattern = NULL,
+    values_to = "value",
+    values_drop_na = FALSE,
+    rows_to = NULL,
+    ignore_case = FALSE,
+    regex = FALSE,
+    ...,
+    cols,
+    colnames_to
+  ){
+  # Deprecated `colnames_to`: honour it unless `names_to` was given explicitly.
+  # (`names_to` defaults to "name", so testing only for NULL would ignore it.)
+  if (!missing(colnames_to)) {
+    .is_deprecated("colnames_to", "names_to")
+    if (missing(names_to) || is.null(names_to)) {
+      names_to <- colnames_to
+    }
+  }
+
+  # Prefer "cols" over "select" for compat with tidyr::pivot_longer
+  if (!missing(cols)) {
+    select <- substitute(cols)
+    cols <- .select_nse(
+      select,
+      data,
+      exclude = NULL,
+      ignore_case = ignore_case,
+      regex = regex,
+      verbose = FALSE
+    )
+  } else {
+    if (!missing(select) || !is.null(select)) {
+      cols <- .select_nse(
+        select,
+        data,
+        exclude = NULL,
+        ignore_case = ignore_case,
+        regex = regex,
+        verbose = FALSE
+      )
+    } else {
+      insight::format_error(
+        "You need to specify columns to pivot, either with `select` or `cols`."
+      )
+    }
+  }
+
+  if (length(names_to) > 1 && is.null(names_sep) && is.null(names_pattern)) {
+    insight::format_error(
+      "If you supply multiple names in `names_to`, you must also supply one of `names_sep` or `names_pattern`."
+    )
+  }
+
+  # Remove tidyverse attributes, will add them back at the end
+  if (inherits(data, "tbl_df")) {
+    tbl_input <- TRUE
+    data <- as.data.frame(data, stringsAsFactors = FALSE)
+  } else {
+    tbl_input <- FALSE
+  }
+
+  # nothing to select? Checked before the naming checks so that an empty
+  # selection is reported as such, and with the same error helper as elsewhere.
+  if (!length(cols)) {
+    insight::format_error("No columns found for reshaping data.")
+  }
+
+  # columns that are not pivoted; they are repeated alongside the stacked values
+  not_selected <- setdiff(names(data), cols)
+
+  # new name columns must not clash with columns that are kept
+  clashing_names <- names_to[names_to %in% not_selected]
+  if (length(clashing_names) > 0) {
+    insight::format_error(
+      "Some values of the columns specified in 'names_to' are already present as column names.",
+      paste0(
+        "Either use another value in `names_to` or rename the following columns: ",
+        text_concatenate(clashing_names)
+      )
+    )
+  }
+
+  # create a temp id so that we know how to rearrange the rows once the data is
+  # stacked
+  not_stacked <- data[, not_selected, drop = FALSE]
+  not_stacked[["_Rows"]] <- coerce_to_numeric(row.names(data))
+
+  # stack the selected columns (column order swapped: names first, then values)
+  stacked_data <- .stack(data[, cols, drop = FALSE])[, 2:1]
+
+  # reorder the rows to have a repeated sequence when all vars are selected to
+  # pivot
+  #
+  # See with following example:
+  # wide_data <- data.frame(replicate(5, rnorm(10)))
+  # data_to_long(wide_data)
+
+  needs_to_rearrange <- length(not_selected) == 0 && is.null(rows_to)
+  if (isTRUE(needs_to_rearrange)) {
+    # https://stackoverflow.com/questions/73984957/efficient-way-to-reorder-rows-to-have-a-repeated-sequence
+    stacked_data <- stacked_data[c(
+      matrix(
+        seq_len(nrow(stacked_data)),
+        nrow = length(unique(stacked_data$ind)),
+        byrow = TRUE
+      )
+    ), ]
+    row.names(stacked_data) <- NULL
+  }
+
+  stacked_data <- data_rename(stacked_data, "values", values_to)
+
+  # several names in names_to: split the names via names_sep, else names_pattern
+  if (length(names_to) > 1) {
+    if (is.null(names_pattern)) {
+      # faster than strsplit
+      tmp <- utils::read.csv(
+        text = stacked_data$ind,
+        sep = names_sep,
+        stringsAsFactors = FALSE,
+        header = FALSE
+      )
+      names(tmp) <- paste0("V", seq_len(ncol(tmp)))
+      tmp[tmp == ""] <- NA
+
+      stacked_data$ind <- NULL
+      stacked_data <- cbind(tmp, stacked_data)
+    } else {
+      tmp <- regmatches(
+        unique(stacked_data$ind),
+        regexec(names_pattern, unique(stacked_data$ind))
+      )
+      tmp <- as.data.frame(do.call(rbind, tmp), stringsAsFactors = FALSE)
+      names(tmp) <- c("ind", names_to)
+      # faster than merge
+      stacked_data <- cbind(stacked_data, tmp[match(stacked_data[["ind"]], tmp[["ind"]]), -1])
+      stacked_data$ind <- NULL
+    }
+  }
+
+  stacked_data <- data_relocate(stacked_data, select = values_to, after = -1)
+
+  # reunite unselected data with stacked data
+  out <- cbind(
+    not_stacked, stats::setNames(stacked_data, c(names_to, values_to)),
+    row.names = NULL
+  )
+
+  if (!is.null(names_prefix)) {
+    if (length(names_to) > 1) {
+      insight::format_error(
+        "`names_prefix` only works when `names_to` is of length 1."
+      )
+    }
+    out[[names_to]] <- gsub(paste0("^", names_prefix), "", out[[names_to]])
+  }
+
+  # rearrange the rows with the temp id
+  if (length(not_selected) > 0) {
+    out <- data_arrange(out, "_Rows")
+  }
+
+  # Remove or rename the row index
+  if (is.null(rows_to)) {
+    out[["_Rows"]] <- NULL
+  } else {
+    out <- data_rename(out, "_Rows", rows_to)
+  }
+
+  # drop rows whose pivoted value is NA; `drop = FALSE` always keeps a data frame
+  if (values_drop_na) {
+    out <- out[!is.na(out[[values_to]]), , drop = FALSE]
+  }
+
+  # add back tidyverse attributes
+  if (isTRUE(tbl_input)) {
+    class(out) <- c("tbl_df", "tbl", "data.frame")
+  }
+
+  # reset row names
+  if (.has_numeric_rownames(data)) {
+    row.names(out) <- NULL
+  }
+
+  out
+}
+
+
+#' Stack all columns of `x` into a two-column data frame: `values` and `ind`
+#' (the source column name). Adapted from `utils::stack()`, largely modified.
+#' @noRd
+.stack <- function(x) {
+  source_names <- rep(names(x), times = lengths(x))
+  stacked_values <- unlist(unname(x))
+  data.frame(values = stacked_values, ind = source_names, stringsAsFactors = FALSE)
+}
+
+#' @rdname data_to_long
+#' @export
+reshape_longer <- data_to_long # alias: binds the same function under a second name