Merge branch 'main' into rewrite-reshape

easystats · Oct 10, 2022 · bf87604 · bf87604
2 parents 60b9797 + 8698506
commit bf87604
Show file tree

Hide file tree

Showing 28 changed files with 206 additions and 123 deletions.
diff --git a/.github/workflows/draft-pdf.yaml b/.github/workflows/draft-pdf.yaml
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -16,14 +16,17 @@ Authors@R: c(
     person("Etienne", "Bacher", , "[email protected]", role = "aut",
            comment = c(ORCID = "0000-0002-9271-5075")),
     person("Rémi", "Thériault", , "[email protected]", role = "ctb",
-           comment = c(ORCID = "0000-0003-4315-6788", Twitter = "@rempsyc"))
+           comment = c(ORCID = "0000-0003-4315-6788", Twitter = "@rempsyc")),
+    person("Thomas J.", "Faulkenberry", , "[email protected]", role = "rev"),
+    person("Robert", "Garrett", , "[email protected]", role = "rev")
   )
 Maintainer: Indrajeet Patil <[email protected]>
 Description: A lightweight package to assist in key steps involved in any data 
     analysis workflow: (1) wrangling the raw data to get it in the needed form, 
     (2) applying preprocessing steps and statistical transformations, and 
     (3) compute statistical summaries of data properties and distributions. 
     It is also the data wrangling backend for packages in 'easystats' ecosystem.
+    References: Patil et al. (2022) <doi:10.21105/joss.04684>.
 License: GPL (>= 3)
 URL: https://easystats.github.io/datawizard/
 BugReports: https://github.com/easystats/datawizard/issues

diff --git a/NEWS.md b/NEWS.md
@@ -1,10 +1,19 @@
-# datawizard (development version)
+# datawizard 0.6.2.1
 
 MAJOR CHANGES
 
+* There is a new publication about the `{datawizard}` package:
+  Patil et al. (2022) <doi:10.21105/joss.04684>.
+
 * `data_to_long()` and `data_to_wide()` have had significant performance improvements,
   sometimes as high as a ten-fold speedup.
 
+MINOR CHANGES
+
+* When column names are misspelled, most functions now suggest existing
+  columns that may have been meant.
+
 # datawizard 0.6.2
 
 BREAKING CHANGES
@@ -17,14 +26,16 @@ BREAKING CHANGES
   `remove_empty_rows()` remove observations that completely have missing or
   empty character values.
 
-CHANGES
-
-* `data_arrange()` now works with data frames that were grouped using
-  `data_group()` (#274).
+MINOR CHANGES
 
 * `data_read()` gains a `convert_factors` argument, to turn off automatic
   conversion from numeric variables into factors.
 
+BUG FIXES
+
+* `data_arrange()` now works with data frames that were grouped using
+  `data_group()` (#274).
+
 # datawizard 0.6.1
 
 * Updates tests for upcoming changes in the `{tidyselect}` package (#267).

diff --git a/R/data_arrange.R b/R/data_arrange.R
@@ -42,7 +42,7 @@ data_arrange.default <- function(data, select = NULL, safe = TRUE) {
   data <- .coerce_to_dataframe(data)
 
   # find which vars should be decreasing
-  desc <- select[grepl("^-", select)]
+  desc <- select[startsWith(select, "-")]
   desc <- gsub("^-", "", desc)
   select <- gsub("^-", "", select)
 

diff --git a/R/data_relocate.R b/R/data_relocate.R
@@ -93,18 +93,30 @@ data_relocate <- function(data,
   data_cols <- names(data)
   position <- which(data_cols %in% cols)
 
+  # remember original values, for more informative messages
+  original_before <- before
+  original_after <- after
+
   # Find new positions
   if (!is.null(before)) {
     before <- before[before %in% data_cols][1] # Take first that exists (if vector is supplied)
-    if (length(before) != 1) {
-      stop("The column passed to `before` wasn't found. Possibly mispelled.", call. = FALSE)
+    if (length(before) != 1 || is.na(before)) {
+      # guess the misspelled column
+      insight::format_error(
+        "The column passed to `before` wasn't found.",
+        .misspelled_string(data_cols, original_before[1], default_message = "Possibly misspelled?")
+      )
     }
     where <- min(match(before, data_cols))
     position <- c(setdiff(position, where), where)
   } else if (!is.null(after)) {
     after <- after[after %in% data_cols][1] # Take first that exists (if vector is supplied)
-    if (length(after) != 1) {
-      stop("The column passed to `after` wasn't found. Possibly mispelled.", call. = FALSE)
+    if (length(after) != 1 || is.na(after)) {
+      # guess the misspelled column
+      insight::format_error(
+        "The column passed to `after` wasn't found.",
+        .misspelled_string(data_cols, original_after[1], default_message = "Possibly misspelled?")
+      )
     }
     where <- max(match(after, data_cols))
     position <- c(where, setdiff(position, where))

diff --git a/R/datawizard-package.R b/R/datawizard-package.R
@@ -12,6 +12,7 @@
 #' - compute statistical summaries of data properties and distributions.
 #'
 #' It is also the data wrangling backend for packages in 'easystats' ecosystem.
+#' References: Patil et al. (2022) <doi:10.21105/joss.04684>.
 #'
 #' @docType package
 #' @aliases datawizard datawizard-package

diff --git a/R/select_helpers.R b/R/select_helpers.R
@@ -260,10 +260,18 @@
     from <- which(cn == from_to[1])
     to <- which(cn == from_to[2])
     if (!length(from)) {
-      stop("Could not find variable '", from_to[1], "' in data.", call. = FALSE)
+      # guess the misspelled column
+      insight::format_error(
+        paste0("Could not find variable \"", from_to[1], "\" in data."),
+        .misspelled_string(cn, from_to[1], default_message = "Possibly misspelled?")
+      )
     }
     if (!length(to)) {
-      stop("Could not find variable '", from_to[2], "' in data.", call. = FALSE)
+      # guess the misspelled column
+      insight::format_error(
+        paste0("Could not find variable \"", from_to[2], "\" in data."),
+        .misspelled_string(cn, from_to[2], default_message = "Possibly misspelled?")
+      )
     }
     if (negate) {
       pattern <- columns[setdiff(seq_len(ncol(data)), from:to)]
@@ -298,7 +306,6 @@
     exclude <- .check_pattern_and_exclude(exclude, data, ignore_case, verbose)
     pattern <- setdiff(pattern, exclude)
   }
-
   pattern
 }
 
@@ -337,7 +344,8 @@
   if (!all(pattern %in% columns)) {
     if (isTRUE(verbose)) {
       insight::format_warning(
-        paste0("Following variable(s) were not found: ", paste0(setdiff(pattern, columns), collapse = ", "))
+        paste0("Following variable(s) were not found: ", paste0(setdiff(pattern, columns), collapse = ", ")),
+        .misspelled_string(columns, setdiff(pattern, columns), default_message = "Possibly misspelled?")
       )
     }
     pattern <- intersect(pattern, columns)

diff --git a/R/utils.R b/R/utils.R
@@ -54,3 +54,65 @@
 .has_numeric_rownames <- function(data) {
   identical(attributes(data)$row.names, seq_len(nrow(data)))
 }
+
+
+#' Fuzzy grep, matches patterns that are close, but not identical
+#'
+#' Builds a regular expression of the form `(pattern){~precision}` and
+#' returns the indices of the elements of `x` that match it.
+#'
+#' @param x Character vector to search in.
+#' @param pattern String to look for.
+#' @param precision Number placed after `~` in the expression. If `NULL`,
+#'   one third of the number of characters in `pattern` (rounded) is used.
+#'
+#' @return Integer vector of matching positions in `x`; `NULL` if `precision`
+#'   is larger than the number of characters in `pattern`; an empty integer
+#'   vector if the expression built from `pattern` cannot be evaluated.
+#'
+#' Example:
+#' colnames(iris)
+#' p <- sprintf("(%s){~%i}", "Spela", 2)
+#' grep(pattern = p, x = colnames(iris), ignore.case = FALSE)
+#' @keywords internal
+#' @noRd
+
+.fuzzy_grep <- function(x, pattern, precision = NULL) {
+  if (is.null(precision)) {
+    precision <- round(nchar(pattern) / 3)
+  }
+  if (precision > nchar(pattern)) {
+    return(NULL)
+  }
+  p <- sprintf("(%s){~%i}", pattern, precision)
+  # `pattern` is inserted verbatim into the regular expression. If it contains
+  # unbalanced metacharacters (e.g. "a(b"), `grep()` signals a condition. This
+  # helper only enriches messages, so treat that case as "no match" instead of
+  # letting it mask the message the caller wants to show.
+  tryCatch(
+    grep(pattern = p, x = x, ignore.case = FALSE),
+    warning = function(w) integer(0),
+    error = function(e) integer(0)
+  )
+}
+
+
+#' Create a message string to tell the user about matches that could possibly
+#' be the string they were looking for
+#'
+#' @param source Character vector of existing strings (e.g. column names).
+#' @param searchterm Character vector of strings that were not found.
+#' @param default_message Returned as is when `searchterm` is `NULL` or empty;
+#'   used as the message when no close match is found.
+#'
+#' @return A string of the form `Did you mean "..."?` (or
+#'   `Did you mean one of ...?` for several candidates, limited to five),
+#'   or `default_message` when there is nothing to suggest.
+#'
+#' @keywords internal
+#' @noRd
+
+.misspelled_string <- function(source, searchterm, default_message = NULL) {
+  if (is.null(searchterm) || length(searchterm) < 1) {
+    return(default_message)
+  }
+  # used for many matches
+  more_found <- ""
+  # guess the misspelled string; several search terms can match the same
+  # candidate, so keep every suggestion only once
+  possible_strings <- unique(unlist(lapply(searchterm, function(s) {
+    source[.fuzzy_grep(source, s)]
+  })))
+  if (length(possible_strings)) {
+    msg <- "Did you mean "
+    if (length(possible_strings) > 1) {
+      # make sure we don't print dozens of alternatives for larger data frames
+      if (length(possible_strings) > 5) {
+        more_found <- sprintf(
+          " We even found %i more possible matches, not shown here.",
+          length(possible_strings) - 5
+        )
+        possible_strings <- possible_strings[1:5]
+      }
+      msg <- paste0(msg, "one of ", text_concatenate(possible_strings, enclose = "\"", last = " or "))
+    } else {
+      msg <- paste0(msg, "\"", possible_strings, "\"")
+    }
+    msg <- paste0(msg, "?", more_found)
+  } else {
+    msg <- default_message
+  }
+  # no double white space
+  insight::trim_ws(msg)
+}
diff --git a/R/utils_data.R b/R/utils_data.R
@@ -45,17 +45,17 @@ rownames_as_column <- function(x, var = "rowname") {
 #' @rdname rownames
 #' @export
 column_as_rownames <- function(x, var = "rowname") {
-  if (!is.character(var) & !is.numeric(var)) {
-    stop("Argument 'var' must be of type character or numeric.")
+  # `var` selects the column either by name (character) or by position (numeric)
+  if (!is.character(var) && !is.numeric(var)) {
+    insight::format_error("Argument `var` must be of type character or numeric.")
   }
   if (is.character(var)) {
     if (!var %in% names(x)) {
-      stop(paste0('Variable "', var, '" is not in the data frame.'))
+      insight::format_error(paste0("Variable \"", var, "\" is not in the data frame."))
     }
   }
   if (is.numeric(var)) {
-    if (var > ncol(x) | var <= 0) {
-      stop("Column ", var, " does not exist. There are ", ncol(x), " columns in the data frame.")
+    if (var > ncol(x) || var <= 0) {
+      # unlike `stop()`, `format_error()` does not paste its arguments into one
+      # string (further arguments become separate lines), so build the
+      # sentence with `paste0()` first
+      insight::format_error(
+        paste0("Column ", var, " does not exist. There are ", ncol(x), " columns in the data frame.")
+      )
     }
   }
   rownames(x) <- x[[var]]
@@ -102,10 +102,10 @@ column_as_rownames <- function(x, var = "rowname") {
 #'
 row_to_colnames <- function(x, row = 1, na_prefix = "x", verbose = TRUE) {
   if (!is.numeric(row)) {
-    insight::format_error("Argument 'row' must be of type numeric.")
+    insight::format_error("Argument `row` must be of type numeric.")
   }
   if (length(row) != 1) {
-    insight::format_error("Argument 'row' must be of length 1.")
+    insight::format_error("Argument `row` must be of length 1.")
   }
   if (nrow(x) < row) {
     insight::format_error(
@@ -129,8 +129,8 @@ row_to_colnames <- function(x, row = 1, na_prefix = "x", verbose = TRUE) {
       insight::format_warning(
         paste0(
           "Some values of row ", row,
-          " were NAs. The corresponding column names are prefixed with '",
-          na_prefix, "'."
+          " were NAs. The corresponding column names are prefixed with `",
+          na_prefix, "`."
         )
       )
     }
@@ -146,12 +146,12 @@ row_to_colnames <- function(x, row = 1, na_prefix = "x", verbose = TRUE) {
 #' @export
 colnames_to_row <- function(x, prefix = "x") { # `prefix`: stem of the new column names
   if (length(prefix) != 1) {
-    insight::format_error("Argument 'prefix' must be of length 1.")
+    insight::format_error("Argument `prefix` must be of length 1.")
   }
   if (!is.character(prefix)) {
-    insight::format_error("Argument 'prefix' must be of type character.")
+    insight::format_error("Argument `prefix` must be of type character.")
   }
   x2 <- rbind(colnames(x), x) # current column names become the first row, above the rows of `x`
-  colnames(x2) <- paste0(prefix, 1:ncol(x2))
+  colnames(x2) <- paste0(prefix, seq_len(ncol(x2))) # new names: prefix followed by the column index
   x2
 }
diff --git a/README.Rmd b/README.Rmd
@@ -17,7 +17,7 @@ set.seed(333)
 library(datawizard)
 ```
 
-[![publication](https://img.shields.io/badge/Cite-Unpublished-yellow)](https://github.com/easystats/datawizard/blob/master/inst/CITATION)
+[![DOI](https://joss.theoj.org/papers/10.21105/joss.04684/status.svg)](https://doi.org/10.21105/joss.04684)
 [![downloads](http://cranlogs.r-pkg.org/badges/datawizard)](https://cran.r-project.org/package=datawizard)
 [![total](https://cranlogs.r-pkg.org/badges/grand-total/datawizard)](https://cranlogs.r-pkg.org/) [![status](https://tinyverse.netlify.com/badge/datawizard)](https://CRAN.R-project.org/package=datawizard) [![lifecycle](https://img.shields.io/badge/lifecycle-maturing-blue.svg)](https://lifecycle.r-lib.org/articles/stages.html)
 

diff --git a/README.md b/README.md
@@ -1,7 +1,7 @@
 
 # `datawizard`: Easy Data Wrangling and Statistical Transformations <img src='man/figures/logo.png' align="right" height="139" />
 
-[![publication](https://img.shields.io/badge/Cite-Unpublished-yellow)](https://github.com/easystats/datawizard/blob/master/inst/CITATION)
+[![DOI](https://joss.theoj.org/papers/10.21105/joss.04684/status.svg)](https://doi.org/10.21105/joss.04684)
 [![downloads](http://cranlogs.r-pkg.org/badges/datawizard)](https://cran.r-project.org/package=datawizard)
 [![total](https://cranlogs.r-pkg.org/badges/grand-total/datawizard)](https://cranlogs.r-pkg.org/)
 [![status](https://tinyverse.netlify.com/badge/datawizard)](https://CRAN.R-project.org/package=datawizard)
@@ -58,22 +58,23 @@ To cite the package, run the following command:
 ``` r
 citation("datawizard")
 
-To cite datawizard in publications use:
+To cite package 'datawizard' in publications use:
 
-  Patil, Makowski, Ben-Shachar, Wiernik, Bacher, & Lüdecke (2022).
-  datawizard: An R Package for Easy Data Preparation and Statistical
-  Transformations. CRAN. Available from
-  https://easystats.github.io/datawizard/
+  Patil et al., (2022). datawizard: An R Package for Easy Data
+  Preparation and Statistical Transformations. Journal of Open Source
+  Software, 7(78), 4684, https://doi.org/10.21105/joss.04684
 
 A BibTeX entry for LaTeX users is
 
   @Article{,
-    title = {datawizard: An R Package for Easy Data Preparation and Statistical Transformations},
+    title = {{datawizard}: An {R} Package for Easy Data Preparation and Statistical Transformations},
     author = {Indrajeet Patil and Dominique Makowski and Mattan S. Ben-Shachar and Brenton M. Wiernik and Etienne Bacher and Daniel Lüdecke},
-    journal = {CRAN},
+    journal = {Journal of Open Source Software},
     year = {2022},
-    note = {R package},
-    url = {https://easystats.github.io/datawizard/},
+    volume = {7},
+    number = {78},
+    pages = {4684},
+    doi = {10.21105/joss.04684},
   }
 ```
 

diff --git a/inst/CITATION b/inst/CITATION
@@ -1,16 +1,12 @@
 bibentry(
   bibtype="Article",
-  title="datawizard: An R Package for Easy Data Preparation and Statistical Transformations",
+  title="{datawizard}: An {R} Package for Easy Data Preparation and Statistical Transformations",
   author=c(person("Indrajeet", "Patil"), person("Dominique", "Makowski"), person("Mattan S.", "Ben-Shachar"), person("Brenton M.", "Wiernik"), person("Etienne", "Bacher"), person("Daniel", "Lüdecke")),
-  journal="CRAN",
-  year="2022",
-  note="R package",
-  url="https://easystats.github.io/datawizard/",
-
-  textVersion =
-    paste("Patil, Makowski, Ben-Shachar, Wiernik, Bacher, & Lüdecke (2022). datawizard: An R Package for Easy Data Preparation and Statistical Transformations. CRAN.",
-          "Available from https://easystats.github.io/datawizard/"
-    ),
-  mheader = "To cite datawizard in publications use:"
+  journal="Journal of Open Source Software",
+  year = 2022,
+  volume = 7,
+  number = 78,
+  pages = 4684,
+  doi = "10.21105/joss.04684",
+  textVersion = "Patil et al., (2022). datawizard: An R Package for Easy Data Preparation and Statistical Transformations. Journal of Open Source Software, 7(78), 4684, https://doi.org/10.21105/joss.04684"
 )
-
diff --git a/man/datawizard-package.Rd b/man/datawizard-package.Rd
diff --git a/paper/apa.csl → paper/JOSS files/apa.csl b/paper/apa.csl → paper/JOSS files/apa.csl
diff --git a/paper/paper.Rmd → paper/JOSS files/paper.Rmd b/paper/paper.Rmd → paper/JOSS files/paper.Rmd
diff --git a/paper/paper.bib → paper/JOSS files/paper.bib b/paper/paper.bib → paper/JOSS files/paper.bib
diff --git a/paper/paper.log → paper/JOSS files/paper.log b/paper/paper.log → paper/JOSS files/paper.log
diff --git a/paper/paper.md → paper/JOSS files/paper.md b/paper/paper.md → paper/JOSS files/paper.md
diff --git a/paper/paper.pdf → paper/JOSS files/paper.pdf b/paper/paper.pdf → paper/JOSS files/paper.pdf
diff --git a/paper/Patil_et_al_2022_JOSS.pdf b/paper/Patil_et_al_2022_JOSS.pdf