Commit

Merge branch 'release/3.8.2'
psychelzh committed Nov 8, 2023
2 parents a82f7e1 + c405c2a commit 6269251
Showing 11 changed files with 180 additions and 109 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
@@ -1,6 +1,6 @@
 Package: tarflow.iquizoo
 Title: Setup "targets" Workflows for "iquizoo" Data Processing
-Version: 3.8.1
+Version: 3.8.2
 Authors@R: c(
     person("Liang", "Zhang", , "[email protected]", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0001-9041-1150")),
@@ -30,7 +30,6 @@ Imports:
     tarchetypes,
     targets,
     tidyr,
-    tidyselect,
     utils,
     vctrs
 Suggests:
@@ -44,6 +43,7 @@ Suggests:
     roxygen2,
     testthat (>= 3.0.0),
     tibble,
+    tidytable,
     withr
 Remotes:
     psychelzh/data.iquizoo,

3 changes: 2 additions & 1 deletion NAMESPACE
@@ -10,5 +10,6 @@ export(setup_templates)
 export(tar_prep_iquizoo)
 export(use_targets_pipeline)
 export(wrangle_data)
+import(dplyr)
 import(rlang)
-import(tidyselect)
+import(tidyr)

8 changes: 8 additions & 0 deletions NEWS.md
@@ -1,3 +1,11 @@
+# tarflow.iquizoo 3.8.2
+
+## Enhancements
+
+* Added workaround using tidytable package when the type compatibility triggers an error in `preproc_data()`.
+* Added a warning when no non-empty raw data found in `preproc_data()`.
+* Added support for triggering a warning for `wrangle_data()` parsing error and `preproc_data()` data binding error.
+
 # tarflow.iquizoo 3.8.1
 
 ## Bug Fixes

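To make these entries concrete, here is a minimal, hypothetical sketch of how the new warnings surface. It is not part of the commit: the toy data frame and its `user_id` column are assumptions, while `game_data` and `raw_parsed` are the documented defaults of `wrangle_data()`.

library(tarflow.iquizoo)

# Two observations: one malformed json string, one valid one.
raw <- data.frame(
  user_id = 1:2,
  game_data = c("not valid json", '[{"NHit": 3, "Feedback": 1}]')
)

# wrangle_data() now warns about the parsing failure and stores `NULL` in
# `raw_parsed` for that row instead of erroring out.
parsed <- wrangle_data(raw)
#> Warning: Failed to parse json string with the following error: ...

# preproc_data() drops observations with empty parsed data; if nothing is
# left, it warns and returns NULL. The anonymous function only stands in for
# a real index-computing function and is never reached here.
preproc_data(parsed[1, ], \(data, .by = NULL) data)
#> Warning: No non-empty data found.
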
140 changes: 76 additions & 64 deletions R/preproc.R
@@ -1,6 +1,6 @@
 #' Wrangle Raw Data
 #'
-#' Data wrangling is the first step for data analysis.
+#' Parse raw json string data as [data.frame()] and store them in a list column.
 #'
 #' @param data The raw data.
 #' @param name_raw_json The column name in which stores user's raw data in
@@ -11,93 +11,105 @@
 wrangle_data <- function(data,
                          name_raw_json = "game_data",
                          name_raw_parsed = "raw_parsed") {
-  # make it error-proof to avoid trivial errors
-  parse_raw_json <- purrr::possibly(
-    ~ jsonlite::fromJSON(.) |>
-      dplyr::rename_with(tolower) |>
-      dplyr::mutate(dplyr::across(where(is.character), tolower)),
-    otherwise = NULL
-  )
-  data |>
-    dplyr::mutate(
-      "{name_raw_parsed}" := purrr::map( # nolint
-        .data[[name_raw_json]],
-        parse_raw_json
-      ),
-      .keep = "unused"
-    )
+  data[[name_raw_parsed]] <- purrr::map(
+    data[[name_raw_json]],
+    parse_raw_json
+  )
+  select(data, !all_of(name_raw_json))
 }
 
 #' Feed Raw Data to Pre-processing
 #'
-#' Calculate indices using data returned by [wrangle_data()].
+#' Calculate indices using data typically returned by [wrangle_data()].
+#'
+#' @details
+#'
+#' Observations with empty raw data (empty vector, e.g. `NULL`, in
+#' `name_raw_parsed` column) are removed before calculating indices. If no
+#' observations left after removing, a warning is signaled and `NULL` is
+#' returned.
 #'
 #' @param data A [data.frame] contains raw data.
 #' @param fn This can be a function or formula. See [rlang::as_function()] for
 #'   more details.
+#' @param ... Additional arguments passed to `fn`.
 #' @param name_raw_parsed The column name in which stores user's raw data in
 #'   format of a list of [data.frame][data.frame()]s.
 #' @param out_name_index The column name used in output storing the name of each
 #'   calculated index.
 #' @param out_name_score The column name used in output storing the value of
 #'   each calculated index.
-#' @param ... Additional arguments passed to `fn`.
-#' @return A [data.frame] contains the calculated indices.
-#'   The index names are stored in the column of `out_name_index`, and index
-#'   values are stored in the column of `out_name_score`.
+#' @return A [data.frame] contains the calculated indices. The index names are
+#'   stored in the column of `out_name_index`, and index values are stored in
+#'   the column of `out_name_score`.
 #' @export
-preproc_data <- function(data, fn,
+preproc_data <- function(data, fn, ...,
                          name_raw_parsed = "raw_parsed",
                          out_name_index = "index_name",
-                         out_name_score = "score",
-                         ...) {
-  # do not add `possibly()` for early error is needed to check configurations
+                         out_name_score = "score") {
+  data <- filter(data, !purrr::map_lgl(.data[[name_raw_parsed]], is_empty))
+  if (nrow(data) == 0) {
+    warn("No non-empty data found.")
+    return()
+  }
   fn <- as_function(fn)
-  data_with_id <- dplyr::mutate(data, .id = seq_len(dplyr::n()))
-  groups <- dplyr::select(data_with_id, -all_of(name_raw_parsed))
-  raw_data <- dplyr::select(data_with_id, all_of(c(".id", name_raw_parsed)))
-  data_unnested <- try_fetch(
-    tidyr::unnest(raw_data, all_of(name_raw_parsed)),
-    error = function(cnd) {
-      pattern <- r"(Can't combine `.+\$.+` <.+> and `.+\$.+` <.+>)"
-      if (!grepl(pattern, conditionMessage(cnd))) {
-        abort(
-          "Don't know how to handle this error.",
-          class = "tarflow/unnest_incompatible",
-          parent = cnd
-        )
-      }
-      raw_data |>
-        dplyr::mutate(
-          "{name_raw_parsed}" := purrr::map( # nolint
-            .data[[name_raw_parsed]],
-            ~ dplyr::mutate(
-              .,
-              dplyr::across(
-                everything(),
-                as.character
-              )
-            )
-          )
-        ) |>
-        tidyr::unnest(all_of(name_raw_parsed)) |>
-        utils::type.convert(as.is = TRUE) |>
-        vctrs::vec_restore(raw_data)
-    }
-  )
-  if (nrow(data_unnested) == 0) {
-    return()
-  }
-  dplyr::inner_join(
-    groups,
-    data_unnested |>
-      fn(.by = ".id", ...) |>
-      tidyr::pivot_longer(
-        cols = -all_of(".id"),
-        names_to = out_name_index,
-        values_to = out_name_score
-      ),
-    by = ".id"
-  ) |>
-    dplyr::select(-all_of(".id"))
+  data |>
+    mutate(
+      calc_indices(.data[[name_raw_parsed]], fn, ...),
+      .keep = "unused"
+    ) |>
+    pivot_longer(
+      cols = !any_of(names(data)),
+      names_to = out_name_index,
+      values_to = out_name_score
+    ) |>
+    vctrs::vec_restore(data)
+}
+
+# helper functions
+parse_raw_json <- function(jstr) {
+  parsed <- tryCatch(
+    jsonlite::fromJSON(jstr),
+    error = function(cnd) {
+      warn(
+        c(
+          "Failed to parse json string with the following error:",
+          conditionMessage(cnd),
+          i = "Will parse it as `NULL` instead."
+        )
+      )
+      return()
+    }
+  )
+  if (is_empty(parsed)) {
+    return()
+  }
+  parsed |>
+    rename_with(tolower) |>
+    mutate(across(where(is.character), tolower))
+}
+
+calc_indices <- function(l, fn, ...) {
+  # used as a temporary id for each element
+  name_id <- ".id"
+  tryCatch(
+    bind_rows(l, .id = name_id),
+    error = function(cnd) {
+      warn(
+        c(
+          "Failed to bind raw data with the following error:",
+          conditionMessage(cnd),
+          i = "Will try using tidytable package."
+        )
+      )
+      check_installed(
+        "tidytable",
+        "because tidyr package fails to bind raw data."
+      )
+      tidytable::bind_rows(l, .id = name_id) |>
+        utils::type.convert(as.is = TRUE)
+    }
+  ) |>
+    fn(.by = name_id, ...) |>
+    select(!all_of(name_id))
 }

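For context on the tidytable fallback introduced above: `calc_indices()` first tries `bind_rows()` and only falls back when the parsed data frames carry incompatible column types across observations. A hypothetical sketch of that situation follows; the toy list and its column names are assumptions.

# The same column comes back as character in one record and integer in another.
l <- list(
  data.frame(nhit = "3", feedback = 1L),
  data.frame(nhit = 4L, feedback = 1L)
)

# dplyr::bind_rows() refuses to combine the types and signals an error roughly
# like: Can't combine `..1$nhit` <character> and `..2$nhit` <integer>.
try(dplyr::bind_rows(l, .id = ".id"))

# calc_indices() then warns and retries with tidytable, which coerces to the
# common type, and utils::type.convert() re-infers the column types afterwards.
tidytable::bind_rows(l, .id = ".id") |>
  utils::type.convert(as.is = TRUE)
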
18 changes: 9 additions & 9 deletions R/targets.R
@@ -146,8 +146,8 @@ tar_projects_info <- function(contents, templates, check_progress) {
   c(
     tarchetypes::tar_map(
       contents |>
-        dplyr::distinct(.data$project_id) |>
-        dplyr::mutate(project_id = as.character(.data$project_id)),
+        distinct(.data$project_id) |>
+        mutate(project_id = as.character(.data$project_id)),
       targets::tar_target_raw(
         "progress_hash",
         expr(
@@ -177,11 +177,11 @@
 tar_fetch_data <- function(contents, templates, what) {
   tarchetypes::tar_map(
     contents |>
-      dplyr::distinct(.data$project_id, .data$game_id) |>
-      dplyr::mutate(
-        dplyr::across(c("project_id", "game_id"), as.character)
+      distinct(.data$project_id, .data$game_id) |>
+      mutate(
+        across(c("project_id", "game_id"), as.character)
       ) |>
-      dplyr::summarise(
+      summarise(
        progress_hash = list(
          syms(
            stringr::str_glue("progress_hash_{project_id}")
@@ -217,12 +217,12 @@ tar_action_raw_data <- function(contents,
                                 name_parsed = "raw_data_parsed",
                                 name_indices = "indices") {
   if (action_raw_data == "all") action_raw_data <- c("parse", "preproc")
-  contents <- dplyr::distinct(contents, .data$game_id)
+  contents <- distinct(contents, .data$game_id)
   c(
     if ("parse" %in% action_raw_data) {
       tarchetypes::tar_map(
         values = contents |>
-          dplyr::mutate(
+          mutate(
            game_id = as.character(.data$game_id),
            tar_data = syms(stringr::str_glue("{name_data}_{game_id}"))
          ),
@@ -238,7 +238,7 @@
       tarchetypes::tar_map(
         values = contents |>
           data.iquizoo::match_preproc(type = "inner") |>
-          dplyr::mutate(
+          mutate(
            game_id = as.character(.data$game_id),
            tar_parsed = syms(stringr::str_glue("{name_parsed}_{game_id}"))
          ),

3 changes: 2 additions & 1 deletion R/zzz.R
@@ -1,5 +1,6 @@
 #' @import rlang
-#' @import tidyselect
+#' @import dplyr
+#' @import tidyr
 NULL
 
 # nocov start

22 changes: 14 additions & 8 deletions man/preproc_data.Rd

Generated file; diff not rendered by default.

2 changes: 1 addition & 1 deletion man/wrangle_data.Rd

Generated file; diff not rendered by default.

40 changes: 40 additions & 0 deletions tests/testthat/_snaps/preproc.md
@@ -38,6 +38,46 @@
       ]
     }
 
+# Deal with `NULL` in parsed data
+
+    {
+      "type": "list",
+      "attributes": {
+        "names": {
+          "type": "character",
+          "attributes": {},
+          "value": ["user_id", "index_name", "score"]
+        },
+        "row.names": {
+          "type": "integer",
+          "attributes": {},
+          "value": [1, 2]
+        },
+        "class": {
+          "type": "character",
+          "attributes": {},
+          "value": ["tbl_df", "tbl", "data.frame"]
+        }
+      },
+      "value": [
+        {
+          "type": "integer",
+          "attributes": {},
+          "value": [1, 3]
+        },
+        {
+          "type": "character",
+          "attributes": {},
+          "value": ["nhit", "nhit"]
+        },
+        {
+          "type": "double",
+          "attributes": {},
+          "value": ["NaN", 1]
+        }
+      ]
+    }
+
 # Can deal with mismatch column types in raw data
 
     {

4 changes: 2 additions & 2 deletions tests/testthat/helper-preproc.R
@@ -1,7 +1,7 @@
 prep_fun <- function(data, .by = NULL) {
   data |>
-    dplyr::group_by(dplyr::pick(dplyr::all_of(.by))) |>
-    dplyr::summarise(
+    group_by(pick(all_of(.by))) |>
+    summarise(
       nhit = mean(.data$nhit[.data$feedback == 1]),
       .groups = "drop"
     )

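As this helper illustrates, `preproc_data()` pipes the row-bound raw data into `fn` and passes the internal id column through `.by`, so a custom index function only needs to accept a `.by` argument and return one row of indices per group. A hypothetical example follows; the `rt` and `feedback` columns are assumptions about the parsed raw data.

my_fun <- function(data, .by = NULL) {
  data |>
    dplyr::group_by(dplyr::pick(dplyr::all_of(.by))) |>
    dplyr::summarise(
      acc = mean(feedback == 1),     # proportion of correct responses
      mrt = mean(rt[feedback == 1]), # mean reaction time on correct trials
      .groups = "drop"
    )
}

# preproc_data(parsed_data, my_fun) would then return the indices "acc" and
# "mrt" in long format, one row per index per observation.
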
The diff for the remaining changed file was not loaded on this page and is not shown above.