diff --git a/DESCRIPTION b/DESCRIPTION index 754e001..511ddac 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: tarflow.iquizoo Title: Setup "targets" Workflows for "iquizoo" Data Processing -Version: 3.8.1 +Version: 3.8.2 Authors@R: c( person("Liang", "Zhang", , "psychelzh@outlook.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0001-9041-1150")), @@ -30,7 +30,6 @@ Imports: tarchetypes, targets, tidyr, - tidyselect, utils, vctrs Suggests: @@ -44,6 +43,7 @@ Suggests: roxygen2, testthat (>= 3.0.0), tibble, + tidytable, withr Remotes: psychelzh/data.iquizoo, diff --git a/NAMESPACE b/NAMESPACE index 95e3188..c04d9fa 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -10,5 +10,6 @@ export(setup_templates) export(tar_prep_iquizoo) export(use_targets_pipeline) export(wrangle_data) +import(dplyr) import(rlang) -import(tidyselect) +import(tidyr) diff --git a/NEWS.md b/NEWS.md index 005223e..f146505 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,11 @@ +# tarflow.iquizoo 3.8.2 + +## Enhancements + +* Added workaround using tidytable package when the type compatibility triggers an error in `preproc_data()`. +* Added a warning when no non-empty raw data found in `preproc_data()`. +* Added support for triggering a warning for `wrangle_data()` parsing error and `preproc_data()` data binding error. + # tarflow.iquizoo 3.8.1 ## Bug Fixes diff --git a/R/preproc.R b/R/preproc.R index 80908fc..47b74b1 100644 --- a/R/preproc.R +++ b/R/preproc.R @@ -1,6 +1,6 @@ #' Wrangle Raw Data #' -#' Data wrangling is the first step for data analysis. +#' Parse raw json string data as [data.frame()] and store them in a list column. #' #' @param data The raw data. #' @param name_raw_json The column name in which stores user's raw data in @@ -11,93 +11,105 @@ wrangle_data <- function(data, name_raw_json = "game_data", name_raw_parsed = "raw_parsed") { - # make it error-proof to avoid trivial errors - parse_raw_json <- purrr::possibly( - ~ jsonlite::fromJSON(.) |> - dplyr::rename_with(tolower) |> - dplyr::mutate(dplyr::across(where(is.character), tolower)), - otherwise = NULL + data[[name_raw_parsed]] <- purrr::map( + data[[name_raw_json]], + parse_raw_json ) - data |> - dplyr::mutate( - "{name_raw_parsed}" := purrr::map( # nolint - .data[[name_raw_json]], - parse_raw_json - ), - .keep = "unused" - ) + select(data, !all_of(name_raw_json)) } #' Feed Raw Data to Pre-processing #' -#' Calculate indices using data returned by [wrangle_data()]. +#' Calculate indices using data typically returned by [wrangle_data()]. +#' +#' @details +#' +#' Observations with empty raw data (empty vector, e.g. `NULL`, in +#' `name_raw_parsed` column) are removed before calculating indices. If no +#' observations left after removing, a warning is signaled and `NULL` is +#' returned. #' #' @param data A [data.frame] contains raw data. #' @param fn This can be a function or formula. See [rlang::as_function()] for #' more details. +#' @param ... Additional arguments passed to `fn`. #' @param name_raw_parsed The column name in which stores user's raw data in #' format of a list of [data.frame][data.frame()]s. #' @param out_name_index The column name used in output storing the name of each #' calculated index. #' @param out_name_score The column name used in output storing the value of #' each calculated index. -#' @param ... Additional arguments passed to `fn`. -#' @return A [data.frame] contains the calculated indices. -#' The index names are stored in the column of `out_name_index`, and index -#' values are stored in the column of `out_name_score`. +#' @return A [data.frame] contains the calculated indices. The index names are +#' stored in the column of `out_name_index`, and index values are stored in +#' the column of `out_name_score`. #' @export -preproc_data <- function(data, fn, +preproc_data <- function(data, fn, ..., name_raw_parsed = "raw_parsed", out_name_index = "index_name", - out_name_score = "score", - ...) { - # do not add `possibly()` for early error is needed to check configurations + out_name_score = "score") { + data <- filter(data, !purrr::map_lgl(.data[[name_raw_parsed]], is_empty)) + if (nrow(data) == 0) { + warn("No non-empty data found.") + return() + } fn <- as_function(fn) - data_with_id <- dplyr::mutate(data, .id = seq_len(dplyr::n())) - groups <- dplyr::select(data_with_id, -all_of(name_raw_parsed)) - raw_data <- dplyr::select(data_with_id, all_of(c(".id", name_raw_parsed))) - data_unnested <- try_fetch( - tidyr::unnest(raw_data, all_of(name_raw_parsed)), + data |> + mutate( + calc_indices(.data[[name_raw_parsed]], fn, ...), + .keep = "unused" + ) |> + pivot_longer( + cols = !any_of(names(data)), + names_to = out_name_index, + values_to = out_name_score + ) |> + vctrs::vec_restore(data) +} + +# helper functions +parse_raw_json <- function(jstr) { + parsed <- tryCatch( + jsonlite::fromJSON(jstr), error = function(cnd) { - pattern <- r"(Can't combine `.+\$.+` <.+> and `.+\$.+` <.+>)" - if (!grepl(pattern, conditionMessage(cnd))) { - abort( - "Don't know how to handle this error.", - class = "tarflow/unnest_incompatible", - parent = cnd + warn( + c( + "Failed to parse json string with the following error:", + conditionMessage(cnd), + i = "Will parse it as `NULL` instead." ) - } - raw_data |> - dplyr::mutate( - "{name_raw_parsed}" := purrr::map( # nolint - .data[[name_raw_parsed]], - ~ dplyr::mutate( - ., - dplyr::across( - everything(), - as.character - ) - ) - ) - ) |> - tidyr::unnest(all_of(name_raw_parsed)) |> - utils::type.convert(as.is = TRUE) |> - vctrs::vec_restore(raw_data) + ) + return() } ) - if (nrow(data_unnested) == 0) { + if (is_empty(parsed)) { return() } - dplyr::inner_join( - groups, - data_unnested |> - fn(.by = ".id", ...) |> - tidyr::pivot_longer( - cols = -all_of(".id"), - names_to = out_name_index, - values_to = out_name_score - ), - by = ".id" + parsed |> + rename_with(tolower) |> + mutate(across(where(is.character), tolower)) +} + +calc_indices <- function(l, fn, ...) { + # used as a temporary id for each element + name_id <- ".id" + tryCatch( + bind_rows(l, .id = name_id), + error = function(cnd) { + warn( + c( + "Failed to bind raw data with the following error:", + conditionMessage(cnd), + i = "Will try using tidytable package." + ) + ) + check_installed( + "tidytable", + "because tidyr package fails to bind raw data." + ) + tidytable::bind_rows(l, .id = name_id) |> + utils::type.convert(as.is = TRUE) + } ) |> - dplyr::select(-all_of(".id")) + fn(.by = name_id, ...) |> + select(!all_of(name_id)) } diff --git a/R/targets.R b/R/targets.R index b033c57..100beeb 100644 --- a/R/targets.R +++ b/R/targets.R @@ -146,8 +146,8 @@ tar_projects_info <- function(contents, templates, check_progress) { c( tarchetypes::tar_map( contents |> - dplyr::distinct(.data$project_id) |> - dplyr::mutate(project_id = as.character(.data$project_id)), + distinct(.data$project_id) |> + mutate(project_id = as.character(.data$project_id)), targets::tar_target_raw( "progress_hash", expr( @@ -177,11 +177,11 @@ tar_projects_info <- function(contents, templates, check_progress) { tar_fetch_data <- function(contents, templates, what) { tarchetypes::tar_map( contents |> - dplyr::distinct(.data$project_id, .data$game_id) |> - dplyr::mutate( - dplyr::across(c("project_id", "game_id"), as.character) + distinct(.data$project_id, .data$game_id) |> + mutate( + across(c("project_id", "game_id"), as.character) ) |> - dplyr::summarise( + summarise( progress_hash = list( syms( stringr::str_glue("progress_hash_{project_id}") @@ -217,12 +217,12 @@ tar_action_raw_data <- function(contents, name_parsed = "raw_data_parsed", name_indices = "indices") { if (action_raw_data == "all") action_raw_data <- c("parse", "preproc") - contents <- dplyr::distinct(contents, .data$game_id) + contents <- distinct(contents, .data$game_id) c( if ("parse" %in% action_raw_data) { tarchetypes::tar_map( values = contents |> - dplyr::mutate( + mutate( game_id = as.character(.data$game_id), tar_data = syms(stringr::str_glue("{name_data}_{game_id}")) ), @@ -238,7 +238,7 @@ tar_action_raw_data <- function(contents, tarchetypes::tar_map( values = contents |> data.iquizoo::match_preproc(type = "inner") |> - dplyr::mutate( + mutate( game_id = as.character(.data$game_id), tar_parsed = syms(stringr::str_glue("{name_parsed}_{game_id}")) ), diff --git a/R/zzz.R b/R/zzz.R index 65df712..b2f7f08 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -1,5 +1,6 @@ #' @import rlang -#' @import tidyselect +#' @import dplyr +#' @import tidyr NULL # nocov start diff --git a/man/preproc_data.Rd b/man/preproc_data.Rd index f9314e5..2a1e0e1 100644 --- a/man/preproc_data.Rd +++ b/man/preproc_data.Rd @@ -7,10 +7,10 @@ preproc_data( data, fn, + ..., name_raw_parsed = "raw_parsed", out_name_index = "index_name", - out_name_score = "score", - ... + out_name_score = "score" ) } \arguments{ @@ -19,6 +19,8 @@ preproc_data( \item{fn}{This can be a function or formula. See \code{\link[rlang:as_function]{rlang::as_function()}} for more details.} +\item{...}{Additional arguments passed to \code{fn}.} + \item{name_raw_parsed}{The column name in which stores user's raw data in format of a list of \link[=data.frame]{data.frame}s.} @@ -27,14 +29,18 @@ calculated index.} \item{out_name_score}{The column name used in output storing the value of each calculated index.} - -\item{...}{Additional arguments passed to \code{fn}.} } \value{ -A \link{data.frame} contains the calculated indices. -The index names are stored in the column of \code{out_name_index}, and index -values are stored in the column of \code{out_name_score}. +A \link{data.frame} contains the calculated indices. The index names are +stored in the column of \code{out_name_index}, and index values are stored in +the column of \code{out_name_score}. } \description{ -Calculate indices using data returned by \code{\link[=wrangle_data]{wrangle_data()}}. +Calculate indices using data typically returned by \code{\link[=wrangle_data]{wrangle_data()}}. +} +\details{ +Observations with empty raw data (empty vector, e.g. \code{NULL}, in +\code{name_raw_parsed} column) are removed before calculating indices. If no +observations left after removing, a warning is signaled and \code{NULL} is +returned. } diff --git a/man/wrangle_data.Rd b/man/wrangle_data.Rd index ab5d174..7d6f296 100644 --- a/man/wrangle_data.Rd +++ b/man/wrangle_data.Rd @@ -18,5 +18,5 @@ format of json string.} A \link{data.frame} contains the parsed data. } \description{ -Data wrangling is the first step for data analysis. +Parse raw json string data as \code{\link[=data.frame]{data.frame()}} and store them in a list column. } diff --git a/tests/testthat/_snaps/preproc.md b/tests/testthat/_snaps/preproc.md index c387a02..0552a3f 100644 --- a/tests/testthat/_snaps/preproc.md +++ b/tests/testthat/_snaps/preproc.md @@ -38,6 +38,46 @@ ] } +# Deal with `NULL` in parsed data + + { + "type": "list", + "attributes": { + "names": { + "type": "character", + "attributes": {}, + "value": ["user_id", "index_name", "score"] + }, + "row.names": { + "type": "integer", + "attributes": {}, + "value": [1, 2] + }, + "class": { + "type": "character", + "attributes": {}, + "value": ["tbl_df", "tbl", "data.frame"] + } + }, + "value": [ + { + "type": "integer", + "attributes": {}, + "value": [1, 3] + }, + { + "type": "character", + "attributes": {}, + "value": ["nhit", "nhit"] + }, + { + "type": "double", + "attributes": {}, + "value": ["NaN", 1] + } + ] + } + # Can deal with mismatch column types in raw data { diff --git a/tests/testthat/helper-preproc.R b/tests/testthat/helper-preproc.R index 61bdf38..354db01 100644 --- a/tests/testthat/helper-preproc.R +++ b/tests/testthat/helper-preproc.R @@ -1,7 +1,7 @@ prep_fun <- function(data, .by = NULL) { data |> - dplyr::group_by(dplyr::pick(dplyr::all_of(.by))) |> - dplyr::summarise( + group_by(pick(all_of(.by))) |> + summarise( nhit = mean(.data$nhit[.data$feedback == 1]), .groups = "drop" ) diff --git a/tests/testthat/test-preproc.R b/tests/testthat/test-preproc.R index d6ae182..589ad77 100644 --- a/tests/testthat/test-preproc.R +++ b/tests/testthat/test-preproc.R @@ -12,15 +12,15 @@ test_that("Basic situation for `wrangle_data()`", { }) test_that("Can deal with invalid or empty json", { - data_case_invalid <- tibble::tibble( - game_data = c("[1", "[]", "{}") - ) + data_case_invalid <- data.frame(game_data = "[1") wrangle_data(data_case_invalid) |> - expect_silent() |> + expect_warning("Failed to parse json string") |> + purrr::pluck("raw_parsed", 1) |> + expect_null() + data_case_empty <- data.frame(game_data = c("[]", "{}")) + wrangle_data(data_case_empty) |> purrr::pluck("raw_parsed") |> - purrr::map_lgl(is.null) |> - all() |> - expect_true() + purrr::walk(expect_null) }) test_that("Change names and values to lowercase", { @@ -43,33 +43,36 @@ test_that("Basic situation in `preproc_data()`", { preproc_data(data, fn = prep_fun) |> expect_silent() |> expect_snapshot_value(style = "json2") - tibble::tibble(raw_parsed = list(NULL)) |> - preproc_data(prep_fun) |> - expect_null() }) -test_that("Can deal with mismatch column types in raw data", { - data <- tibble::tibble( +test_that("Deal with `NULL` in parsed data", { + tibble::tibble(raw_parsed = list(NULL)) |> + preproc_data(prep_fun) |> + expect_null() |> + expect_warning("No non-empty data found.") + tibble::tibble( user_id = 1:3, raw_parsed = list( data.frame(nhit = 1, feedback = 0), - data.frame(nhit = 2, feedback = 1), - data.frame(nhit = "3", feedback = 1) + NULL, + data.frame(nhit = 1, feedback = 1) ) - ) - preproc_data(data, fn = prep_fun) |> - expect_silent() |> + ) |> + preproc_data(fn = prep_fun) |> expect_snapshot_value(style = "json2") }) -test_that("Abort if unrecognized error occured", { +test_that("Can deal with mismatch column types in raw data", { + skip_if_not_installed("tidytable") data <- tibble::tibble( - user_id = 1:2, + user_id = 1:3, raw_parsed = list( data.frame(nhit = 1, feedback = 0), - 1 + data.frame(nhit = 2, feedback = 1), + data.frame(nhit = "3", feedback = 1) ) ) preproc_data(data, fn = prep_fun) |> - expect_error(class = "tarflow/unnest_incompatible") + expect_snapshot_value(style = "json2") |> + expect_warning("Failed to bind raw data") })