Commit

Merge branch 'release/3.8.2'
psychelzh committed Nov 8, 2023
2 parents a82f7e1 + c405c2a commit 6269251
Showing 11 changed files with 180 additions and 109 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
@@ -1,6 +1,6 @@
 Package: tarflow.iquizoo
 Title: Setup "targets" Workflows for "iquizoo" Data Processing
-Version: 3.8.1
+Version: 3.8.2
 Authors@R: c(
     person("Liang", "Zhang", , "[email protected]", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0001-9041-1150")),
@@ -30,7 +30,6 @@ Imports:
     tarchetypes,
     targets,
     tidyr,
-    tidyselect,
     utils,
     vctrs
 Suggests:
@@ -44,6 +43,7 @@ Suggests:
     roxygen2,
     testthat (>= 3.0.0),
     tibble,
+    tidytable,
     withr
 Remotes:
     psychelzh/data.iquizoo,

3 changes: 2 additions & 1 deletion NAMESPACE
@@ -10,5 +10,6 @@ export(setup_templates)
 export(tar_prep_iquizoo)
 export(use_targets_pipeline)
 export(wrangle_data)
+import(dplyr)
 import(rlang)
-import(tidyselect)
+import(tidyr)

8 changes: 8 additions & 0 deletions NEWS.md
@@ -1,3 +1,11 @@
+# tarflow.iquizoo 3.8.2
+
+## Enhancements
+
+* Added workaround using tidytable package when the type compatibility triggers an error in `preproc_data()`.
+* Added a warning when no non-empty raw data found in `preproc_data()`.
+* Added support for triggering a warning for `wrangle_data()` parsing error and `preproc_data()` data binding error.
+
 # tarflow.iquizoo 3.8.1
 
 ## Bug Fixes

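To make these entries concrete, here is a minimal, hypothetical sketch of how the new warnings surface. It is not part of the commit: the toy data frame and its `user_id` column are assumptions, while `game_data` and `raw_parsed` are the documented defaults of `wrangle_data()`.

library(tarflow.iquizoo)

# Two observations: one malformed json string, one valid one.
raw <- data.frame(
  user_id = 1:2,
  game_data = c("not valid json", '[{"NHit": 3, "Feedback": 1}]')
)

# wrangle_data() now warns about the parsing failure and stores `NULL` in
# `raw_parsed` for that row instead of erroring out.
parsed <- wrangle_data(raw)
#> Warning: Failed to parse json string with the following error: ...

# preproc_data() drops observations with empty parsed data; if nothing is
# left, it warns and returns NULL. The anonymous function only stands in for
# a real index-computing function and is never reached here.
preproc_data(parsed[1, ], \(data, .by = NULL) data)
#> Warning: No non-empty data found.
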
140 changes: 76 additions & 64 deletions R/preproc.R
@@ -1,6 +1,6 @@
 #' Wrangle Raw Data
 #'
-#' Data wrangling is the first step for data analysis.
+#' Parse raw json string data as [data.frame()] and store them in a list column.
 #'
 #' @param data The raw data.
 #' @param name_raw_json The column name in which stores user's raw data in
@@ -11,93 +11,105 @@
 wrangle_data <- function(data,
                          name_raw_json = "game_data",
                          name_raw_parsed = "raw_parsed") {
-  # make it error-proof to avoid trivial errors
-  parse_raw_json <- purrr::possibly(
-    ~ jsonlite::fromJSON(.) |>
-      dplyr::rename_with(tolower) |>
-      dplyr::mutate(dplyr::across(where(is.character), tolower)),
-    otherwise = NULL
-  )
-  data |>
-    dplyr::mutate(
-      "{name_raw_parsed}" := purrr::map( # nolint
-        .data[[name_raw_json]],
-        parse_raw_json
-      ),
-      .keep = "unused"
-    )
+  data[[name_raw_parsed]] <- purrr::map(
+    data[[name_raw_json]],
+    parse_raw_json
+  )
+  select(data, !all_of(name_raw_json))
 }
 
 #' Feed Raw Data to Pre-processing
 #'
-#' Calculate indices using data returned by [wrangle_data()].
+#' Calculate indices using data typically returned by [wrangle_data()].
+#'
+#' @details
+#'
+#' Observations with empty raw data (empty vector, e.g. `NULL`, in
+#' `name_raw_parsed` column) are removed before calculating indices. If no
+#' observations left after removing, a warning is signaled and `NULL` is
+#' returned.
 #'
 #' @param data A [data.frame] contains raw data.
 #' @param fn This can be a function or formula. See [rlang::as_function()] for
 #'   more details.
+#' @param ... Additional arguments passed to `fn`.
 #' @param name_raw_parsed The column name in which stores user's raw data in
 #'   format of a list of [data.frame][data.frame()]s.
 #' @param out_name_index The column name used in output storing the name of each
 #'   calculated index.
 #' @param out_name_score The column name used in output storing the value of
 #'   each calculated index.
-#' @param ... Additional arguments passed to `fn`.
-#' @return A [data.frame] contains the calculated indices.
-#'   The index names are stored in the column of `out_name_index`, and index
-#'   values are stored in the column of `out_name_score`.
+#' @return A [data.frame] contains the calculated indices. The index names are
+#'   stored in the column of `out_name_index`, and index values are stored in
+#'   the column of `out_name_score`.
 #' @export
-preproc_data <- function(data, fn,
+preproc_data <- function(data, fn, ...,
                          name_raw_parsed = "raw_parsed",
                          out_name_index = "index_name",
-                         out_name_score = "score",
-                         ...) {
-  # do not add `possibly()` for early error is needed to check configurations
+                         out_name_score = "score") {
+  data <- filter(data, !purrr::map_lgl(.data[[name_raw_parsed]], is_empty))
+  if (nrow(data) == 0) {
+    warn("No non-empty data found.")
+    return()
+  }
   fn <- as_function(fn)
-  data_with_id <- dplyr::mutate(data, .id = seq_len(dplyr::n()))
-  groups <- dplyr::select(data_with_id, -all_of(name_raw_parsed))
-  raw_data <- dplyr::select(data_with_id, all_of(c(".id", name_raw_parsed)))
-  data_unnested <- try_fetch(
-    tidyr::unnest(raw_data, all_of(name_raw_parsed)),
-    error = function(cnd) {
-      pattern <- r"(Can't combine `.+\$.+` <.+> and `.+\$.+` <.+>)"
-      if (!grepl(pattern, conditionMessage(cnd))) {
-        abort(
-          "Don't know how to handle this error.",
-          class = "tarflow/unnest_incompatible",
-          parent = cnd
-        )
-      }
-      raw_data |>
-        dplyr::mutate(
-          "{name_raw_parsed}" := purrr::map( # nolint
-            .data[[name_raw_parsed]],
-            ~ dplyr::mutate(
-              .,
-              dplyr::across(
-                everything(),
-                as.character
-              )
-            )
-          )
-        ) |>
-        tidyr::unnest(all_of(name_raw_parsed)) |>
-        utils::type.convert(as.is = TRUE) |>
-        vctrs::vec_restore(raw_data)
-    }
-  )
-  if (nrow(data_unnested) == 0) {
-    return()
-  }
-  dplyr::inner_join(
-    groups,
-    data_unnested |>
-      fn(.by = ".id", ...) |>
-      tidyr::pivot_longer(
-        cols = -all_of(".id"),
-        names_to = out_name_index,
-        values_to = out_name_score
-      ),
-    by = ".id"
-  ) |>
-    dplyr::select(-all_of(".id"))
+  data |>
+    mutate(
+      calc_indices(.data[[name_raw_parsed]], fn, ...),
+      .keep = "unused"
+    ) |>
+    pivot_longer(
+      cols = !any_of(names(data)),
+      names_to = out_name_index,
+      values_to = out_name_score
+    ) |>
+    vctrs::vec_restore(data)
+}
+
+# helper functions
+parse_raw_json <- function(jstr) {
+  parsed <- tryCatch(
+    jsonlite::fromJSON(jstr),
+    error = function(cnd) {
+      warn(
+        c(
+          "Failed to parse json string with the following error:",
+          conditionMessage(cnd),
+          i = "Will parse it as `NULL` instead."
+        )
+      )
+      return()
+    }
+  )
+  if (is_empty(parsed)) {
+    return()
+  }
+  parsed |>
+    rename_with(tolower) |>
+    mutate(across(where(is.character), tolower))
+}
+
+calc_indices <- function(l, fn, ...) {
+  # used as a temporary id for each element
+  name_id <- ".id"
+  tryCatch(
+    bind_rows(l, .id = name_id),
+    error = function(cnd) {
+      warn(
+        c(
+          "Failed to bind raw data with the following error:",
+          conditionMessage(cnd),
+          i = "Will try using tidytable package."
+        )
+      )
+      check_installed(
+        "tidytable",
+        "because tidyr package fails to bind raw data."
+      )
+      tidytable::bind_rows(l, .id = name_id) |>
+        utils::type.convert(as.is = TRUE)
+    }
+  ) |>
+    fn(.by = name_id, ...) |>
+    select(!all_of(name_id))
 }

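For context on the tidytable fallback introduced above: `calc_indices()` first tries `bind_rows()` and only falls back when the parsed data frames carry incompatible column types across observations. A hypothetical sketch of that situation follows; the toy list and its column names are assumptions.

# The same column comes back as character in one record and integer in another.
l <- list(
  data.frame(nhit = "3", feedback = 1L),
  data.frame(nhit = 4L, feedback = 1L)
)

# dplyr::bind_rows() refuses to combine the types and signals an error roughly
# like: Can't combine `..1$nhit` <character> and `..2$nhit` <integer>.
try(dplyr::bind_rows(l, .id = ".id"))

# calc_indices() then warns and retries with tidytable, which coerces to the
# common type, and utils::type.convert() re-infers the column types afterwards.
tidytable::bind_rows(l, .id = ".id") |>
  utils::type.convert(as.is = TRUE)
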
18 changes: 9 additions & 9 deletions R/targets.R
@@ -146,8 +146,8 @@ tar_projects_info <- function(contents, templates, check_progress) {
   c(
     tarchetypes::tar_map(
       contents |>
-        dplyr::distinct(.data$project_id) |>
-        dplyr::mutate(project_id = as.character(.data$project_id)),
+        distinct(.data$project_id) |>
+        mutate(project_id = as.character(.data$project_id)),
       targets::tar_target_raw(
         "progress_hash",
         expr(
@@ -177,11 +177,11 @@
 tar_fetch_data <- function(contents, templates, what) {
   tarchetypes::tar_map(
     contents |>
-      dplyr::distinct(.data$project_id, .data$game_id) |>
-      dplyr::mutate(
-        dplyr::across(c("project_id", "game_id"), as.character)
+      distinct(.data$project_id, .data$game_id) |>
+      mutate(
+        across(c("project_id", "game_id"), as.character)
       ) |>
-      dplyr::summarise(
+      summarise(
        progress_hash = list(
          syms(
            stringr::str_glue("progress_hash_{project_id}")
@@ -217,12 +217,12 @@ tar_action_raw_data <- function(contents,
                                 name_parsed = "raw_data_parsed",
                                 name_indices = "indices") {
   if (action_raw_data == "all") action_raw_data <- c("parse", "preproc")
-  contents <- dplyr::distinct(contents, .data$game_id)
+  contents <- distinct(contents, .data$game_id)
   c(
     if ("parse" %in% action_raw_data) {
       tarchetypes::tar_map(
         values = contents |>
-          dplyr::mutate(
+          mutate(
            game_id = as.character(.data$game_id),
            tar_data = syms(stringr::str_glue("{name_data}_{game_id}"))
          ),
@@ -238,7 +238,7 @@
       tarchetypes::tar_map(
         values = contents |>
           data.iquizoo::match_preproc(type = "inner") |>
-          dplyr::mutate(
+          mutate(
            game_id = as.character(.data$game_id),
            tar_parsed = syms(stringr::str_glue("{name_parsed}_{game_id}"))
          ),

3 changes: 2 additions & 1 deletion R/zzz.R
@@ -1,5 +1,6 @@
 #' @import rlang
-#' @import tidyselect
+#' @import dplyr
+#' @import tidyr
 NULL
 
 # nocov start

22 changes: 14 additions & 8 deletions man/preproc_data.Rd

Generated file; diff not rendered by default.

2 changes: 1 addition & 1 deletion man/wrangle_data.Rd

Generated file; diff not rendered by default.

40 changes: 40 additions & 0 deletions tests/testthat/_snaps/preproc.md
@@ -38,6 +38,46 @@
       ]
     }
 
+# Deal with `NULL` in parsed data
+
+    {
+      "type": "list",
+      "attributes": {
+        "names": {
+          "type": "character",
+          "attributes": {},
+          "value": ["user_id", "index_name", "score"]
+        },
+        "row.names": {
+          "type": "integer",
+          "attributes": {},
+          "value": [1, 2]
+        },
+        "class": {
+          "type": "character",
+          "attributes": {},
+          "value": ["tbl_df", "tbl", "data.frame"]
+        }
+      },
+      "value": [
+        {
+          "type": "integer",
+          "attributes": {},
+          "value": [1, 3]
+        },
+        {
+          "type": "character",
+          "attributes": {},
+          "value": ["nhit", "nhit"]
+        },
+        {
+          "type": "double",
+          "attributes": {},
+          "value": ["NaN", 1]
+        }
+      ]
+    }
+
 # Can deal with mismatch column types in raw data
 
     {

4 changes: 2 additions & 2 deletions tests/testthat/helper-preproc.R
@@ -1,7 +1,7 @@
 prep_fun <- function(data, .by = NULL) {
   data |>
-    dplyr::group_by(dplyr::pick(dplyr::all_of(.by))) |>
-    dplyr::summarise(
+    group_by(pick(all_of(.by))) |>
+    summarise(
       nhit = mean(.data$nhit[.data$feedback == 1]),
       .groups = "drop"
     )

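As this helper illustrates, `preproc_data()` pipes the row-bound raw data into `fn` and passes the internal id column through `.by`, so a custom index function only needs to accept a `.by` argument and return one row of indices per group. A hypothetical example follows; the `rt` and `feedback` columns are assumptions about the parsed raw data.

my_fun <- function(data, .by = NULL) {
  data |>
    dplyr::group_by(dplyr::pick(dplyr::all_of(.by))) |>
    dplyr::summarise(
      acc = mean(feedback == 1),     # proportion of correct responses
      mrt = mean(rt[feedback == 1]), # mean reaction time on correct trials
      .groups = "drop"
    )
}

# preproc_data(parsed_data, my_fun) would then return the indices "acc" and
# "mrt" in long format, one row per index per observation.
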
The diff for the remaining changed file was not loaded on this page and is not shown above.