diff --git a/DESCRIPTION b/DESCRIPTION index 511ddac..7ab6433 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: tarflow.iquizoo Title: Setup "targets" Workflows for "iquizoo" Data Processing -Version: 3.8.2 +Version: 3.9.0 Authors@R: c( person("Liang", "Zhang", , "psychelzh@outlook.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0001-9041-1150")), @@ -22,23 +22,20 @@ Imports: data.iquizoo (>= 2023.10.22), DBI, dplyr, - jsonlite, memoise, purrr, rlang (>= 1.0.0), stringr, tarchetypes, targets, - tidyr, - utils, - vctrs + tidyr Suggests: bit64, covr, digest, lifecycle, odbc, - preproc.iquizoo (>= 2.4.0), + preproc.iquizoo (>= 2.6.0), RMariaDB (>= 1.3.1), roxygen2, testthat (>= 3.0.0), diff --git a/NAMESPACE b/NAMESPACE index c04d9fa..216bd3a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,13 +3,11 @@ export(check_source) export(fetch_data) export(fetch_iquizoo) -export(preproc_data) export(setup_option_file) export(setup_source) export(setup_templates) export(tar_prep_iquizoo) export(use_targets_pipeline) -export(wrangle_data) import(dplyr) import(rlang) import(tidyr) diff --git a/NEWS.md b/NEWS.md index f146505..9803ce3 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,10 @@ +# tarflow.iquizoo 3.9.0 + +## Breaking Changes + +* Ensure all internal SQL query templates end with a semicolon. This will unavoidably invalidate existing targets for old pipelines. +* Removed `preproc_data()` and `wrangle_data()` functions. Now all data preprocessing is done in the `preproc.iquizoo` package. + # tarflow.iquizoo 3.8.2 ## Enhancements diff --git a/R/preproc.R b/R/preproc.R deleted file mode 100644 index 47b74b1..0000000 --- a/R/preproc.R +++ /dev/null @@ -1,115 +0,0 @@ -#' Wrangle Raw Data -#' -#' Parse raw json string data as [data.frame()] and store them in a list column. -#' -#' @param data The raw data. -#' @param name_raw_json The column name in which stores user's raw data in -#' format of json string. 
-#' @param name_raw_parsed The name used to store parsed data. -#' @return A [data.frame] contains the parsed data. -#' @export -wrangle_data <- function(data, - name_raw_json = "game_data", - name_raw_parsed = "raw_parsed") { - data[[name_raw_parsed]] <- purrr::map( - data[[name_raw_json]], - parse_raw_json - ) - select(data, !all_of(name_raw_json)) -} - -#' Feed Raw Data to Pre-processing -#' -#' Calculate indices using data typically returned by [wrangle_data()]. -#' -#' @details -#' -#' Observations with empty raw data (empty vector, e.g. `NULL`, in -#' `name_raw_parsed` column) are removed before calculating indices. If no -#' observations left after removing, a warning is signaled and `NULL` is -#' returned. -#' -#' @param data A [data.frame] contains raw data. -#' @param fn This can be a function or formula. See [rlang::as_function()] for -#' more details. -#' @param ... Additional arguments passed to `fn`. -#' @param name_raw_parsed The column name in which stores user's raw data in -#' format of a list of [data.frame][data.frame()]s. -#' @param out_name_index The column name used in output storing the name of each -#' calculated index. -#' @param out_name_score The column name used in output storing the value of -#' each calculated index. -#' @return A [data.frame] contains the calculated indices. The index names are -#' stored in the column of `out_name_index`, and index values are stored in -#' the column of `out_name_score`. 
-#' @export -preproc_data <- function(data, fn, ..., - name_raw_parsed = "raw_parsed", - out_name_index = "index_name", - out_name_score = "score") { - data <- filter(data, !purrr::map_lgl(.data[[name_raw_parsed]], is_empty)) - if (nrow(data) == 0) { - warn("No non-empty data found.") - return() - } - fn <- as_function(fn) - data |> - mutate( - calc_indices(.data[[name_raw_parsed]], fn, ...), - .keep = "unused" - ) |> - pivot_longer( - cols = !any_of(names(data)), - names_to = out_name_index, - values_to = out_name_score - ) |> - vctrs::vec_restore(data) -} - -# helper functions -parse_raw_json <- function(jstr) { - parsed <- tryCatch( - jsonlite::fromJSON(jstr), - error = function(cnd) { - warn( - c( - "Failed to parse json string with the following error:", - conditionMessage(cnd), - i = "Will parse it as `NULL` instead." - ) - ) - return() - } - ) - if (is_empty(parsed)) { - return() - } - parsed |> - rename_with(tolower) |> - mutate(across(where(is.character), tolower)) -} - -calc_indices <- function(l, fn, ...) { - # used as a temporary id for each element - name_id <- ".id" - tryCatch( - bind_rows(l, .id = name_id), - error = function(cnd) { - warn( - c( - "Failed to bind raw data with the following error:", - conditionMessage(cnd), - i = "Will try using tidytable package." - ) - ) - check_installed( - "tidytable", - "because tidyr package fails to bind raw data." - ) - tidytable::bind_rows(l, .id = name_id) |> - utils::type.convert(as.is = TRUE) - } - ) |> - fn(.by = name_id, ...) 
|> - select(!all_of(name_id)) -} diff --git a/R/targets.R b/R/targets.R index 100beeb..a99b968 100644 --- a/R/targets.R +++ b/R/targets.R @@ -230,7 +230,7 @@ tar_action_raw_data <- function(contents, targets::tar_target_raw( name_parsed, expr(wrangle_data(tar_data)), - packages = "tarflow.iquizoo" + packages = "preproc.iquizoo" ) ) }, @@ -251,7 +251,7 @@ tar_action_raw_data <- function(contents, .input = input, .extra = extra ) ), - packages = c("tarflow.iquizoo", "preproc.iquizoo") + packages = "preproc.iquizoo" ) ) } @@ -266,6 +266,7 @@ utils::globalVariables( c( "progress_hash", "project_id", "game_id", "tar_data", "tar_parsed", + "wrangle_data", "preproc_data", "prep_fun", "input", "extra" ) ) diff --git a/_pkgdown.yml b/_pkgdown.yml index 9bf9fae..d7b2571 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -7,13 +7,6 @@ reference: contents: - use_targets_pipeline - tar_prep_iquizoo -- title: "Functions Used for Actions on Raw-data" - desc: > - Data parsing and pre-processing. Typically used in combination with - [preproc.iquizoo](https://psychelzh.github.io/preproc.iquizoo/) package. - contents: - - wrangle_data - - preproc_data - title: "Low-level Database operations" desc: Functions to help you interact with database. contents: diff --git a/inst/sql/contents.sql b/inst/sql/contents.sql index 422a495..8dd0bd4 100644 --- a/inst/sql/contents.sql +++ b/inst/sql/contents.sql @@ -15,4 +15,4 @@ FROM INNER JOIN iquizoo_content_db.course_child_config ccc ON ccc.ChildCourseId = cc.Id AND ccc.Deleted <> 1 INNER JOIN iquizoo_content_db.content c2 ON c2.Id = ccc.ContentId AND c2.ContentType <> 4 AND c2.Deleted <> 1 INNER JOIN iquizoo_user_db.base_organization bo ON bo.Id = pcc.OrganizationId AND bo.Deleted <> 1 -WHERE bo.Name = ? AND pcc.Name = IFNULL(?, pcc.Name) +WHERE bo.Name = ? 
AND pcc.Name = IFNULL(?, pcc.Name); diff --git a/inst/sql/raw_data.sql b/inst/sql/raw_data.sql index e76609f..fd82bdb 100644 --- a/inst/sql/raw_data.sql +++ b/inst/sql/raw_data.sql @@ -9,4 +9,4 @@ SELECT FROM iquizoo_business_db.{ table_name } WHERE - ProjectCourseConfigId = ? AND ContentId = ? + ProjectCourseConfigId = ? AND ContentId = ?; diff --git a/inst/sql/scores.sql b/inst/sql/scores.sql index 258c997..2a86b26 100644 --- a/inst/sql/scores.sql +++ b/inst/sql/scores.sql @@ -12,4 +12,4 @@ SELECT FROM iquizoo_business_db.{ table_name } WHERE - ProjectCourseConfigId = ? AND ContentId = ? + ProjectCourseConfigId = ? AND ContentId = ?; diff --git a/man/preproc_data.Rd b/man/preproc_data.Rd deleted file mode 100644 index 2a1e0e1..0000000 --- a/man/preproc_data.Rd +++ /dev/null @@ -1,46 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/preproc.R -\name{preproc_data} -\alias{preproc_data} -\title{Feed Raw Data to Pre-processing} -\usage{ -preproc_data( - data, - fn, - ..., - name_raw_parsed = "raw_parsed", - out_name_index = "index_name", - out_name_score = "score" -) -} -\arguments{ -\item{data}{A \link{data.frame} contains raw data.} - -\item{fn}{This can be a function or formula. See \code{\link[rlang:as_function]{rlang::as_function()}} for -more details.} - -\item{...}{Additional arguments passed to \code{fn}.} - -\item{name_raw_parsed}{The column name in which stores user's raw data in -format of a list of \link[=data.frame]{data.frame}s.} - -\item{out_name_index}{The column name used in output storing the name of each -calculated index.} - -\item{out_name_score}{The column name used in output storing the value of -each calculated index.} -} -\value{ -A \link{data.frame} contains the calculated indices. The index names are -stored in the column of \code{out_name_index}, and index values are stored in -the column of \code{out_name_score}. 
-} -\description{ -Calculate indices using data typically returned by \code{\link[=wrangle_data]{wrangle_data()}}. -} -\details{ -Observations with empty raw data (empty vector, e.g. \code{NULL}, in -\code{name_raw_parsed} column) are removed before calculating indices. If no -observations left after removing, a warning is signaled and \code{NULL} is -returned. -} diff --git a/man/wrangle_data.Rd b/man/wrangle_data.Rd deleted file mode 100644 index 7d6f296..0000000 --- a/man/wrangle_data.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/preproc.R -\name{wrangle_data} -\alias{wrangle_data} -\title{Wrangle Raw Data} -\usage{ -wrangle_data(data, name_raw_json = "game_data", name_raw_parsed = "raw_parsed") -} -\arguments{ -\item{data}{The raw data.} - -\item{name_raw_json}{The column name in which stores user's raw data in -format of json string.} - -\item{name_raw_parsed}{The name used to store parsed data.} -} -\value{ -A \link{data.frame} contains the parsed data. -} -\description{ -Parse raw json string data as \code{\link[=data.frame]{data.frame()}} and store them in a list column. 
-} diff --git a/tests/testthat/_snaps/preproc.md b/tests/testthat/_snaps/preproc.md deleted file mode 100644 index 0552a3f..0000000 --- a/tests/testthat/_snaps/preproc.md +++ /dev/null @@ -1,120 +0,0 @@ -# Basic situation in `preproc_data()` - - { - "type": "list", - "attributes": { - "names": { - "type": "character", - "attributes": {}, - "value": ["user_id", "index_name", "score"] - }, - "row.names": { - "type": "integer", - "attributes": {}, - "value": [1, 2] - }, - "class": { - "type": "character", - "attributes": {}, - "value": ["tbl_df", "tbl", "data.frame"] - } - }, - "value": [ - { - "type": "integer", - "attributes": {}, - "value": [1, 2] - }, - { - "type": "character", - "attributes": {}, - "value": ["nhit", "nhit"] - }, - { - "type": "double", - "attributes": {}, - "value": ["NaN", 1] - } - ] - } - -# Deal with `NULL` in parsed data - - { - "type": "list", - "attributes": { - "names": { - "type": "character", - "attributes": {}, - "value": ["user_id", "index_name", "score"] - }, - "row.names": { - "type": "integer", - "attributes": {}, - "value": [1, 2] - }, - "class": { - "type": "character", - "attributes": {}, - "value": ["tbl_df", "tbl", "data.frame"] - } - }, - "value": [ - { - "type": "integer", - "attributes": {}, - "value": [1, 3] - }, - { - "type": "character", - "attributes": {}, - "value": ["nhit", "nhit"] - }, - { - "type": "double", - "attributes": {}, - "value": ["NaN", 1] - } - ] - } - -# Can deal with mismatch column types in raw data - - { - "type": "list", - "attributes": { - "names": { - "type": "character", - "attributes": {}, - "value": ["user_id", "index_name", "score"] - }, - "row.names": { - "type": "integer", - "attributes": {}, - "value": [1, 2, 3] - }, - "class": { - "type": "character", - "attributes": {}, - "value": ["tbl_df", "tbl", "data.frame"] - } - }, - "value": [ - { - "type": "integer", - "attributes": {}, - "value": [1, 2, 3] - }, - { - "type": "character", - "attributes": {}, - "value": ["nhit", "nhit", "nhit"] - }, 
- { - "type": "double", - "attributes": {}, - "value": ["NaN", 2, 3] - } - ] - } - diff --git a/tests/testthat/_snaps/targets.md b/tests/testthat/_snaps/targets.md index dfd4938..44fc186 100644 --- a/tests/testthat/_snaps/targets.md +++ b/tests/testthat/_snaps/targets.md @@ -3,7 +3,7 @@ { "type": "character", "attributes": {}, - "value": ["contents_origin", "indices_375916542735109", "indices_380173961339781", "indices_381576542159749", "indices_383674715747205", "indices_383679060169477", "indices_383791726646021", "indices_383795602420485", "indices_386196539900677", "progress_hash_534141153280389", "progress_hash_555452035072389", "raw_data_375916542735109", "raw_data_380173961339781", "raw_data_381576542159749", "raw_data_383674715747205", "raw_data_383679060169477", "raw_data_383791726646021", "raw_data_383795602420485", "raw_data_386196539900677", "raw_data_388200929063813", "raw_data_388594665001861", "raw_data_391556354638725", "raw_data_parsed_375916542735109", "raw_data_parsed_380173961339781", "raw_data_parsed_381576542159749", "raw_data_parsed_383674715747205", "raw_data_parsed_383679060169477", "raw_data_parsed_383791726646021", "raw_data_parsed_383795602420485", "raw_data_parsed_386196539900677", "raw_data_parsed_388200929063813", "raw_data_parsed_388594665001861", "raw_data_parsed_391556354638725", "scores_375916542735109", "scores_380173961339781", "scores_381576542159749", "scores_383674715747205", "scores_383679060169477", "scores_383791726646021", "scores_383795602420485", "scores_386196539900677", "scores_388200929063813", "scores_388594665001861", "scores_391556354638725", "users"] + "value": ["contents_origin", "indices_383674715747205", "indices_383679060169477", "indices_383791726646021", "indices_383795602420485", "progress_hash_555452035072389", "raw_data_383674715747205", "raw_data_383679060169477", "raw_data_383791726646021", "raw_data_383795602420485", "raw_data_388200929063813", "raw_data_388594665001861", "raw_data_391556354638725", 
"raw_data_parsed_383674715747205", "raw_data_parsed_383679060169477", "raw_data_parsed_383791726646021", "raw_data_parsed_383795602420485", "raw_data_parsed_388200929063813", "raw_data_parsed_388594665001861", "raw_data_parsed_391556354638725", "scores_383674715747205", "scores_383679060169477", "scores_383791726646021", "scores_383795602420485", "scores_388200929063813", "scores_388594665001861", "scores_391556354638725", "users"] } # Works when single game on different projects diff --git a/tests/testthat/helper-preproc.R b/tests/testthat/helper-preproc.R deleted file mode 100644 index 354db01..0000000 --- a/tests/testthat/helper-preproc.R +++ /dev/null @@ -1,8 +0,0 @@ -prep_fun <- function(data, .by = NULL) { - data |> - group_by(pick(all_of(.by))) |> - summarise( - nhit = mean(.data$nhit[.data$feedback == 1]), - .groups = "drop" - ) -} diff --git a/tests/testthat/test-preproc.R b/tests/testthat/test-preproc.R deleted file mode 100644 index 589ad77..0000000 --- a/tests/testthat/test-preproc.R +++ /dev/null @@ -1,78 +0,0 @@ -test_that("Basic situation for `wrangle_data()`", { - js_str <- r"([{"a": 1, "b": 2}])" - data <- tibble::tibble(game_data = js_str) - wrangle_data(data) |> - expect_silent() |> - expect_named("raw_parsed") |> - purrr::pluck("raw_parsed", 1) |> - expect_identical(jsonlite::fromJSON(js_str)) - wrangle_data(data, name_raw_parsed = "parsed") |> - expect_silent() |> - expect_named("parsed") -}) - -test_that("Can deal with invalid or empty json", { - data_case_invalid <- data.frame(game_data = "[1") - wrangle_data(data_case_invalid) |> - expect_warning("Failed to parse json string") |> - purrr::pluck("raw_parsed", 1) |> - expect_null() - data_case_empty <- data.frame(game_data = c("[]", "{}")) - wrangle_data(data_case_empty) |> - purrr::pluck("raw_parsed") |> - purrr::walk(expect_null) -}) - -test_that("Change names and values to lowercase", { - js_str <- r"([{"A": "A"}, {"A": "B"}])" - data <- tibble::tibble(game_data = js_str) - 
wrangle_data(data) |> - expect_silent() |> - purrr::pluck("raw_parsed", 1) |> - expect_identical(data.frame(a = c("a", "b"))) -}) - -test_that("Basic situation in `preproc_data()`", { - data <- tibble::tibble( - user_id = 1:2, - raw_parsed = list( - data.frame(nhit = 1, feedback = 0), - data.frame(nhit = 1, feedback = 1) - ) - ) - preproc_data(data, fn = prep_fun) |> - expect_silent() |> - expect_snapshot_value(style = "json2") -}) - -test_that("Deal with `NULL` in parsed data", { - tibble::tibble(raw_parsed = list(NULL)) |> - preproc_data(prep_fun) |> - expect_null() |> - expect_warning("No non-empty data found.") - tibble::tibble( - user_id = 1:3, - raw_parsed = list( - data.frame(nhit = 1, feedback = 0), - NULL, - data.frame(nhit = 1, feedback = 1) - ) - ) |> - preproc_data(fn = prep_fun) |> - expect_snapshot_value(style = "json2") -}) - -test_that("Can deal with mismatch column types in raw data", { - skip_if_not_installed("tidytable") - data <- tibble::tibble( - user_id = 1:3, - raw_parsed = list( - data.frame(nhit = 1, feedback = 0), - data.frame(nhit = 2, feedback = 1), - data.frame(nhit = "3", feedback = 1) - ) - ) - preproc_data(data, fn = prep_fun) |> - expect_snapshot_value(style = "json2") |> - expect_warning("Failed to bind raw data") -}) diff --git a/tests/testthat/test-targets.R b/tests/testthat/test-targets.R index 7de3f45..e15cfa9 100644 --- a/tests/testthat/test-targets.R +++ b/tests/testthat/test-targets.R @@ -53,7 +53,6 @@ test_that("Workflow works", { library(targets) params <- tibble::tribble( ~organization_name, ~project_name, - "北京师范大学测试用账号", "难度测试", "北京师范大学", "4.19-4.20夜晚睡眠test" ) tar_prep_iquizoo(params)