diff --git a/.covrignore b/.covrignore index c828d73..5d32955 100644 --- a/.covrignore +++ b/.covrignore @@ -2,3 +2,4 @@ R/globals.R R/import.R R/participants.R R/connect.R +R/responses.R \ No newline at end of file diff --git a/NEWS.md b/NEWS.md index 4381d5a..09955bc 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,6 +4,8 @@ * Add examples and tests for `get_time_stamp()` and `get_doe()` * Add documentation for new version of `bvq_norms()` in its vignette * Use .covrignore +* Replace `group_by()`/`ungroup()` for experimental argument `.by` in dplyr functions + # bvq 0.3.0 diff --git a/R/logs.R b/R/logs.R index c0efdde..039f469 100644 --- a/R/logs.R +++ b/R/logs.R @@ -7,7 +7,7 @@ #' many times a participant has been sent the questionnaire, independently of #' whether a response was obtained from them later. #' -#' @import dplyr +#' @import dplyr #' @importFrom lubridate as_date #' @importFrom lubridate today #' @importFrom lubridate as_datetime @@ -57,63 +57,61 @@ bvq_logs <- function(participants = NULL, other_threshold = 0.10, ...) { - suppressMessages({ - - # get participant information - if (is.null(participants)) participants <- bvq_participants() - if (is.null(responses)) responses <- bvq_responses(participants) - - # get n items answered by participants (depends on the questionnaire version) - total_items <- studies %>% - distinct(version, language, n) %>% - summarise(total_items = sum(n), - .by = version) - - grouping_vars <- c("id", "date_birth", "time", - "edu_parent1", "edu_parent2", - "date_birth", "date_started", "date_finished", - "doe_spanish", "doe_catalan", - "doe_others", "date_birth", "code", "study", - "version") - - vars <- c("code", "time", "study", "version", "age", - "date_birth", "date_started", "date_finished", - "duration", "dominance", "lp", - "edu_parent1", "edu_parent2", - "doe_spanish", "doe_catalan", "doe_others", "completed") - - # generate logs - logs <- responses %>% - # total items to fill by each participant (varies across versions) - summarise(complete_items = sum(!is.na(response)), - .by = one_of(grouping_vars)) %>% - left_join(total_items, - by = join_by(version)) %>% - left_join(select(participants, -c(date_birth, version)), - by = join_by(id, time, code, study)) %>% - drop_na(id) %>% - mutate( - # define language profiles based on thresholds - lp = case_when(doe_catalan >= bilingual_threshold ~ "Monolingual", - doe_spanish >= bilingual_threshold ~ "Monolingual", - doe_others > other_threshold ~ "Other", - .default = "Bilingual"), - # define language dominance - dominance = case_when(doe_catalan > doe_spanish ~ "Catalan", - doe_spanish > doe_catalan ~ "Spanish", - doe_catalan == doe_spanish ~ sample(c("Catalan", "Spanish"), 1)), - age = diff_in_time(date_finished, date_birth, "months"), - duration = diff_in_time(date_finished, date_started, "days") - ) %>% - # compute participant's progress through the questionnaire - rowwise() %>% - mutate(progress = complete_items / total_items, - completed = progress >= 0.95) %>% - ungroup() %>% - # select relevant columns and reorder them - select(id, one_of(vars)) %>% - arrange(desc(date_finished)) - }) + # get participant information + if (is.null(participants)) participants <- bvq_participants() + if (is.null(responses)) responses <- bvq_responses(participants) + + # get n items answered by participants (depends on the questionnaire version) + total_items <- studies %>% + distinct(version, language, n) %>% + summarise(total_items = sum(n), + .by = version) + + grouping_vars <- c("id", "date_birth", "time", + "edu_parent1", "edu_parent2", + "date_birth", "date_started", "date_finished", + "doe_spanish", "doe_catalan", + "doe_others", "date_birth", "code", "study", + "version") + + vars <- c("code", "time", "study", "version", "age", + "date_birth", "date_started", "date_finished", + "duration", "dominance", "lp", + "edu_parent1", "edu_parent2", + "doe_spanish", "doe_catalan", "doe_others", "completed") + + # generate logs + logs <- responses %>% + # total items to fill by each participant (varies across versions) + summarise(complete_items = sum(!is.na(response)), + .by = one_of(grouping_vars)) %>% + left_join(total_items, + by = join_by(version)) %>% + left_join(select(participants, -c(date_birth, version)), + by = join_by(id, time, code, study)) %>% + drop_na(id) %>% + mutate( + # define language profiles based on thresholds + lp = case_when( + doe_catalan >= bilingual_threshold ~ "Monolingual", + doe_spanish >= bilingual_threshold ~ "Monolingual", + doe_others > other_threshold ~ "Other", + .default = "Bilingual" + ), + # define language dominance + dominance = case_when( + doe_catalan > doe_spanish ~ "Catalan", + doe_spanish > doe_catalan ~ "Spanish", + doe_catalan == doe_spanish ~ sample(c("Catalan", "Spanish"), 1)), + age = diff_in_time(date_finished, date_birth, "months"), + duration = diff_in_time(date_finished, date_started, "days") + ) %>% + # compute participant's progress through the questionnaire + mutate(progress = complete_items / total_items, + completed = progress >= 0.95) %>% + # select relevant columns and reorder them + select(id, one_of(vars)) %>% + arrange(desc(date_finished)) return(logs) } diff --git a/R/responses.R b/R/responses.R index 22d26e1..ba02479 100644 --- a/R/responses.R +++ b/R/responses.R @@ -51,52 +51,33 @@ bvq_responses <- function(participants = NULL, longitudinal = "all", ...) { - bvq_connect() # get credentials to Google and formr - - # get participant information - if (is.null(participants)) participants <- bvq_participants() - # retrieve data from formr formr2 <- import_formr2() # formr2 formr_lockdown <- import_formr_lockdown() # formr-lockdown formr_short <- import_formr_short() # formr-lockdown - # merge data - suppressMessages({ - - cbc_studies <- c("CBC", "Signs", "Negation", "Inhibition") - - responses <- list(formr1, formr2, formr_short, formr_lockdown) %>% - bind_rows() %>% - distinct(id, code, item, .keep_all = TRUE) %>% - mutate(across(c(starts_with("date_"), time_stamp), as_date), - date_finished = coalesce(time_stamp, date_finished), - version = case_when( - study %in% "DevLex" ~ "DevLex", - study %in% cbc_studies ~ "CBC", - .default = version - ), - time = ifelse(is.na(time), 1, time), - version = trimws(version, whitespace = "[\\h\\v]"), - ) %>% - fix_item() %>% - fix_doe() %>% - mutate(across(starts_with("doe_"), - function(x) x / 100)) %>% - fix_sex() %>% - mutate(study = ifelse(is.na(study), "BiLexicon", study)) %>% - fix_id_exp() %>% - drop_na(date_finished) %>% - get_longitudinal(longitudinal = longitudinal) %>% - arrange(desc(date_finished)) %>% - select(id, time, code, study, - version, randomisation, - starts_with("date_"), - item, response, sex, - starts_with("doe_"), - starts_with("edu_")) - }) - + responses <- list(formr1, formr2, formr_short, formr_lockdown) %>% + bind_rows() %>% + distinct(id, code, item, .keep_all = TRUE) %>% + mutate(across(c(starts_with("date_"), time_stamp), as_date), + date_finished = coalesce(time_stamp, date_finished), + time = ifelse(is.na(time), 1, time), + version = trimws(version, whitespace = "[\\h\\v]")) %>% + fix_item() %>% + fix_doe() %>% + mutate(across(starts_with("doe_"), function(x) x / 100)) %>% + fix_sex() %>% + mutate(study = ifelse(is.na(study), "BiLexicon", study)) %>% + fix_id_exp() %>% + drop_na(date_finished) %>% + get_longitudinal(longitudinal = longitudinal) %>% + arrange(desc(date_finished)) %>% + select(id, time, code, study, + version, randomisation, + starts_with("date_"), + item, response, sex, + starts_with("doe_"), + starts_with("edu_")) return(responses) } diff --git a/R/utils.R b/R/utils.R index 3d4ccef..09119bb 100644 --- a/R/utils.R +++ b/R/utils.R @@ -88,12 +88,19 @@ get_doe <- function(...) { #' #' @param x Vector of `code` whose values should be fixed. #' @author Gonzalo Garcia-Castro +#' +#' @examples +#' code_vctr <- c("BL0123", "bl0123", "Bl0123", "BLBL0123", +#' "bi0123", "blo123", "B0123", "BI0123", +#' " BL0123") +#' +#' fix_code(code_vctr) fix_code <- function(x) { x <- toupper(trimws(x)) - x <- gsub(".*BL", "", x) x <- gsub("O", "0", x) x <- gsub("I", "L", x) - x <- ifelse(!grepl("BL", x), paste0("BL", x), x) + x <- gsub("[^\\d]+", "", x, perl=TRUE) + x <- paste0("BL", x) return(x) } @@ -163,24 +170,11 @@ fix_sex <- function(x) { # nocov start "Female", x$sex) - x$sex <- ifelse(x$id_bvq %in% c("bilexicon_1447"), "Male", x$sex) + x$sex <- ifelse(x$id_bvq=="bilexicon_1447", "Male", x$sex) return(x) } # nocov end -#' Fix postcode -#' -#' @param x Vector of `postcode` whose values should be fixed -#' @author Gonzalo Garcia-Castro -fix_postcode <- function(x) { - - pcd <- x$postcode - pcd <- ifelse(nchar(pcd) < 5, paste0("0", pcd), pcd) - pcd <- ifelse(nchar(pcd) < 5, NA_character_, pcd) - x$postcode <- pcd - - return(x) -} #' Fix item #' @@ -232,11 +226,9 @@ fix_id_exp <- function(x) { # nocov start #' * `"first"` returns the first response of each participant (participants with only one appearance are #' included). #' * `"last"` returns the last response from each participant (participants with only one response are included). -#' @importFrom dplyr group_by #' @importFrom dplyr distinct #' @importFrom dplyr n #' @importFrom dplyr filter -#' @importFrom dplyr ungroup #' @returns A subset of the data frame `x` with only the selected cases, #' according to `longitudinal`. #' @author Gonzalo Garcia-Castro @@ -244,6 +236,9 @@ fix_id_exp <- function(x) { # nocov start #' id <- c(1, 1, 1, 2, 2, 3, 4, 4, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10) #' sums <- rle(sort(id))[["lengths"]] #' dat <- data.frame(id, time = unlist(sapply(sums, function(x) seq(1, x)))) +#' +#' (dat) +#' #' get_longitudinal(dat, "first") #' get_longitudinal(dat, "only") get_longitudinal <- function(x, longitudinal = "all") { @@ -251,30 +246,16 @@ get_longitudinal <- function(x, longitudinal = "all") { longitudinal_opts <- c("all", "no", "first", "last", "only") if (!(longitudinal %in% longitudinal_opts) && interactive()) { - cli_abort(paste0("longitudinal must be one of: ", - paste0(longitudinal_opts, collapse = ", "))) - } - - repeated <- distinct(x, id, time) %>% - group_by(id) %>% - filter(n() > 1) %>% - ungroup() - - if (longitudinal == "no") x <- x[!(x$id %in% repeated$id), ] - - if (longitudinal == "first") { - x <- group_by(x, id) %>% - filter(time == min(time, na.rm = TRUE)) %>% - ungroup() + long_colapsed <- paste0(longitudinal_opts, collapse = ", ") + cli_abort(paste0("longitudinal must be one of: ", long_colapsed)) } - if (longitudinal == "last") { - x <- group_by(x, id) %>% - filter(time == max(time, na.rm = TRUE)) %>% - ungroup() - } + repeated <- filter(distinct(x, id, time), n() > 1, .by = id) - if (longitudinal == "only") x <- x[x$id %in% repeated$id, ] + if (longitudinal == "no") x <- filter(x, !(id %in% repeated$id)) + if (longitudinal == "first") x <- filter(x, time==min(time), .by = id) + if (longitudinal == "last") x <- filter(x, time==max(time), .by = id) + if (longitudinal == "only") x <- filter(x, id %in% repeated$id) return(x) } diff --git a/R/vocabulary.R b/R/vocabulary.R index 47b521b..e94361e 100644 --- a/R/vocabulary.R +++ b/R/vocabulary.R @@ -26,8 +26,7 @@ #' `"prop"`. If `"prop"` (default), vocabulary metrics are calculated as #' proportions. If `"count"`, vocabulary metrics are reported as counts #' (number of words). -#' @param ... Extra arguments that will be passed to the `bvq_responses`. -#' function. +#' @param ... Extra arguments that will be passed to [bvq::bvq_responses()]`. #' @returns A dataset (actually, a [tibble::tibble] with each participant's #' comprehensive and/or vocabulary size in each language. This data frame #' contains the following variables: @@ -170,8 +169,8 @@ bvq_vocabulary <- function(participants, by = join_by(id, time, type, !!!.by)) %>% mutate(across(matches("concept|te"), function(x) ifelse(is.na(x), as.integer(0), x))) %>% - select(any_of(c("id", "time", "age", "type", .by)), matches(.scale)) %>% - select(-ends_with(which_col_not)) + select(any_of(c("id", "time", "age", "type", .by)), + matches(.scale), (-ends_with(which_col_not))) return(vocabulary) diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index d844eba..42e8fa5 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -41,7 +41,7 @@ test_that("get_doe works", { doe_other = 1-get_doe(matches("cat|spa")), doe_cat = get_doe(doe_cat_1, doe_cat_2), doe_spa = get_doe(matches("spa"))) - + expect_true(all(vapply(y, class, "character")=="numeric")) expect_true(all(sapply(y, \(x) (x>=0) & (x <= 1)))) expect_equal(y$doe_other, 1-rowSums(x)) @@ -49,7 +49,7 @@ test_that("get_doe works", { expect_equal(y$doe_spa, rowSums(x[, c("doe_spa_1", "doe_spa_2")])) }) -test_that("get_longitudinal works correctly", { +test_that("get_longitudinal works", { id <- c(1, 1, 1, 2, 2, 3, 4, 4, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10) sums <- rle(sort(id))[["lengths"]] dat <- data.frame(id, time = unlist(sapply(sums, function(x) seq(1, x)))) @@ -62,3 +62,13 @@ test_that("get_longitudinal works correctly", { expect_identical(get_longitudinal(dat, "last")$id, unique(dat$id)) }) + + +test_that("fix_code works", { + code_vctr <- c("BL0123", "bl0123", "Bl0123", "BLBL0123", + "bi0123", "blo123", "B0123", "BI0123", + " BL0123") + + expect_equal(unique(fix_code(code_vctr)), "BL0123") +}) +