diff --git a/DESCRIPTION b/DESCRIPTION index 308ca5a..fe5990b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,60 +1,61 @@ -Package: bvq -Title: Barcelona Vocabulary Questionnaire Database and Helper Functions -Version: 0.4.0 -Authors@R: c( - person(given = "Gonzalo", family = "Garcia-Castro", - email = "gongarciacastro@gmail.com", - role = c("cre", "aut"), - comment = c(ORCID = "0000-0002-8553-4209")), - person(given = "Daniela S. ", family = "Ávila-Varela", - email = "avila.varela.daniela@gmail.com", - role = "aut", - comment = c(ORCID = "0000-0002-3518-8117")), - person(given = "Nuria", - family = "Sebastian-Galles", - email = "nuria.sebastian@upf.edu", - role = "ctb", - comment = c(ORCID = "0000-0001-6938-2498"))) -Maintainer: Gonzalo Garcia-Castro -Description: Download, clean, and process the Barcelona Vocabulary - Questionnaire (BVQ) data. BVQ is a vocabulary inventory developed for - assesing the vocabulary of Catalan-Spanish bilinguals infants from the - Metropolitan Area of Barcelona (Spain). This - package includes functions to download the data from formr servers, - and return the processed data in multiple formats. -License: MIT + file LICENSE -URL: https://gongcastro.github.io/bvq/, - https://github.com/gongcastro/bvq/ -BugReports: https://github.com/gongcastro/bvq/issues -Depends: - R (>= 3.5.0), -Imports: - cli (>= 3.6.1), - dplyr (>= 1.1.0), - formr (>= 0.9.1), - googlesheets4 (>= 1.0.0), - janitor (>= 2.2.0), - lifecycle (>= 1.0.3), - lubridate (>= 1.8.0), - magrittr (>= 2.0.3), - rlang (>= 1.1.1), - tibble (>= 3.2.1), - tidyr (>= 1.2.0) -Suggests: - knitr, - readxl, - rmarkdown, - roxygen2, - testthat (>= 3.0.0), - ggplot2 (>= 3.4.2), - withr -VignetteBuilder: - knitr -Remotes: - github::rubenarslan/formr -Config/testthat/edition: 3 -Encoding: UTF-8 -Language: en-US -LazyData: true -Roxygen: list(markdown = TRUE) -RoxygenNote: 7.2.3 +Package: bvq +Title: Barcelona Vocabulary Questionnaire Database and Helper Functions +Version: 0.4.0 +Authors@R: c( + person(given = "Gonzalo", family = "Garcia-Castro", + email = "gongarciacastro@gmail.com", + role = c("cre", "aut"), + comment = c(ORCID = "0000-0002-8553-4209")), + person(given = "Daniela S. ", family = "Ávila-Varela", + email = "avila.varela.daniela@gmail.com", + role = "aut", + comment = c(ORCID = "0000-0002-3518-8117")), + person(given = "Nuria", + family = "Sebastian-Galles", + email = "nuria.sebastian@upf.edu", + role = "ctb", + comment = c(ORCID = "0000-0001-6938-2498"))) +Maintainer: Gonzalo Garcia-Castro +Description: Download, clean, and process the Barcelona Vocabulary + Questionnaire (BVQ) data. BVQ is a vocabulary inventory developed for + assesing the vocabulary of Catalan-Spanish bilinguals infants from the + Metropolitan Area of Barcelona (Spain). This + package includes functions to download the data from formr servers, + and return the processed data in multiple formats. +License: MIT + file LICENSE +URL: https://gongcastro.github.io/bvq/, + https://github.com/gongcastro/bvq/ +BugReports: https://github.com/gongcastro/bvq/issues +Depends: + R (>= 3.5.0), +Imports: + cli (>= 3.6.1), + dplyr (>= 1.1.0), + formr (>= 0.9.1), + googlesheets4 (>= 1.0.0), + janitor (>= 2.2.0), + lifecycle (>= 1.0.3), + lubridate (>= 1.8.0), + magrittr (>= 2.0.3), + purrr (>= 1.0.1), + rlang (>= 1.1.1), + tibble (>= 3.2.1), + tidyr (>= 1.2.0) +Suggests: + knitr, + readxl, + rmarkdown, + roxygen2, + testthat (>= 3.0.0), + ggplot2 (>= 3.4.2), + withr +VignetteBuilder: + knitr +Remotes: + github::rubenarslan/formr +Config/testthat/edition: 3 +Encoding: UTF-8 +Language: en-US +LazyData: true +Roxygen: list(markdown = TRUE) +RoxygenNote: 7.2.3 diff --git a/NAMESPACE b/NAMESPACE index 4525107..d2bc6e9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -26,22 +26,17 @@ importFrom(cli,cli_progress_step) importFrom(cli,cli_progress_update) importFrom(cli,qty) importFrom(dplyr,across) -importFrom(dplyr,arrange) importFrom(dplyr,case_when) importFrom(dplyr,distinct) importFrom(dplyr,filter) importFrom(dplyr,if_any) -importFrom(dplyr,left_join) importFrom(dplyr,mutate) importFrom(dplyr,n) -importFrom(dplyr,select) -importFrom(dplyr,summarise) importFrom(formr,formr_connect) importFrom(formr,formr_raw_results) importFrom(googlesheets4,gs4_auth) importFrom(googlesheets4,gs4_has_token) importFrom(googlesheets4,read_sheet) -importFrom(janitor,clean_names) importFrom(janitor,make_clean_names) importFrom(lifecycle,deprecated) importFrom(lubridate,as_date) @@ -49,13 +44,14 @@ importFrom(lubridate,as_datetime) importFrom(lubridate,time_length) importFrom(lubridate,today) importFrom(magrittr,"%>%") +importFrom(purrr,map) +importFrom(purrr,set_names) importFrom(rlang,":=") importFrom(rlang,.data) importFrom(rlang,as_label) importFrom(rlang,as_name) importFrom(rlang,enquo) importFrom(rlang,enquos) -importFrom(stats,time) importFrom(tibble,tibble) importFrom(tidyr,pivot_longer) importFrom(tidyr,pivot_wider) diff --git a/R/globals.R b/R/globals.R index 2af1c70..1f62095 100644 --- a/R/globals.R +++ b/R/globals.R @@ -16,6 +16,8 @@ utils::globalVariables(unique(c( "date_birth", "date_sent", "date_test", + "x", + "time", "link", # bvq_responses: "response_id", diff --git a/R/import.R b/R/import.R index f6f1529..d23aef6 100644 --- a/R/import.R +++ b/R/import.R @@ -227,9 +227,9 @@ fix_logs_df <- function(raw, participants_tmp) { # variables to correct types logs[c("created", "ended")] <- lapply(logs[c("created", "ended")], as.POSIXct) # remove if missing any critical variable - logs$code <- fix_response_id(ifelse(logs$response_id=="", - NA_character_, - logs$response_id)) + logs$response_id <- fix_response_id(if_else(logs$response_id=="", + NA_character_, + gsub("BL", "", logs$response_id))) logs <- logs[!is.na(logs$response_id) & !is.na(logs$ended) & !is.na(logs$session), ] # fix codes known to be wrong logs <- fix_code_raw(logs) diff --git a/R/logs.R b/R/logs.R index f7ee111..f291e9e 100644 --- a/R/logs.R +++ b/R/logs.R @@ -132,7 +132,7 @@ bvq_logs <- function(participants = bvq_participants(), age = diff_in_time(date_finished, date_birth, "months"), duration = diff_in_time(date_finished, date_started, "days") ) %>% - # compute participant's progreslos through the questionnaire + # compute participant's progres through the questionnaire mutate(progress = complete_items / total_items, completed = progress >= 0.95) %>% # select relevant columns and reorder them diff --git a/R/norms.R b/R/norms.R index 897c55f..50a832c 100644 --- a/R/norms.R +++ b/R/norms.R @@ -8,17 +8,13 @@ #' #' @export bvq_norms #' -#' @importFrom dplyr mutate -#' @importFrom dplyr filter -#' @importFrom dplyr select -#' @importFrom dplyr left_join -#' @importFrom dplyr summarise -#' @importFrom dplyr arrange +#' @import dplyr #' @importFrom tidyr pivot_longer +#' #' @param responses Responses data frame, as generated by [bvq::bvq_responses()]. #' @param participants Participants data frame, as generated by #' [bvq::bvq_participants()] -#' @param ... <[`dynamic-dots`][rlang::dyn-dots]>. Unquote name of the variable(s) to group +#' @param ... <[`dynamic-dots`][rlang::dyn-dots]>. Unquoted name of the variable(s) to group #' data into. Norms will be calculated by aggregating responses #' within the groups that result from the combination of crossing of the #' variables provided in `...`. These variables can refer to item properties @@ -82,10 +78,7 @@ bvq_norms <- function(participants = bvq_participants(), ..., te = NULL, item = NULL, - age = c(0, 100)) { - - if (is.null(participants)) participants <- bvq_participants() - if (is.null(responses)) responses <- bvq_responses(participants) + age = c(0, Inf)) { # collect ... into a character vector for `any_of` dots_vctr <- as.character(match.call(expand.dots = FALSE)$`...`) @@ -94,24 +87,29 @@ bvq_norms <- function(participants = bvq_participants(), # retrieve participants and logs ------------------------------------------- - logs_tmp <- bvq_logs(participants = participants, responses = responses) - cols.keep <- colnames(logs_tmp) %in% c("child_id", "time", "dominance", group_vars) - logs_tmp <- logs_tmp[, cols.keep] - logs_tmp$age <- floor(logs_tmp$age) - logs_tmp <- logs_tmp[logs_tmp$age >= min(age) & logs_tmp$age <= max(age), ] + logs_tmp <- bvq_logs(participants = participants, + responses = responses) %>% + select(child_id, time, dominance, any_of(group_vars)) %>% + mutate(age = floor(age)) %>% + dplyr::filter(between(age, min(.env$age), max(.env$age))) - cols.keep <- colnames(bvq::pool) %in% c("language", group_vars) - pool_tmp <- bvq::pool[, cols.keep] + pool_tmp <- select(bvq::pool, language, any_of(group_vars)) # check arguments ---------------------------------------------------------- if (is.null(item)) { + item <- unique(responses$item) + } else { + item_in_pool <- item %in% unique(pool_tmp$item) + if (!all(item_in_pool)) { - item_not_in_pool <- paste0(item[which(!item_in_pool)], collapse = ", ") - cli_abort("item(?s) `{item_not_in_pool}` {?does/do} not exist in pool") + + item_not_in_pool <- paste0(item[which(!item_in_pool)], + collapse = ", ") + cli::cli_abort("item(?s) `{item_not_in_pool}` {?does/do} not exist in pool") } } @@ -125,32 +123,27 @@ bvq_norms <- function(participants = bvq_participants(), # compute norms ------------------------------------------------------------ - responses_tmp <- responses[responses$item %in% item & - !is.na(responses$response), ] - responses_tmp$understands <- responses_tmp$response > 1 - responses_tmp$produces <- responses_tmp$response > 2 - responses_tmp <- responses_tmp[, colnames(responses_tmp)!="response"] - responses_tmp <- tidyr::pivot_longer(responses_tmp, - c(understands, produces), - names_to = "type", - values_to = "response") - - - norms <- merge(responses_tmp, logs_tmp) - norms <- merge(norms, pool_tmp, all.x = TRUE) - - norms$item_dominance <- ifelse(norms$language==norms$dominance, "L1", "L2") - norms <- summarise(norms, - .sum = sum(response, na.rm = TRUE), - .n = n(), - .by = any_of(group_vars)) - norms$.prop <- prop_adj(norms$.sum, norms$.n) - norms <- norms[order(norms$te, - norms$item, - norms$type, - norms$age, - decreasing = TRUE), , drop = FALSE] - norms <- tibble::as_tibble(norms) + responses_tmp <- responses %>% + filter(item %in% .env$item, + !is.na(response)) %>% + mutate(understands = response > 1, + produces = response > 2) %>% + select(-response) %>% + tidyr::pivot_longer(c(understands, produces), + names_to = "type", + values_to = "response") + + suppressMessages({ + norms <- list(responses_tmp, logs_tmp, pool_tmp) %>% + reduce(inner_join, + relationship = "many-to-many") %>% + mutate(item_dominance = if_else(language==dominance, "L1", "L2")) %>% + summarise(.sum = sum(response, na.rm = TRUE), + .n = n(), + .by = any_of(group_vars)) %>% + mutate(.prop = prop_adj(.sum, .n)) %>% + arrange(te, item, type, age) + }) return(norms) } @@ -172,32 +165,47 @@ bvq_norms <- function(participants = bvq_participants(), #' @keywords internal #' check_arg_te <- function(te, item) { # nocov start + if (is.logical(te)) { + if (te) { item <- pool$item[pool$te %in% pool$te[pool$item %in% item]] return(item) } + } else if (is.numeric(te)) { + te_in_pool <- te %in% unique(pool$te) if (!all(te_in_pool)) { + te_not_in_pool <- te[which(!te_in_pool)] - cli_abort("te{?s} {te_not_in_pool} {?does/do} not exist in pool") + cli::cli_abort("te{?s} {te_not_in_pool} {?does/do} not exist in pool") + } else { + items_not_in_te <- item %in% pool$item[pool$te %in% te] + if (!all(items_not_in_te)) { + which_items_not_in_te <- item[which(!items_not_in_te)] error_msg <- "Item{?s} '{which_items_not_in_te}' {?was/were} not included in `te`. \\ Its norms will not be returned. To compute the norms for all `items` \\ and their translations, set {.code te = TRUE}." - cli_alert_warning(error_msg) + cli::cli_alert_warning(error_msg) } + item <- pool$item[pool$te %in% te] + return(item) } + } else { + if (!is.null(te)) { - cli_abort("`te` must have class {.cls logical} or {.cls integer}") + + cli::cli_abort("`te` must have class {.cls logical} or {.cls integer}") } + } return(item) } # nocov end diff --git a/R/participants.R b/R/participants.R index 9f9bcdf..26f2fad 100644 --- a/R/participants.R +++ b/R/participants.R @@ -50,40 +50,35 @@ #' #' @md bvq_participants <- function(...) { + bvq_connect() # get credentials to Google and formr # download Sheets suppressMessages({ - ss <- "164DMKLRO0Xju0gdfkCS3evAq9ihTgEgFiuJopmqt7mo" - x <- googlesheets4::read_sheet(ss, sheet = "Participants") + sheet <- read_sheet(ss = "164DMKLRO0Xju0gdfkCS3evAq9ihTgEgFiuJopmqt7mo", + sheet = "Participants", + col_types = "cccciDncccDDclcc", + .name_repair = janitor::make_clean_names) }) - # change classes - cols.dates <- grepl("date_", names(x)) - x[, cols.dates] <- lapply(x[, cols.dates], as.Date) - x[, "include"] <- lapply(x[, "include"], as.logical) - - # filter rows - cols.keep <- !(names(x) %in% c("link", "comments", "include", - "id_bvq", "id_exp", "study")) - x <- subset(x, !is.na(x$code) & x$include, cols.keep) - names(x)[names(x)=="randomisation"] <- "version_list" - names(x)[names(x)=="id"] <- "child_id" - names(x)[names(x)=="code"] <- "response_id" - - # reorder rows - code.sorted <- as.numeric(gsub("BL", "", x$response_id)) - x <- x[order(code.sorted, decreasing = TRUE), , drop = FALSE] - - # fix version values - x$version <- gsub("bl-", "", tolower(x$version)) + participants <- sheet %>% + filter(!is.na(code), + include) %>% + select(child_id = id, response_id = code, + time, date_birth, date_sent, version, + version_list = randomisation, call) %>% + mutate(response_id = gsub("BL", "", response_id), + version = gsub("bl-", "", tolower(version))) %>% + # reorder rows + arrange(desc(as.numeric(response_id))) # make sure no columns are lists # (probably due to inconsistent cell types) - is_col_list <- vapply(x, is.list, logical(1)) + is_col_list <- vapply(participants, is.list, logical(1)) if (any(is_col_list)) { col <- names(which(is_col_list)) - cli_abort("{col} {?has/have} class {.cls list}") + cli::cli_abort("{col} {?has/have} class {.cls list}") } - return(x) + + return(participants) } diff --git a/R/responses.R b/R/responses.R index a72dda1..631264c 100644 --- a/R/responses.R +++ b/R/responses.R @@ -5,7 +5,8 @@ #' (the output of [bvq::bvq_participants()]) and `runs` (a character vector that can #' take zero, one, or multiple of the following values: `"formr2"`, #' `"formr-short"`, `"formr-lockdown"`) as arguments. -#' @importFrom stats time +#' @importFrom purrr map +#' @importFrom purrr set_names #' #' @export bvq_responses #' @@ -24,16 +25,16 @@ #' versions of BVQ, such as bvq-short, bvq-long, bvq-lockdown, or bvq-1.0.0, where the #' list of items was partitioned into several versions.#' * item: character string indicating the item identifier (e.g., `spa_mesa`). This value is unique for each item. Responses to the same item from different participants are linked by the same `item` value. #' * response: integer indicating the participant's response to a give item: `1` if `"No"` (the participant does not understand or produce the word), `2` if "Understands" (the participants understands the word), or `3` if "Understands and Says" (the participant understands and produces the item). -#' * date_birth: a date value (see lubridate package) in `yyyy/mm/dd` format indicating participants birth date. -#' * date_started: a date value (see lubridate package) in `yyyy/mm/dd` format indicating when participants logged to the questionnaire for the first time. -#' * date_finished: a date value (see lubridate package) in `yyyy/mm/dd` format indicating when participants logged to the questionnaire for the last time. +#' * date_birth: [lubridate::Date] indicating participants birth date. +#' * date_started: [lubridate::Date] indicating when participants logged to the questionnaire for the first time. +#' * date_finished: [lubridate::Date] indicating when participants logged to the questionnaire for the last time. #' * sex: a character string indicating participants' biological sex, as reported by the parents. -#' * edu_parent1: a character string indicating the educational attainment of one of the parents/caretakers. -#' * edu_parent2: a character string indicating the educational attainment of the other parent/caretaker, if any. #' * doe_spanish: a numeric value ranging from 0 to 1 indicating participants' daily exposure to Spanish, as estimated by parents/caregivers This value aggregates participants' exposure to any variant of Spanish (e.g., European and American Spanish). #' * doe_catalan: a numeric value ranging from 0 to 1 indicating participants' daily exposure to Catalan, as estimated by parents/caregivers This value aggregates participants' exposure to any variant of Catalan (e.g., Catalan from Majorca or Barcelona). #' * doe_others: a numeric value ranging from 0 to 1 indicating participants' daily exposure to languages other than Spanish or Catalan, as estimated by parents/caretakers, aggregating participants' exposure to all those other languages (e.g., Norwegian, Arab, Swahili). -#' +#' * edu_parent1: a character string indicating the educational attainment of one of the parents/caretakers. +#' * edu_parent2: a character string indicating the educational attainment of the other parent/caretaker, if any. + #' @author Gonzalo Garcia-Castro #' #' @examples @@ -44,48 +45,29 @@ #' @md bvq_responses <- function(participants = bvq_participants()) { - bvq.list <- lapply(names(get_bvq_runs()), - function(x) collect_survey(x, participants)) - names(bvq.list) <- names(get_bvq_runs()) - - # merge dataframes - responses <- do.call(dplyr::bind_rows, bvq.list) - - # remove duplicated combinations of child_id, response_id, and item - cols.undup <- c("child_id", "response_id", "item") - responses <- responses[!duplicated(responses[cols.undup]), , drop = FALSE] - - # fix date columns - date.cols <- grepl("^date_", colnames(responses)) - responses[, date.cols] <- lapply(responses[, date.cols], - as.Date, - origin = c("1970-01-01")) - - # fix other variables - responses$time <- ifelse(is.na(responses$time), 1, responses$time) - responses$version <- trimws(responses$version, whitespace = "[\\h\\v]") - responses <- fix_item(responses) - responses <- fix_doe(responses) - responses <- fix_sex(responses) - - # fix degree of exposure columns - doe.cols <- grepl("^doe_", colnames(responses)) - responses[, doe.cols] <- lapply(responses[, doe.cols], function(x) x / 100) - - # remove rows with missing date_finished and reorder - responses <- responses[!is.na(responses$date_finished), ] - - # reorder dataframe and select columns - responses <- responses[order(responses$date_finished, decreasing = TRUE), , drop = FALSE] - cols.keep <- c( - "child_id", "response_id", "time", - "version", "version_list", - colnames(responses)[grepl("^date_", colnames(responses))], - "item", "response", "sex", - colnames(responses)[grepl("^doe_", colnames(responses))], - colnames(responses)[grepl("^edu_", colnames(responses))] - ) - responses <- responses[, cols.keep] + responses <- map(names(get_bvq_runs()), + function(x) collect_survey(x, participants)) %>% + bind_rows() %>% + # remove duplicated combinations + distinct(child_id, response_id, item, .keep_all = TRUE) %>% + # fix variables + fix_item() %>% + fix_doe() %>% + fix_sex() %>% + mutate(across(matches("date_"), + function(x) as.Date(x, origin = "1970-01-01")), + time = if_else(is.na(time), 1, time), + version = trimws(version, whitespace = "[\\h\\v]"), + across(matches("doe_"), function(x) x / 100)) %>% + # remove rows with missing date_finished and reorder + dplyr::filter(!is.na(date_finished)) %>% + # reorder by finishing date + arrange(desc(date_finished)) %>% + # select columns + select(child_id, response_id, time, + version, version_list, matches("date_"), + item, response, sex, + matches("doe_"), matches("edu_")) return(responses) } diff --git a/R/utils.R b/R/utils.R index 26328cc..fdd453f 100644 --- a/R/utils.R +++ b/R/utils.R @@ -99,9 +99,9 @@ get_doe <- function(...) { rowSums(across(c(...)), dims = 1, na.rm = TRUE) } -#' Fix codes +#' Fix response IDs #' -#' @param x Vector of `code` whose values should be fixed. +#' @param x Vector of `response_id` whose values should be fixed. #' #' @author Gonzalo Garcia-Castro #' @@ -118,9 +118,9 @@ fix_response_id <- function(x) { # nocov start } # nocov end -#' Fix raw codes +#' Fix raw response IDs #' -#' @param x Vector of `code` whose values should be fixed, based on `session`. +#' @param x Vector of `response_id` whose values should be fixed, based on `session`. #' #' @author Gonzalo Garcia-Castro #' @@ -128,14 +128,14 @@ fix_response_id <- function(x) { # nocov start #' @keywords internal #' fix_code_raw <- function(x) { # nocov start - x[x$session == "-OYU0wA9FPQ9-ugKUpyrz1A0usJZIuM5hb-cbV2yMgGBal5S9q3ReRgphBDDxFEY", "code"] <- "BL1674" - x[x$session == "ZZiRT3JN4AdKnXMxjEMtU3CzRkniH0hOSZzS-0kzquRt_Ls9PJzmKsY3qm8tQ7Z2", "code"] <- "BL1671" - x[x$session == "TW8vSEn7YTtbZoe9BaEtRgwNvryWTwSv49dLKb5W0_6bFL306Eiw0Ehg72Q9nqLx", "code"] <- "BL1672" - x[x$session == "DDjiYrPl-WD951rocaSKH9grkn2T4ZJKjhdCAPDzqNBWyhc8E8wwjOY0CcruNn1m", "code"] <- "BL1673" - x[x$session == "c9fPw4Lbm5WS0AbBRppv4NVHh4eETxvEitH8lUC1pkt2ulxxHCvXgAYopCGRQSa_", "code"] <- "BL1569" - x[x$session == "I8ldNrILmQd7Vhtwqk99Y9YaKWrZzlExKeXsRv9_toIEi43BtlfuLI-PmdU4iY7G", "code"] <- "BL1788" - x[x$session == "dU5CZLLkvmY7SDpe8d0jFQO3xzpmeS0lCOFF_ErjETC1tyyYbv3ZhyaDmlfdJwHc", "code"] <- "BL1876" - x[x$session == "L4F1jd13H4wyFf6QYGy8hfSURneFr-zfzMn1YFFeBTbTZWWjxYPRbC-rPY6U1qdr", "code"] <- "remove" + x[x$session == "-OYU0wA9FPQ9-ugKUpyrz1A0usJZIuM5hb-cbV2yMgGBal5S9q3ReRgphBDDxFEY", "response_id"] <- "BL1674" + x[x$session == "ZZiRT3JN4AdKnXMxjEMtU3CzRkniH0hOSZzS-0kzquRt_Ls9PJzmKsY3qm8tQ7Z2", "response_id"] <- "BL1671" + x[x$session == "TW8vSEn7YTtbZoe9BaEtRgwNvryWTwSv49dLKb5W0_6bFL306Eiw0Ehg72Q9nqLx", "response_id"] <- "BL1672" + x[x$session == "DDjiYrPl-WD951rocaSKH9grkn2T4ZJKjhdCAPDzqNBWyhc8E8wwjOY0CcruNn1m", "response_id"] <- "BL1673" + x[x$session == "c9fPw4Lbm5WS0AbBRppv4NVHh4eETxvEitH8lUC1pkt2ulxxHCvXgAYopCGRQSa_", "response_id"] <- "BL1569" + x[x$session == "I8ldNrILmQd7Vhtwqk99Y9YaKWrZzlExKeXsRv9_toIEi43BtlfuLI-PmdU4iY7G", "response_id"] <- "BL1788" + x[x$session == "dU5CZLLkvmY7SDpe8d0jFQO3xzpmeS0lCOFF_ErjETC1tyyYbv3ZhyaDmlfdJwHc", "response_id"] <- "BL1876" + x[x$session == "L4F1jd13H4wyFf6QYGy8hfSURneFr-zfzMn1YFFeBTbTZWWjxYPRbC-rPY6U1qdr", "response_id"] <- "remove" return(x) } # nocov end @@ -222,20 +222,6 @@ fix_item <- function(x) { # nocov start } # nocov end -#' Fix id_exp -#' -#' @param x Vector of `id_exp` whose values should be fixed -#' -#' @author Gonzalo Garcia-Castro -#' -#' @noRd -#' @keywords internal -#' -fix_id_exp <- function(x) { # nocov start - x$id_exp[x$code %in% "BL547"] <- "bilexicon_189" - return(x) -} # nocov end - #' Deal with repeated measures #' #' @export get_longitudinal diff --git a/R/vocabulary.R b/R/vocabulary.R index 5394429..d5849dd 100644 --- a/R/vocabulary.R +++ b/R/vocabulary.R @@ -10,7 +10,6 @@ #' @import dplyr #' @importFrom tidyr pivot_longer #' @importFrom tidyr pivot_wider -#' @importFrom janitor clean_names #' @importFrom janitor make_clean_names #' #' @param participants Participants data frame, as generated by @@ -32,7 +31,7 @@ #' comprehensive and/or vocabulary size in each language. This data frame #' contains the following variables: #' * child_id: a character string with five digits indicating a participant's identifier in the database from the [Laboratori de Recerca en Infància](https://www.upf.edu/web/cbclab) at Universitat Pompeu Fabra. This value is always the same for each participant, so that different responses from the same participant share the same `child_id`. -#' * time: a numeric value indicating how many times a given participant has been sent the questionnaire, regardless of whether they completed it or not. +#' * response_id: a character string identifying a single response to the questionnaire. This value is always unique for each response to the questionnaire, even for responses from the same participant. #' * age: a numeric value indicating the number of months elapsed since participants' birth date until they filled in the last item of their questionnaire response. #' * type: a character string indicating the vocabulary type computed: `"understands"` if option "Understands" was selected, and `"produces"` if option "Understands & Says" was selected. #' * total_count: integer indicating the number of items selected as "Understands" or "Understands and Says" in both languages. @@ -47,6 +46,7 @@ #' * te_prop: numeric value ranging from 0 to 1 (both included) indicating the proportion of of translation equivalents (aka. cross-language synonyms or doublets) in which both items were selected as "Understands" or "Understands and Says". This is a measure of the number of lexicalised concepts. #' The specific subset of columns returned by [bvq::bvq_vocabulary()] depends #' on the elements of `...` and `.scale`. +#' * contents: list containing the items marked as acquired. #' #' @author Gonzalo Garcia-Castro #' @@ -57,107 +57,90 @@ bvq_vocabulary <- function(participants = bvq_participants(), ..., .scale = "prop") { - if (missing(participants)) participants <- bvq_participants() - if (missing(responses)) responses <- bvq_responses(participants) + # collect ... into a character vector for `any_of` + dots_vctr <- as.character(match.call(expand.dots = FALSE)$`...`) - # get logs logs <- bvq_logs(participants, responses) - logs <- logs[logs$child_id %in% unique(responses$child_id), ] - # collect ... into a character vector for `any_of` - dots_vctr <- as.character(match.call(expand.dots = FALSE)$`...`) + # check arguments + possible_colnames <- list(responses, logs, bvq::pool) %>% + purrr::map(colnames) %>% + purrr::list_c() - # check if all extra ... are valid column names - possible_colnames <- unlist(lapply(list(responses, logs, bvq::pool), colnames)) check_arg_dots(dots_vctr, .cols = possible_colnames) - if (!any(.scale %in% c("count", "prop"))) { - cli_abort("Argument .scale must be 'count' and/or 'prop'") + if (!all(.scale %in% c("prop", "count"))) { + cli::cli_abort(".scale must be 'prop' and/or 'count'") } - # get main dataset - base <- responses - base$understands <- base$response > 1 - base$produces <- base$response > 2 - base <- base[, names(base) != "response"] - base <- tidyr::pivot_longer(base, - c(understands, produces), - names_to = "type", - values_to = "response") - base <- base[!is.na(base$response), ] - # join TE-level properties - cols.keep <- names(bvq::pool) %in% c("item", "te", "language", dots_vctr) - pool_tmp <- bvq::pool[, cols.keep] - base <- dplyr::inner_join(base, - pool_tmp, - multiple = "all", - by = join_by(item), - relationship = "many-to-many") + logs_tmp <- logs %>% + dplyr::filter(child_id %in% responses$child_id) %>% + select(child_id, response_id, dominance, any_of(dots_vctr)) + + pool_tmp <- select(bvq::pool, item, te, language, any_of(dots_vctr)) - # join participant-level properties - cols.keep <- names(logs) %in% c("child_id", "time", "dominance", dots_vctr) - logs_tmp <- logs[, cols.keep] - base <- dplyr::inner_join(base, logs_tmp, - multiple = "all", - by = join_by(child_id, time), - relationship = "many-to-many") + # get main dataset + base <- responses %>% + mutate(understands = response > 1, + produces = response > 2) %>% + dplyr::filter(!is.na(response)) %>% + select(-response) %>% + tidyr::pivot_longer(c(understands, produces), + names_to = "type", + values_to = "response") + + suppressMessages({ + base <- list(base, pool_tmp, logs_tmp) %>% + # join datasets + purrr::reduce(inner_join, + multiple = "all", + relationship = "many-to-many") %>% + # define TE-by-participant properties + mutate(item_dominance = if_else(language==dominance, "L1", "L2")) %>% + select(child_id, response_id, dominance, item_dominance, + type, te, item, any_of(dots_vctr), response) + }) - # define TE-by-participant properties - base$item_dominance <- ifelse(base$language==base$dominance, "L1", "L2") - cols.keep <- c("child_id", "time", "dominance", "item_dominance", "type", - "te", "item", dots_vctr, "response") - base <- base[, cols.keep] # compute total denominator - cols.distinct <- names(base) %in% c("child_id", "time", "te", dots_vctr) - base_n <- base[!duplicated(base[, cols.distinct]), , drop = FALSE] - base_n <- dplyr::count(base_n, - dplyr::pick(child_id, time, type, ...), - name = "n_total") - cols.keep <- names(base_n) %in% c("child_id", "time", dots_vctr, "n_total") - base_n <- base_n[, cols.keep] + base_n <- base %>% + distinct(pick(c(child_id, response_id, te, ...)), + .keep_all = TRUE) %>% + count(pick(c(child_id, response_id, type, ...)), + name = "n_total") %>% + select(child_id, response_id, any_of(dots_vctr), n_total) # compute TE-wise denominator - base_te <- base[base$response, ] - base_te <- dplyr::left_join(base_te, base_n, - by = join_by(child_id, time, ...)) - - base_te <- tidyr::pivot_wider(base_te, - names_from = item_dominance, - values_from = response, - values_fn = sum, - values_fill = 0, - id_cols = c(child_id, time, type, n_total, te, ...)) + base_te <- base %>% + dplyr::filter(response) %>% + left_join(base_n, + by = join_by(child_id, response_id, ...)) %>% + tidyr::pivot_wider(names_from = item_dominance, + values_from = response, + values_fn = sum, + values_fill = 0, + id_cols = c(child_id, response_id, type, n_total, te, ...)) # total vocabulary - total <- vocab_total(base, dots_vctr) - dominance <- vocab_dominance(base, dots_vctr) - concept <- vocab_concept(base_te, dots_vctr) - te <- vocab_te(base_te, dots_vctr) + total <- vocab_total(base, ...) + dominance <- vocab_dominance(base, ...) + concept <- vocab_concept(base_te, ...) + te <- vocab_te(base_te, ...) + contents <- vocab_contents(base, ...) # merge all datasets - which_col_not <- c("count", "prop")[which(!(c("count", "prop") %in% .scale))] - - vocabulary <- list(total, dominance, concept, te) - vocabulary <- reduce(vocabulary, - dplyr::left_join, - multiple = "all", - by = join_by(child_id, time, type, ...)) - cols.integer <- names(vocabulary)[grepl("concept|te", names(vocabulary))] - vocabulary[, cols.integer] <- lapply(vocabulary[, cols.integer], - function(x) { - ifelse(is.na(x), as.integer(0), x) - }) - cols.scale <- names(vocabulary)[grepl(paste(.scale, collapse = "|"), - names(vocabulary))] - cols.keep <- c("child_id", "time", "type", - dots_vctr, - cols.scale[cols.scale %in% cols.scale[grepl("prop$", cols.scale)]], - cols.scale[cols.scale %in% cols.scale[grepl("count$", cols.scale)]]) - vocabulary <- vocabulary[, cols.keep] - vocabulary <- vocabulary[, !(names(vocabulary) %in% which_col_not)] - vocabulary <- tibble::as_tibble(vocabulary) + vocabulary <- list(total, dominance, concept, te, contents) %>% + reduce(left_join, + multiple = "all", + by = join_by(child_id, response_id, type, ...)) %>% + mutate(across(starts_with("concept|te"), + function(x){ + if_else(is.na(x), as.integer(0), x) + })) %>% + select(child_id, response_id, type, ..., + matches(paste0(.scale, collapse = "|")), + contents) return(vocabulary) } @@ -194,12 +177,11 @@ check_arg_dots <- function(x, .cols) { # nocov start #' vocab_total <- function(x, ...) { # nocov start - out <- summarise(x, - total_count = sum(response, na.rm = TRUE), - n_total = n(), - .by = c(child_id, time, type, any_of(...)) - ) - out$total_prop <- ifelse(out$n_total==0, 0, out$total_count/out$n_total) + out <- x %>% + summarise(total_count = sum(response, na.rm = TRUE), + n_total = n(), + .by = c(child_id, response_id, type, ...)) %>% + mutate(total_prop = if_else(n_total==0, 0, total_count/n_total)) return(out) } # nocov end @@ -215,19 +197,17 @@ vocab_total <- function(x, ...) { # nocov start vocab_dominance <- function(x, ...) { # nocov start out <- x %>% - dplyr::summarise(count = sum(response, na.rm = TRUE), - n_total = sum(!is.na(response)), - .by = c(child_id, time, type, item_dominance, any_of(...))) %>% - dplyr::mutate(prop = ifelse(n_total == 0, 0, count / n_total)) %>% + summarise(count = sum(response, na.rm = TRUE), + n_total = sum(!is.na(response)), + .by = c(child_id, response_id, type, item_dominance, ...)) %>% + mutate(prop = ifelse(n_total == 0, 0, count / n_total)) %>% tidyr::pivot_wider(names_from = item_dominance, values_from = c(n_total, matches("count|prop")), names_glue = "{item_dominance}_{.value}", - names_repair = make_clean_names) %>% - janitor::clean_names() %>% - dplyr::select( - child_id, time, type, starts_with("l1_"), starts_with("l2_"), - -ends_with("n_total"), any_of(...) - ) + names_repair = janitor::make_clean_names) %>% + select(child_id, response_id, type, + starts_with("l1_"), starts_with("l2_"), + -ends_with("n_total"), ...) return(out) } # nocov end @@ -243,20 +223,15 @@ vocab_dominance <- function(x, ...) { # nocov start vocab_concept <- function(x, ...) { # nocov start out <- x %>% - dplyr::mutate(across(c(L1, L2), function(x) x > 0), - is_any = rowSums(pick(L2:L1)), - is_any = is_any > 0 - ) %>% - dplyr::summarise( - n = sum(is_any), - .by = c(child_id, time, type, n_total, any_of(...)) - ) %>% - dplyr::rename(concept_count = n) %>% - dplyr::mutate( - concept_count = as.integer(concept_count), - concept_prop = ifelse(n_total == 0, 0, concept_count / n_total) - ) %>% - dplyr::select(child_id, time, type, concept_count, concept_prop, any_of(...)) + mutate(across(c(L1, L2), function(x) x > 0), + is_any = rowSums(pick(L2:L1)), + is_any = is_any > 0) %>% + summarise( n = sum(is_any), + .by = c(child_id, response_id, type, n_total, ...)) %>% + rename(concept_count = n) %>% + mutate(concept_count = as.integer(concept_count), + concept_prop = if_else(n_total==0, 0, concept_count / n_total)) %>% + select(child_id, response_id, type, concept_count, concept_prop, ...) return(out) } # nocov end @@ -272,20 +247,33 @@ vocab_concept <- function(x, ...) { # nocov start vocab_te <- function(x, ...) { # nocov start out <- x %>% - dplyr::mutate(across(c(L1, L2), function(x) x > 0), - is_both = rowSums(dplyr::pick(c(L2, L1))), - is_both = is_both > 1 - ) %>% - dplyr::summarise( - n = sum(is_both), - .by = c(child_id, time, type, n_total, any_of(...)) - ) %>% - dplyr::rename(te_count = n) %>% - dplyr::mutate( - te_count = as.integer(te_count), - te_prop = ifelse(n_total == 0, 0, te_count / n_total) - ) %>% - dplyr::select(child_id, time, type, te_count, te_prop, any_of(...)) + mutate(across(c(L1, L2), function(x) x > 0), + is_both = rowSums(pick(c(L2, L1))), + is_both = is_both > 1) %>% + summarise(n = sum(is_both), + .by = c(child_id, response_id, type, n_total, ...)) %>% + rename(te_count = n) %>% + mutate(te_count = as.integer(te_count), + te_prop = if_else(n_total==0, 0, te_count / n_total)) + + return(out) +} # nocov end + + +#' Gather vocabulary contents +#' +#' @inheritParams bvq_vocabulary +#' @inheritParams vocab_total +#' +#' @noRd +#' @keywords internal +#' +vocab_contents <- function(x, ...) { # nocov start + + out <- x %>% + dplyr::filter(response) %>% + summarise(contents = list(item[response]), + .by = c(child_id, response_id, type, ...)) return(out) } # nocov end diff --git a/inst/figures/logo.png b/inst/figures/logo.png index e017449..f4211eb 100644 Binary files a/inst/figures/logo.png and b/inst/figures/logo.png differ diff --git a/inst/fixtures/vocabulary.rds b/inst/fixtures/vocabulary.rds index 57bf65d..9638ef1 100644 Binary files a/inst/fixtures/vocabulary.rds and b/inst/fixtures/vocabulary.rds differ diff --git a/man/bvq-package.Rd b/man/bvq-package.Rd index 62690f9..bb4e755 100644 --- a/man/bvq-package.Rd +++ b/man/bvq-package.Rd @@ -6,6 +6,8 @@ \alias{bvq-package} \title{bvq: Barcelona Vocabulary Questionnaire Database and Helper Functions} \description{ +\if{html}{\figure{logo.png}{options: style='float: right' alt='logo' width='120'}} + Download, clean, and process the Barcelona Vocabulary Questionnaire (BVQ) data. BVQ is a vocabulary inventory developed for assesing the vocabulary of Catalan-Spanish bilinguals infants from the Metropolitan Area of Barcelona (Spain). This package includes functions to download the data from formr servers, and return the processed data in multiple formats. } \seealso{ diff --git a/man/bvq_norms.Rd b/man/bvq_norms.Rd index 7040129..ac31e1a 100644 --- a/man/bvq_norms.Rd +++ b/man/bvq_norms.Rd @@ -10,7 +10,7 @@ bvq_norms( ..., te = NULL, item = NULL, - age = c(0, 100) + age = c(0, Inf) ) } \arguments{ @@ -19,7 +19,7 @@ bvq_norms( \item{responses}{Responses data frame, as generated by \code{\link[=bvq_responses]{bvq_responses()}}.} -\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}>. Unquote name of the variable(s) to group +\item{...}{<\code{\link[rlang:dyn-dots]{dynamic-dots}}>. Unquoted name of the variable(s) to group data into. Norms will be calculated by aggregating responses within the groups that result from the combination of crossing of the variables provided in \code{...}. These variables can refer to item properties diff --git a/man/bvq_responses.Rd b/man/bvq_responses.Rd index 62ede61..0d3209f 100644 --- a/man/bvq_responses.Rd +++ b/man/bvq_responses.Rd @@ -24,15 +24,15 @@ items a participant was assigned to. Only applies in the case of short versions of BVQ, such as bvq-short, bvq-long, bvq-lockdown, or bvq-1.0.0, where the list of items was partitioned into several versions.#' * item: character string indicating the item identifier (e.g., \code{spa_mesa}). This value is unique for each item. Responses to the same item from different participants are linked by the same \code{item} value. \item response: integer indicating the participant's response to a give item: \code{1} if \code{"No"} (the participant does not understand or produce the word), \code{2} if "Understands" (the participants understands the word), or \code{3} if "Understands and Says" (the participant understands and produces the item). -\item date_birth: a date value (see lubridate package) in \code{yyyy/mm/dd} format indicating participants birth date. -\item date_started: a date value (see lubridate package) in \code{yyyy/mm/dd} format indicating when participants logged to the questionnaire for the first time. -\item date_finished: a date value (see lubridate package) in \code{yyyy/mm/dd} format indicating when participants logged to the questionnaire for the last time. +\item date_birth: \link[lubridate:date_utils]{lubridate::Date} indicating participants birth date. +\item date_started: \link[lubridate:date_utils]{lubridate::Date} indicating when participants logged to the questionnaire for the first time. +\item date_finished: \link[lubridate:date_utils]{lubridate::Date} indicating when participants logged to the questionnaire for the last time. \item sex: a character string indicating participants' biological sex, as reported by the parents. -\item edu_parent1: a character string indicating the educational attainment of one of the parents/caretakers. -\item edu_parent2: a character string indicating the educational attainment of the other parent/caretaker, if any. \item doe_spanish: a numeric value ranging from 0 to 1 indicating participants' daily exposure to Spanish, as estimated by parents/caregivers This value aggregates participants' exposure to any variant of Spanish (e.g., European and American Spanish). \item doe_catalan: a numeric value ranging from 0 to 1 indicating participants' daily exposure to Catalan, as estimated by parents/caregivers This value aggregates participants' exposure to any variant of Catalan (e.g., Catalan from Majorca or Barcelona). \item doe_others: a numeric value ranging from 0 to 1 indicating participants' daily exposure to languages other than Spanish or Catalan, as estimated by parents/caretakers, aggregating participants' exposure to all those other languages (e.g., Norwegian, Arab, Swahili). +\item edu_parent1: a character string indicating the educational attainment of one of the parents/caretakers. +\item edu_parent2: a character string indicating the educational attainment of the other parent/caretaker, if any. } } \description{ diff --git a/man/bvq_vocabulary.Rd b/man/bvq_vocabulary.Rd index bd7d808..1b35769 100644 --- a/man/bvq_vocabulary.Rd +++ b/man/bvq_vocabulary.Rd @@ -36,7 +36,7 @@ comprehensive and/or vocabulary size in each language. This data frame contains the following variables: \itemize{ \item child_id: a character string with five digits indicating a participant's identifier in the database from the \href{https://www.upf.edu/web/cbclab}{Laboratori de Recerca en Infància} at Universitat Pompeu Fabra. This value is always the same for each participant, so that different responses from the same participant share the same \code{child_id}. -\item time: a numeric value indicating how many times a given participant has been sent the questionnaire, regardless of whether they completed it or not. +\item response_id: a character string identifying a single response to the questionnaire. This value is always unique for each response to the questionnaire, even for responses from the same participant. \item age: a numeric value indicating the number of months elapsed since participants' birth date until they filled in the last item of their questionnaire response. \item type: a character string indicating the vocabulary type computed: \code{"understands"} if option "Understands" was selected, and \code{"produces"} if option "Understands & Says" was selected. \item total_count: integer indicating the number of items selected as "Understands" or "Understands and Says" in both languages. @@ -51,6 +51,7 @@ contains the following variables: \item te_prop: numeric value ranging from 0 to 1 (both included) indicating the proportion of of translation equivalents (aka. cross-language synonyms or doublets) in which both items were selected as "Understands" or "Understands and Says". This is a measure of the number of lexicalised concepts. The specific subset of columns returned by \code{\link[=bvq_vocabulary]{bvq_vocabulary()}} depends on the elements of \code{...} and \code{.scale}. +\item contents: list containing the items marked as acquired. } } \description{ diff --git a/man/figures/logo.png b/man/figures/logo.png index 24f27e9..2bc87d4 100644 Binary files a/man/figures/logo.png and b/man/figures/logo.png differ diff --git a/man/logo.png b/man/logo.png deleted file mode 100644 index 582d7f9..0000000 Binary files a/man/logo.png and /dev/null differ diff --git a/tests/testthat/test-norms.R b/tests/testthat/test-norms.R index ef1a5a0..c8e4780 100644 --- a/tests/testthat/test-norms.R +++ b/tests/testthat/test-norms.R @@ -181,3 +181,4 @@ test_that("test that the ... argument works", { age = c(10, 12)) expect_in(c("lp", "semantic_category"), colnames(norms)) }) + diff --git a/tests/testthat/test-vocabulary.R b/tests/testthat/test-vocabulary.R index 24ca904..4c2db4c 100644 --- a/tests/testthat/test-vocabulary.R +++ b/tests/testthat/test-vocabulary.R @@ -4,60 +4,61 @@ responses <- readRDS(system.file("fixtures/responses.rds", participants <- readRDS(system.file("fixtures/participants.rds", package = "bvq")) -test_that("vocabulary proportions are plausible", { - vocabulary <- bvq_vocabulary(participants, - responses, - .scale = c("prop", "count")) +vocabulary <- bvq_vocabulary(participants, + responses, + .scale = c("prop", "count")) + +test_that("bvq_vocabulary throws errors when appropriate", { + + expect_error(bvq_vocabulary(participants, responses, .scale = "XXX")) + expect_error(bvq_vocabulary(participants, responses, lp, XXXX)) + +}) + +test_that("vocabulary scores are plausible", { + + expect_lte(max(vocabulary$total_prop, na.rm = TRUE), 1) + expect_gte(max(vocabulary$total_prop, na.rm = TRUE), 0) + expect_gte(max(vocabulary$total_count, na.rm = TRUE), 0) + + expect_lte(max(vocabulary$l1_prop, na.rm = TRUE), 1) + expect_gte(max(vocabulary$l1_prop, na.rm = TRUE), 0) + expect_gte(max(vocabulary$l1_count, na.rm = TRUE), 0) - n_total <- studies %>% - distinct(version, language, .keep_all = TRUE) %>% - summarise(n_total = sum(n), .by = version) + expect_lte(max(vocabulary$l2_prop, na.rm = TRUE), 1) + expect_gte(max(vocabulary$l2_prop, na.rm = TRUE), 0) + expect_gte(max(vocabulary$l2_count, na.rm = TRUE), 0) - vocabulary <- vocabulary %>% - left_join( - select(participants, child_id, time, version, version_list), - by = join_by(child_id, time), - multiple = "all" - ) %>% - filter(!is.na(version),!is.na(version_list)) %>% - mutate(version = case_when( - grepl("cbc", child_id) ~ "CBC", - grepl("devlex", child_id) ~ "DevLex", - .default = paste(version, version_list, sep = "-") - )) %>% - left_join(n_total, - by = join_by(version), - multiple = "all") + expect_lte(max(vocabulary$concept_prop, na.rm = TRUE), 1) + expect_gte(max(vocabulary$concept_prop, na.rm = TRUE), 0) + expect_gte(max(vocabulary$concept_count, na.rm = TRUE), 0) + + expect_lte(max(vocabulary$te_prop, na.rm = TRUE), 1) + expect_gte(max(vocabulary$te_prop, na.rm = TRUE), 0) + expect_gte(max(vocabulary$te_count, na.rm = TRUE), 0) - expect_true(all(between(vocabulary$total_prop, 0, 1))) - expect_true(all(between(vocabulary$l1_prop[!is.na(vocabulary$l1_prop)], 0, 1))) - expect_true(all(between(vocabulary$l2_prop[!is.na(vocabulary$l2_prop)], 0, 1))) - expect_true(all(between(vocabulary$concept_prop, 0, 1))) - expect_true(all(between(vocabulary$te_prop, 0, 1))) - expect_false(any(vocabulary$total_count[!is.na(vocabulary$total_count)] < 0)) - expect_false(any(vocabulary$l1_count[!is.na(vocabulary$l1_count)] < 0)) - expect_false(any(vocabulary$l2_count[!is.na(vocabulary$l2_count)] < 0)) - expect_false(any(vocabulary$concept_count[!is.na(vocabulary$concept_count)] < 0)) - expect_false(any(vocabulary$te_count[!is.na(vocabulary$te_count)] < 0)) }) test_that("column classes are the right ones", { - vocabulary <- - bvq_vocabulary(participants, responses, .scale = c("prop", "count")) - expect_true(all(class(vocabulary$child_id) == "character")) - expect_true(all(class(vocabulary$time) == "numeric")) - expect_true(all(class(vocabulary$type) == "character")) - expect_true(all(class(vocabulary$total_count) == "integer")) - expect_true(all(class(vocabulary$l1_count) == "integer")) - expect_true(all(class(vocabulary$l2_count) == "integer")) - expect_true(all(class(vocabulary$concept_count) == "integer")) - expect_true(all(class(vocabulary$te_count) == "integer")) - expect_true(all(class(vocabulary$total_prop) == "numeric")) - expect_true(all(class(vocabulary$l1_prop) == "numeric")) - expect_true(all(class(vocabulary$l2_prop) == "numeric")) - expect_true(all(class(vocabulary$concept_prop) == "numeric")) - expect_true(all(class(vocabulary$te_prop) == "numeric")) + expect_true(all(class(vocabulary$child_id)=="character")) + expect_true(all(class(vocabulary$response_id)=="character")) + expect_true(all(class(vocabulary$type)=="character")) + + expect_true(all(class(vocabulary$total_count)=="integer")) + expect_true(all(class(vocabulary$l1_count)=="integer")) + expect_true(all(class(vocabulary$l2_count)=="integer")) + expect_true(all(class(vocabulary$concept_count)=="integer")) + expect_true(all(class(vocabulary$te_count)=="integer")) + + expect_true(all(class(vocabulary$total_prop)=="numeric")) + expect_true(all(class(vocabulary$l1_prop)=="numeric")) + expect_true(all(class(vocabulary$l2_prop)=="numeric")) + expect_true(all(class(vocabulary$concept_prop)=="numeric")) + expect_true(all(class(vocabulary$te_prop)=="numeric")) + + expect_true(all(class(vocabulary$contents)=="list")) + }) test_that("the ... argument works", { @@ -67,5 +68,5 @@ test_that("the ... argument works", { semantic_category) expect_in(c("lp", "semantic_category"), colnames(vocabulary)) - expect_error(bvq_vocabulary(participants, responses, lp, XXXX)) + }) diff --git a/vignettes/bvq-vocabulary.Rmd b/vignettes/bvq-vocabulary.Rmd index 81b3c9c..480bd48 100644 --- a/vignettes/bvq-vocabulary.Rmd +++ b/vignettes/bvq-vocabulary.Rmd @@ -72,6 +72,16 @@ Finally, two types of vocabulary sizes are computed: These two measures are returned in the long format under the `type` column. +## Vocabulary contents + +In additional to the vocabulary size scores, `bvq_vocabulary()` also returns the column `contents`. This column is a list containing the items marked as acquired for comprehension or production. For instance: + +```{r contents, echo=FALSE, message=FALSE, warning=FALSE, paged.print=FALSE} +library(dplyr) + +contents <- bvq_vocabulary(participants, responses) %>% + select(child_id, response_id, contents) +``` ## Conditional vocabulary size: the `...` extra arguments