Skip to content

Commit

Permalink
Merge pull request #29 from gongcastro/v1.0.0
Browse files Browse the repository at this point in the history
v1.0.0
  • Loading branch information
gongcastro authored Oct 6, 2023
2 parents b640424 + 49a2a1f commit 0228e37
Show file tree
Hide file tree
Showing 106 changed files with 465 additions and 356 deletions.
38 changes: 38 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,41 @@
# bvq 1.0.0

## formr implementation

* New formr run named BVQ (`bvq-1.0.0`).
* Remove items about COVID-19 lockdown
* Fix the following item names (according to utils function `fix_item()`):
- `cat_parc` -> `cat_parc1`
- `cat_voler` -> `cat_voler1`
- `cat_voler3` -> `cat_voler2`
- `cat_despres1` -> `cat_despres`
- `cat_peix` -> `cat_peix1`
- `cat_estar` -> `cat_estar1`
- `cat_anar` -> `cat_anar1`
- `spa_querer` -> `spa_querer1`
- `spa_ir` -> `spa_ir1`
* Importing function incorporate this run to the workflow

## New ID scheme:

* `id` is now `child_id`
* `code` is now `response_id`
* Remove any other participant-level identifier (the crossing between `child_id` and `response` is already unambiguous).

## New versioning system

* Subsequent versions of `bvq` will be named using the numeric `0.0.0` format
* Previous version names have been changed in the package too:
- `BL-Short` -> `bvq-short`
- `BL-Long` -> `bvq-long`
- `BL-Lockdown` -> `bvq-lockdown`
* `collect_survey()` now retrieves survey names from new util function `get_bvq_runs()`

## Other changes

* Fixtures have been made smaller in size
* formr surveys are now stored in `inst/formr` for reproducibility

# bvq 0.4.0

* Switch most functions to base R. I made this decision in order to learn R base a bit better. Performance differences are negligible, and the user interface remains the same.
Expand Down
5 changes: 4 additions & 1 deletion R/globals.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ utils::globalVariables(unique(c(
"complete_items",
"date_birth",
"date_sent",
"id_db",
"child_id",
"language",
"progress",
"response",
Expand All @@ -18,7 +18,9 @@ utils::globalVariables(unique(c(
"date_test",
"link",
# bvq_responses:
"response_id",
"code",
"id",
"date_birth",
"item",
"time_stamp",
Expand All @@ -27,6 +29,7 @@ utils::globalVariables(unique(c(
"verbose",
# bvq_norms:
"pool_tmp",
"version_list",
"dominance",
"understands",
"produces",
Expand Down
257 changes: 140 additions & 117 deletions R/import.R
Original file line number Diff line number Diff line change
@@ -1,3 +1,62 @@
#' Collect survey data
#'
#' @importFrom cli cli_alert_success
#'
#' @param version Character string indicating the name fo the formr run (must be one of "lockdown", "long", or "short")
#' @param participants Participants dataset, as returned by [bvq::bvq_participants()]
#' @param ... Unused.
#'
#' @author Gonzalo Garcia-Castro
#'
#' @noRd
#' @keywords internal
#'
#' @md
collect_survey <- function(version, participants, ...) {

# validate version name
survey_options <- names(get_bvq_runs())
if (!(version %in% survey_options)) {
cli_abort("survey must be one of {survey_options}")
}

# process participant info
if (missing(participants)) {
participants_tmp <- bvq_participants()
} else {
participants_tmp <- participants
}

# correct version names
version_names <- attr(get_bvq_runs(), "versions")
participants_tmp$version <- paste0("bvq-", participants_tmp$version)

# get relevant participants
participants_tmp <- participants_tmp[participants_tmp$version %in% version, ]
if (version=="bvq-long") {
participants_tmp <- participants_tmp[participants_tmp$randomisation=="2", ]
}
participants_tmp <- participants_tmp[, colnames(participants_tmp)!="version"]

# download and process survey data
raw <- download_surveys(get_bvq_runs()[[version]]) # fetch responses

# fix Spanish dataframe colnames
colnames(raw[[7]]) <- gsub("cat_", "spa_", colnames(raw[[7]]))
raw[[1]] <- fix_logs_df(raw, participants_tmp) # fix logs data frame

# process responses
processed <- process_survey(raw, participants_tmp, version)

if (interactive()) {
n_responses <- nrow(distinct(processed, response_id))
msg <- "{version} updated: {n_responses} response{?s} retrieved"
cli_alert_success(msg)
}

return(processed)
}

#' Download formr surveys
#'
#' @importFrom cli cli_progress_done
Expand Down Expand Up @@ -42,7 +101,7 @@ download_surveys <- function(surveys, ...) {
#'
#' @param raw Raw survey data, as generated by [bvq::download_surveys()]
#' @param participants_tmp Participants dataset, inherited from inside the `import_*()` function environment.
#' @param survey_name character string indicating the name of the survey being processed (must be `"BL-Lockdown"`, `"BL-Short"`, or `"BL-Long-2"`).
#' @param version character string indicating the name of the BVQ version being processed (must be `"BL-Lockdown"`, `"BL-Short"`, or `"BL-Long-2"`).
#'
#' @importFrom dplyr if_any
#'
Expand All @@ -52,145 +111,106 @@ download_surveys <- function(surveys, ...) {
#' @keywords internal
#'
#' @md
process_survey <- function(raw, participants_tmp, survey_name) {
words_cat <- select(raw[[6]], session, created_cat = created, ended_cat = ended)
words_spa <- select(raw[[7]], session, created_spa = created, ended_spa = ended)
process_survey <- function(raw, participants_tmp, version) {

raw_tmp <- raw %>%
lapply(select, -one_of(c("created", "modified", "ended", "expired"))) %>%
# if a single session ID has multiple entries, select most recent
reduce(inner_join, by = join_by(session), multiple = "all") %>%
mutate(
code = fix_code(code),
survey_name = .env$survey_name,
version = ifelse(survey_name=="long",
survey_name,
paste(survey_name,
trimws(tolower(version), whitespace = "[\\h\\v]"),
sep = "-"
)
)
) %>%
left_join(participants_tmp, by = join_by(code))

items_to_keep <- c(
"time", "code", "study", "version", "randomisation",
"date_birth", "date_started", "date_finished", "sex",
"edu_parent1", "edu_parent2", "language_doe_catalan",
"language_doe_spanish", "language_doe_others"
)
# subset columns
words_cat <- select(raw[[6]], session,
created_cat = created, ended_cat = ended)
words_spa <- select(raw[[7]], session,
created_spa = created, ended_spa = ended)

raw_tmp <- merge_surveys(raw, participants_tmp, version)

items_to_keep <- c("child_id", "response_id", "time", "version", "version_list",
"date_birth", "date_started", "date_finished", "sex",
"edu_parent1", "edu_parent2", "language_doe_catalan",
"language_doe_spanish", "language_doe_others")

# process data
processed <- raw_tmp %>%
select(-matches("lockdown")) %>%
left_join(words_cat, by = join_by(session), multiple = "all") %>%
left_join(words_spa, by = join_by(session), multiple = "all") %>%
filter(code %in% participants_tmp$code,
!if_any(matches("created_|ended_"), is.na)) %>%
mutate(
across(c(matches("created_|ended_"), date_birth), as.POSIXct),
across(starts_with("language_doe"), function(x) ifelse(is.na(x), 0, x)),
survey_name = .env$survey_name,
date_started = get_time_stamp(ended_cat, ended_spa, which = "first"),
date_finished = get_time_stamp(ended_cat, ended_spa, which = "last"),
language_doe_catalan = get_doe(matches("catalan")),
language_doe_spanish = get_doe(matches("spanish")),
language_doe_others = 100 - rowSums(across(c(
language_doe_catalan,
language_doe_spanish
)))
) %>%
dplyr::filter(response_id %in% participants_tmp$response_id,
!if_any(matches("created_|ended_"), is.na)) %>%
mutate(across(c(matches("created_|ended_"), date_birth),
as_datetime),
across(starts_with("language_doe"),
function(x) ifelse(is.na(x), 0, x)),
version = .env$version,
date_started = get_time_stamp(ended_cat,
ended_spa,
which = "first"),
date_finished = get_time_stamp(ended_cat,
ended_spa,
which = "last"),
language_doe_catalan = get_doe(matches("catalan")),
language_doe_spanish = get_doe(matches("spanish")),
language_doe_others = 100 - rowSums(across(c(
language_doe_catalan,
language_doe_spanish
)))) %>%
arrange(desc(date_finished)) %>%
distinct(session, .keep_all = TRUE) %>%
rename(
edu_parent1 = demo_parent1,
edu_parent2 = demo_parent2
) %>%
select(
starts_with("id"),
one_of(items_to_keep),
starts_with("cat_"),
starts_with("spa_")
) %>%
pivot_longer(
cols = matches("cat_|spa_"),
names_to = "item",
values_to = "response"
) %>%
rename(edu_parent1 = demo_parent1,
edu_parent2 = demo_parent2) %>%
select(child_id,
one_of(items_to_keep),
starts_with("cat_"),
starts_with("spa_")) %>%
pivot_longer(cols = matches("cat_|spa_"),
names_to = "item",
values_to = "response") %>%
rename_with(function(x) gsub("language_", "", x), everything()) %>%
mutate(
language = ifelse(grepl("cat_", item), "Catalan", "Spanish"),
sex = ifelse(sex == 1, "Male", "Female"),
across(
starts_with("edu_"),
function(x) na_if(x, "")
)
) %>%
arrange(desc(date_finished)) %>%
distinct(id, code, item, .keep_all = TRUE)
distinct(child_id, response_id, time, item, .keep_all = TRUE) %>%
mutate(language = ifelse(grepl("cat_", item), "Catalan", "Spanish"),
response_id = fix_response_id(response_id),
sex = ifelse(sex == 1, "Male", "Female"),
across(starts_with("edu_"),
function(x) na_if(x, "")),
date_birth = lubridate::as_date(date_birth),
across(c(child_id, response_id, version, version_list,
sex, edu_parent1, edu_parent2, item, language),
as.character)) %>%
relocate(child_id, response_id, time, version, version_list, item, response, language,
starts_with("date_"), sex, starts_with("doe_")) %>%
filter(!is.na(response))

return(processed)
}

#' Collect survey data
#'
#' @importFrom cli cli_alert_success
#' Merge surveys from the same formr run
#'
#' @param version Character string indicating the name fo the formr run (must be one of "lockdown", "long", or "short")
#' @param participants Participants dataset, as returned by [bvq::bvq_participants()]
#' @param ... Unused.
#' @param raw Raw survey data, as generated by [bvq::download_surveys()]
#' @param participants_tmp Participants dataset, inherited from inside the `import_*()` function environment.
#' @param version character string indicating the name of the BVQ version being processed (must be `"BL-Lockdown"`, `"BL-Short"`, or `"BL-Long-2"`).
#'
#' @import dplyr
#'
#' @author Gonzalo Garcia-Castro
#'
#' @noRd
#' @keywords internal
#'
#' @md
collect_survey <- function(version, participants, ...) {

# validate version name
survey_options <- c("long", "short", "lockdown")
if (!(version %in% survey_options)) {
cli_abort("survey must be one of {survey_options}")
}

# get survey names
survey_names <- c("_log", "_welcome", "_consent", "_demo", "_language",
"_words_catalan", "_words_spanish")

# get survey names
surveys <- paste0("bilexicon_", version, "_0", c(1:6, 6), survey_names)
if (version=="long") {
surveys <- gsub("_long", "", surveys)
surveys <- gsub("catalan", "cat", surveys)
surveys <- gsub("spanish", "spa", surveys)
}

# process participant info
if (missing(participants)) participants <- bvq_participants()
participants_tmp <- participants[participants$version %in% version, ]
if (version=="long") {
participants_tmp <- participants_tmp[participants_tmp$randomisation=="2", ]
}
participants_tmp <- participants_tmp[, colnames(participants_tmp)!="version"]

# download and process survey data
raw <- download_surveys(surveys) # fetch responses
# fix Spanish dataframe colnames
colnames(raw[[7]]) <- gsub("cat_", "spa_", colnames(raw[[7]]))
raw[[1]] <- fix_logs_df(raw, participants_tmp) # fix logs dataframe
processed <- process_survey(raw, participants_tmp, version)
merge_surveys <- function(raw, participants_tmp, version) {

if (interactive()) {
n_responses <- nrow(distinct(processed, code))
msg <- "{version} updated: {n_responses} response{?s} retrieved"
cli_alert_success(msg)
}
# merge data frames
raw_tmp <- raw %>%
lapply(select, -one_of(c("created", "modified", "ended", "expired"))) %>%
# if a single session ID has multiple entries, select most recent
reduce(inner_join, by = join_by(session), multiple = "all") %>%
mutate(version_list = version,
version = .env$version,
version_list = trimws(version_list, whitespace = "[\\h\\v]")) %>%
left_join(participants_tmp, by = join_by(response_id))

return(processed)
return(raw_tmp)
}



#' Fix logs dataframe
#'
#' @param raw Named list with the contents of the surveys, as returned by [bvq::download_surveys()].
Expand All @@ -205,20 +225,23 @@ fix_logs_df <- function(raw, participants_tmp) {

# fix logs dataframe
logs <- raw[[1]]
names(logs)[names(logs) == "bl_code"] <- "code"
names(logs)[names(logs) %in% c("code", "bl_code")] <- "response_id"
# variables to correct types
logs[c("created", "ended")] <- lapply(logs[c("created", "ended")], as.POSIXct)
# remove if missing any critical variable
logs$code <- fix_code(ifelse(logs$code=="", NA_character_, logs$code))
logs <- logs[!is.na(logs$code) & !is.na(logs$ended) & !is.na(logs$session), ]
logs$code <- fix_response_id(ifelse(logs$response_id=="",
NA_character_,
logs$response_id))
logs <- logs[!is.na(logs$response_id) & !is.na(logs$ended) & !is.na(logs$session), ]
# fix codes known to be wrong
logs <- fix_code_raw(logs)
# remove codes not included in participants
logs <- logs[logs$code %in% participants_tmp$code, ]
logs <- logs[logs$response_id %in% participants_tmp$response_id, ]
# get only last response of each code
logs <- logs[order(logs$created, decreasing = TRUE), , drop = FALSE]
logs <- logs[!duplicated(logs$code), , drop = FALSE]
logs <- logs[!duplicated(logs$response_id), , drop = FALSE]

return(logs)

}

Loading

0 comments on commit 0228e37

Please sign in to comment.