Merge pull request #29 from gongcastro/v1.0.0

v1.0.0
gongcastro · Oct 6, 2023 · 0228e37 · 0228e37
2 parents b640424 + 49a2a1f
commit 0228e37
Show file tree

Hide file tree

Showing 106 changed files with 465 additions and 356 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,41 @@
+# bvq 1.0.0
+
+## formr implementation
+
+* New formr run named BVQ (`bvq-1.0.0`).
+* Remove items about COVID-19 lockdown
+* Fix the following item names (according to utils function `fix_item()`):
+    - `cat_parc` -> `cat_parc1`
+    - `cat_voler` -> `cat_voler1`
+    - `cat_voler3` -> `cat_voler2`
+    - `cat_despres1` -> `cat_despres`
+    - `cat_peix` -> `cat_peix1`
+    - `cat_estar` -> `cat_estar1`
+    - `cat_anar` -> `cat_anar1`
+    - `spa_querer` -> `spa_querer1`
+    - `spa_ir` -> `spa_ir1`
+* Importing function incorporate this run to the workflow
+
+## New ID scheme:
+
+* `id` is now `child_id`
+* `code` is now `response_id`
+* Remove any other participant-level identifier (the crossing between `child_id` and `response` is already unambiguous).
+
+## New versioning system
+
+* Subsequent versions of `bvq` will be named using the numeric `0.0.0` format
+* Previous version names have been changed in the package too:
+    - `BL-Short` -> `bvq-short`
+    - `BL-Long` -> `bvq-long`
+    - `BL-Lockdown` -> `bvq-lockdown`
+* `collect_survey()` now retrieves survey names from new util function `get_bvq_runs()` 
+
+## Other changes
+
+* Fixtures have been made smaller in size
+* formr surveys are now stored in `inst/formr` for reproducibility
+
 # bvq 0.4.0
 
 * Switch most functions to base R. I made this decision in order to learn R base a bit better. Performance differences are negligible, and the user interface remains the same.

diff --git a/R/globals.R b/R/globals.R
@@ -5,7 +5,7 @@ utils::globalVariables(unique(c(
   "complete_items",
   "date_birth",
   "date_sent",
-  "id_db",
+  "child_id",
   "language",
   "progress",
   "response",
@@ -18,7 +18,9 @@ utils::globalVariables(unique(c(
   "date_test",
   "link",
   # bvq_responses:
+  "response_id",
   "code",
+  "id",
   "date_birth",
   "item",
   "time_stamp",
@@ -27,6 +29,7 @@ utils::globalVariables(unique(c(
   "verbose",
   # bvq_norms:
   "pool_tmp",
+  "version_list",
   "dominance",
   "understands",
   "produces",

diff --git a/R/import.R b/R/import.R
@@ -1,3 +1,62 @@
+#' Collect survey data
+#'
+#' @importFrom cli cli_alert_success
+#' 
+#' @param version Character string indicating the name fo the formr run (must be one of "lockdown", "long", or "short")
+#' @param participants Participants dataset, as returned by [bvq::bvq_participants()]
+#' @param ... Unused.
+#' 
+#' @author Gonzalo Garcia-Castro
+#' 
+#' @noRd
+#' @keywords internal
+#' 
+#' @md
+collect_survey <- function(version, participants, ...) {
+
+    # validate version name
+    survey_options <- names(get_bvq_runs())
+    if (!(version %in% survey_options)) {
+        cli_abort("survey must be one of {survey_options}")
+    }
+
+    # process participant info
+    if (missing(participants)) {
+        participants_tmp <- bvq_participants()
+    } else {
+        participants_tmp <- participants
+    }
+
+    # correct version names
+    version_names <- attr(get_bvq_runs(), "versions")
+    participants_tmp$version <- paste0("bvq-", participants_tmp$version)
+
+    # get relevant participants
+    participants_tmp <- participants_tmp[participants_tmp$version %in% version, ]
+    if (version=="bvq-long") {
+        participants_tmp <- participants_tmp[participants_tmp$randomisation=="2", ]
+    }
+    participants_tmp <- participants_tmp[, colnames(participants_tmp)!="version"]
+
+    # download and process survey data
+    raw <- download_surveys(get_bvq_runs()[[version]]) # fetch responses
+
+    # fix Spanish dataframe colnames
+    colnames(raw[[7]]) <- gsub("cat_", "spa_", colnames(raw[[7]])) 
+    raw[[1]] <- fix_logs_df(raw, participants_tmp) # fix logs data frame
+
+    # process responses
+    processed <- process_survey(raw, participants_tmp, version)
+
+    if (interactive()) {
+        n_responses <- nrow(distinct(processed, response_id))
+        msg <- "{version} updated: {n_responses} response{?s} retrieved"
+        cli_alert_success(msg)
+    }
+
+    return(processed)
+}
+
 #' Download formr surveys
 #'
 #' @importFrom cli cli_progress_done
@@ -42,7 +101,7 @@ download_surveys <- function(surveys, ...) {
 #'
 #' @param raw Raw survey data, as generated by [bvq::download_surveys()]
 #' @param participants_tmp Participants dataset, inherited from inside the `import_*()` function environment.
-#' @param survey_name character string indicating the name of the survey being processed (must be `"BL-Lockdown"`, `"BL-Short"`, or `"BL-Long-2"`).
+#' @param version character string indicating the name of the BVQ version being processed (must be `"BL-Lockdown"`, `"BL-Short"`, or `"BL-Long-2"`).
 #' 
 #' @importFrom dplyr if_any
 #'
@@ -52,145 +111,106 @@ download_surveys <- function(surveys, ...) {
 #' @keywords internal
 #' 
 #' @md
-process_survey <- function(raw, participants_tmp, survey_name) {
-    words_cat <- select(raw[[6]], session, created_cat = created, ended_cat = ended)
-    words_spa <- select(raw[[7]], session, created_spa = created, ended_spa = ended)
+process_survey <- function(raw, participants_tmp, version) {
 
-    raw_tmp <- raw %>%
-        lapply(select, -one_of(c("created", "modified", "ended", "expired"))) %>%
-        # if a single session ID has multiple entries, select most recent
-        reduce(inner_join, by = join_by(session), multiple = "all") %>%
-        mutate(
-            code = fix_code(code),
-            survey_name = .env$survey_name,
-            version = ifelse(survey_name=="long",
-                             survey_name,
-                             paste(survey_name,
-                                   trimws(tolower(version), whitespace = "[\\h\\v]"),
-                                   sep = "-"
-                             )
-            )
-        ) %>%
-        left_join(participants_tmp, by = join_by(code))
-
-    items_to_keep <- c(
-        "time", "code", "study", "version", "randomisation",
-        "date_birth", "date_started", "date_finished", "sex",
-        "edu_parent1", "edu_parent2", "language_doe_catalan",
-        "language_doe_spanish", "language_doe_others"
-    )
+    # subset columns
+    words_cat <- select(raw[[6]], session,
+                        created_cat = created, ended_cat = ended)
+    words_spa <- select(raw[[7]], session,
+                        created_spa = created, ended_spa = ended)
+
+    raw_tmp <- merge_surveys(raw, participants_tmp, version)
+
+    items_to_keep <- c("child_id", "response_id", "time", "version", "version_list",
+                       "date_birth", "date_started", "date_finished", "sex",
+                       "edu_parent1", "edu_parent2", "language_doe_catalan",
+                       "language_doe_spanish", "language_doe_others")
 
     # process data
     processed <- raw_tmp %>%
         select(-matches("lockdown")) %>%
         left_join(words_cat, by = join_by(session), multiple = "all") %>%
         left_join(words_spa, by = join_by(session), multiple = "all") %>%
-        filter(code %in% participants_tmp$code,
-               !if_any(matches("created_|ended_"), is.na)) %>% 
-        mutate(
-            across(c(matches("created_|ended_"), date_birth), as.POSIXct),
-            across(starts_with("language_doe"), function(x) ifelse(is.na(x), 0, x)),
-            survey_name = .env$survey_name,
-            date_started = get_time_stamp(ended_cat, ended_spa, which = "first"),
-            date_finished = get_time_stamp(ended_cat, ended_spa, which = "last"),
-            language_doe_catalan = get_doe(matches("catalan")),
-            language_doe_spanish = get_doe(matches("spanish")),
-            language_doe_others = 100 - rowSums(across(c(
-                language_doe_catalan,
-                language_doe_spanish
-            )))
-        ) %>%
+        dplyr::filter(response_id %in% participants_tmp$response_id,
+                      !if_any(matches("created_|ended_"), is.na)) %>% 
+        mutate(across(c(matches("created_|ended_"), date_birth),
+                      as_datetime),
+               across(starts_with("language_doe"),
+                      function(x) ifelse(is.na(x), 0, x)),
+               version = .env$version,
+               date_started = get_time_stamp(ended_cat,
+                                             ended_spa,
+                                             which = "first"),
+               date_finished = get_time_stamp(ended_cat,
+                                              ended_spa,
+                                              which = "last"),
+               language_doe_catalan = get_doe(matches("catalan")),
+               language_doe_spanish = get_doe(matches("spanish")),
+               language_doe_others = 100 - rowSums(across(c(
+                   language_doe_catalan,
+                   language_doe_spanish
+               )))) %>% 
         arrange(desc(date_finished)) %>%
         distinct(session, .keep_all = TRUE) %>%
-        rename(
-            edu_parent1 = demo_parent1,
-            edu_parent2 = demo_parent2
-        ) %>%
-        select(
-            starts_with("id"),
-            one_of(items_to_keep),
-            starts_with("cat_"),
-            starts_with("spa_")
-        ) %>%
-        pivot_longer(
-            cols = matches("cat_|spa_"),
-            names_to = "item",
-            values_to = "response"
-        ) %>%
+        rename(edu_parent1 = demo_parent1,
+               edu_parent2 = demo_parent2) %>%
+        select(child_id,
+               one_of(items_to_keep),
+               starts_with("cat_"),
+               starts_with("spa_")) %>%
+        pivot_longer(cols = matches("cat_|spa_"),
+                     names_to = "item",
+                     values_to = "response") %>%
         rename_with(function(x) gsub("language_", "", x), everything()) %>%
-        mutate(
-            language = ifelse(grepl("cat_", item), "Catalan", "Spanish"),
-            sex = ifelse(sex == 1, "Male", "Female"),
-            across(
-                starts_with("edu_"),
-                function(x) na_if(x, "")
-            )
-        ) %>%
         arrange(desc(date_finished)) %>%
-        distinct(id, code, item, .keep_all = TRUE)
+        distinct(child_id, response_id, time, item, .keep_all = TRUE) %>% 
+        mutate(language = ifelse(grepl("cat_", item), "Catalan", "Spanish"),
+               response_id = fix_response_id(response_id),
+               sex = ifelse(sex == 1, "Male", "Female"),
+               across(starts_with("edu_"),
+                      function(x) na_if(x, "")),
+               date_birth = lubridate::as_date(date_birth),
+               across(c(child_id, response_id, version, version_list,
+                        sex, edu_parent1, edu_parent2, item, language),
+                      as.character)) %>% 
+        relocate(child_id, response_id, time, version, version_list, item, response, language,
+                 starts_with("date_"), sex, starts_with("doe_")) %>% 
+        filter(!is.na(response))
 
     return(processed)
 }
 
-#' Collect survey data
-#'
-#' @importFrom cli cli_alert_success
+#' Merge surveys from the same formr run
 #' 
-#' @param version Character string indicating the name fo the formr run (must be one of "lockdown", "long", or "short")
-#' @param participants Participants dataset, as returned by [bvq::bvq_participants()]
-#' @param ... Unused.
+#' @param raw Raw survey data, as generated by [bvq::download_surveys()]
+#' @param participants_tmp Participants dataset, inherited from inside the `import_*()` function environment.
+#' @param version character string indicating the name of the BVQ version being processed (must be `"BL-Lockdown"`, `"BL-Short"`, or `"BL-Long-2"`).
 #' 
+#' @import dplyr
+#'
 #' @author Gonzalo Garcia-Castro
 #' 
 #' @noRd
 #' @keywords internal
 #' 
 #' @md
-collect_survey <- function(version, participants, ...) {
-
-    # validate version name
-    survey_options <- c("long", "short", "lockdown")
-    if (!(version %in% survey_options)) {
-        cli_abort("survey must be one of {survey_options}")
-    }
-
-    # get survey names
-    survey_names <- c("_log", "_welcome", "_consent", "_demo", "_language", 
-                      "_words_catalan", "_words_spanish")
-
-    # get survey names
-    surveys <- paste0("bilexicon_", version, "_0", c(1:6, 6), survey_names) 
-    if (version=="long") {
-        surveys <- gsub("_long", "", surveys)
-        surveys <- gsub("catalan", "cat", surveys)
-        surveys <- gsub("spanish", "spa", surveys)
-    }
-
-    # process participant info
-    if (missing(participants)) participants <- bvq_participants()
-    participants_tmp <- participants[participants$version %in% version, ]
-    if (version=="long") {
-        participants_tmp <- participants_tmp[participants_tmp$randomisation=="2", ]
-    }
-    participants_tmp <- participants_tmp[, colnames(participants_tmp)!="version"]
-
-    # download and process survey data
-    raw <- download_surveys(surveys) # fetch responses
-    # fix Spanish dataframe colnames
-    colnames(raw[[7]]) <- gsub("cat_", "spa_", colnames(raw[[7]])) 
-    raw[[1]] <- fix_logs_df(raw, participants_tmp) # fix logs dataframe
-    processed <- process_survey(raw, participants_tmp, version)
+merge_surveys <- function(raw, participants_tmp, version) {
 
-    if (interactive()) {
-        n_responses <- nrow(distinct(processed, code))
-        msg <- "{version} updated: {n_responses} response{?s} retrieved"
-        cli_alert_success(msg)
-    }
+    # merge data frames
+    raw_tmp <- raw %>%
+        lapply(select, -one_of(c("created", "modified", "ended", "expired"))) %>%
+        # if a single session ID has multiple entries, select most recent
+        reduce(inner_join, by = join_by(session), multiple = "all") %>%
+        mutate(version_list = version,
+               version = .env$version,
+               version_list = trimws(version_list, whitespace = "[\\h\\v]")) %>%
+        left_join(participants_tmp, by = join_by(response_id))
 
-    return(processed)
+    return(raw_tmp)
 }
 
 
+
 #' Fix logs dataframe
 #' 
 #' @param raw Named list with the contents of the surveys, as returned by [bvq::download_surveys()].
@@ -205,20 +225,23 @@ fix_logs_df <- function(raw, participants_tmp) {
 
     # fix logs dataframe
     logs <- raw[[1]]
-    names(logs)[names(logs) == "bl_code"] <- "code"
+    names(logs)[names(logs) %in% c("code", "bl_code")] <- "response_id"
     # variables to correct types
     logs[c("created", "ended")] <- lapply(logs[c("created", "ended")], as.POSIXct)
     # remove if missing any critical variable
-    logs$code <- fix_code(ifelse(logs$code=="", NA_character_, logs$code))
-    logs <- logs[!is.na(logs$code) & !is.na(logs$ended) & !is.na(logs$session), ]
+    logs$code <- fix_response_id(ifelse(logs$response_id=="", 
+                                        NA_character_, 
+                                        logs$response_id))
+    logs <- logs[!is.na(logs$response_id) & !is.na(logs$ended) & !is.na(logs$session), ]
     # fix codes known to be wrong
     logs <- fix_code_raw(logs)
     # remove codes not included in participants
-    logs <- logs[logs$code %in% participants_tmp$code, ]
+    logs <- logs[logs$response_id %in% participants_tmp$response_id, ]
     # get only last response of each code
     logs <- logs[order(logs$created, decreasing = TRUE), , drop = FALSE]
-    logs <- logs[!duplicated(logs$code), , drop = FALSE]
+    logs <- logs[!duplicated(logs$response_id), , drop = FALSE]
 
     return(logs)
 
 }
+