Merge pull request #19 from gongcastro/test

Replace `group_by()`/`ungroup()` for experimental argument `.by` in d…
gongcastro · May 11, 2023 · be17390 · be17390
2 parents 5280fb6 + c061ed7
commit be17390
Show file tree

Hide file tree

Showing 7 changed files with 116 additions and 144 deletions.
diff --git a/.covrignore b/.covrignore
@@ -2,3 +2,4 @@ R/globals.R
 R/import.R
 R/participants.R
 R/connect.R
+R/responses.R
diff --git a/NEWS.md b/NEWS.md
@@ -4,6 +4,8 @@
 * Add examples and tests for `get_time_stamp()` and `get_doe()`
 * Add documentation for new version of `bvq_norms()` in its vignette
 * Use .covrignore
+* Replace `group_by()`/`ungroup()` for experimental argument `.by` in dplyr functions
+
 
 # bvq 0.3.0
 

diff --git a/R/logs.R b/R/logs.R
@@ -7,7 +7,7 @@
 #'   many times a participant has been sent the questionnaire, independently of
 #'   whether a response was obtained from them later.
 #'   
-#' @import dplyr
+#' @import dplyr 
 #' @importFrom lubridate as_date
 #' @importFrom lubridate today
 #' @importFrom lubridate as_datetime
@@ -57,63 +57,61 @@ bvq_logs <- function(participants = NULL,
                      other_threshold = 0.10,
                      ...) {
 
-    suppressMessages({
-
-        # get participant information
-        if (is.null(participants)) participants <- bvq_participants()
-        if (is.null(responses)) responses <- bvq_responses(participants)
-
-        # get n items answered by participants (depends on the questionnaire version)
-        total_items <- studies %>%
-            distinct(version, language, n) %>%
-            summarise(total_items = sum(n), 
-                      .by = version)
-
-        grouping_vars <- c("id", "date_birth", "time",
-                           "edu_parent1", "edu_parent2",
-                           "date_birth", "date_started", "date_finished", 
-                           "doe_spanish", "doe_catalan",
-                           "doe_others", "date_birth", "code", "study", 
-                           "version")
-
-        vars <- c("code", "time", "study", "version", "age",
-                  "date_birth", "date_started", "date_finished", 
-                  "duration", "dominance", "lp", 
-                  "edu_parent1", "edu_parent2",
-                  "doe_spanish", "doe_catalan", "doe_others", "completed")
-
-        # generate logs
-        logs <- responses %>%
-            # total items to fill by each participant (varies across versions)
-            summarise(complete_items = sum(!is.na(response)),
-                      .by = one_of(grouping_vars)) %>%
-            left_join(total_items,
-                      by = join_by(version)) %>%
-            left_join(select(participants, -c(date_birth, version)),
-                      by = join_by(id, time, code, study)) %>%
-            drop_na(id) %>%
-            mutate(
-                # define language profiles based on thresholds
-                lp = case_when(doe_catalan >= bilingual_threshold ~ "Monolingual",
-                               doe_spanish >= bilingual_threshold ~ "Monolingual",
-                               doe_others > other_threshold ~ "Other",
-                               .default = "Bilingual"),
-                # define language dominance
-                dominance = case_when(doe_catalan > doe_spanish ~ "Catalan",
-                                      doe_spanish > doe_catalan ~ "Spanish",
-                                      doe_catalan == doe_spanish ~ sample(c("Catalan", "Spanish"), 1)),
-                age = diff_in_time(date_finished, date_birth, "months"),
-                duration = diff_in_time(date_finished, date_started, "days")
-            ) %>% 
-            # compute participant's progress through the questionnaire
-            rowwise() %>%
-            mutate(progress = complete_items / total_items,
-                   completed = progress >= 0.95) %>%
-            ungroup() %>%
-            # select relevant columns and reorder them
-            select(id, one_of(vars)) %>%
-            arrange(desc(date_finished))
-    })
+    # get participant information
+    if (is.null(participants)) participants <- bvq_participants()
+    if (is.null(responses)) responses <- bvq_responses(participants)
+
+    # get n items answered by participants (depends on the questionnaire version)
+    total_items <- studies %>%
+        distinct(version, language, n) %>%
+        summarise(total_items = sum(n), 
+                  .by = version)
+
+    grouping_vars <- c("id", "date_birth", "time",
+                       "edu_parent1", "edu_parent2",
+                       "date_birth", "date_started", "date_finished", 
+                       "doe_spanish", "doe_catalan",
+                       "doe_others", "date_birth", "code", "study", 
+                       "version")
+
+    vars <- c("code", "time", "study", "version", "age",
+              "date_birth", "date_started", "date_finished", 
+              "duration", "dominance", "lp", 
+              "edu_parent1", "edu_parent2",
+              "doe_spanish", "doe_catalan", "doe_others", "completed")
+
+    # generate logs
+    logs <- responses %>%
+        # total items to fill by each participant (varies across versions)
+        summarise(complete_items = sum(!is.na(response)),
+                  .by = one_of(grouping_vars)) %>%
+        left_join(total_items,
+                  by = join_by(version)) %>%
+        left_join(select(participants, -c(date_birth, version)),
+                  by = join_by(id, time, code, study)) %>%
+        drop_na(id) %>%
+        mutate(
+            # define language profiles based on thresholds
+            lp = case_when(
+                doe_catalan >= bilingual_threshold ~ "Monolingual",
+                doe_spanish >= bilingual_threshold ~ "Monolingual",
+                doe_others > other_threshold ~ "Other",
+                .default = "Bilingual"
+            ),
+            # define language dominance
+            dominance = case_when(
+                doe_catalan > doe_spanish ~ "Catalan",
+                doe_spanish > doe_catalan ~ "Spanish",
+                doe_catalan == doe_spanish ~ sample(c("Catalan", "Spanish"), 1)),
+            age = diff_in_time(date_finished, date_birth, "months"),
+            duration = diff_in_time(date_finished, date_started, "days")
+        ) %>% 
+        # compute participant's progress through the questionnaire
+        mutate(progress = complete_items / total_items,
+               completed = progress >= 0.95) %>%
+        # select relevant columns and reorder them
+        select(id, one_of(vars)) %>%
+        arrange(desc(date_finished))
 
     return(logs)
 }
diff --git a/R/responses.R b/R/responses.R
@@ -51,52 +51,33 @@ bvq_responses <- function(participants = NULL,
                           longitudinal = "all",
                           ...) {
 
-    bvq_connect() # get credentials to Google and formr
-
-    # get participant information
-    if (is.null(participants)) participants <- bvq_participants()
-
     # retrieve data from formr
     formr2 <- import_formr2() # formr2
     formr_lockdown <- import_formr_lockdown() # formr-lockdown
     formr_short <- import_formr_short() # formr-lockdown
 
-    # merge data
-    suppressMessages({
-
-        cbc_studies <- c("CBC", "Signs", "Negation", "Inhibition")
-
-        responses <- list(formr1, formr2, formr_short, formr_lockdown) %>%
-            bind_rows() %>%
-            distinct(id, code, item, .keep_all = TRUE) %>%  
-            mutate(across(c(starts_with("date_"), time_stamp), as_date),
-                   date_finished = coalesce(time_stamp, date_finished),
-                   version = case_when(
-                       study %in% "DevLex" ~ "DevLex",
-                       study %in% cbc_studies ~ "CBC",
-                       .default = version
-                   ),
-                   time = ifelse(is.na(time), 1, time),
-                   version = trimws(version, whitespace = "[\\h\\v]"),
-            ) %>%
-            fix_item() %>%
-            fix_doe() %>%
-            mutate(across(starts_with("doe_"), 
-                          function(x) x / 100)) %>%
-            fix_sex() %>%
-            mutate(study = ifelse(is.na(study), "BiLexicon", study)) %>% 
-            fix_id_exp() %>%
-            drop_na(date_finished) %>%
-            get_longitudinal(longitudinal = longitudinal) %>%
-            arrange(desc(date_finished)) %>% 
-            select(id, time, code, study,
-                   version, randomisation,
-                   starts_with("date_"),
-                   item, response, sex,
-                   starts_with("doe_"),
-                   starts_with("edu_"))
-    })
-
+    responses <- list(formr1, formr2, formr_short, formr_lockdown) %>%
+        bind_rows() %>%
+        distinct(id, code, item, .keep_all = TRUE) %>%  
+        mutate(across(c(starts_with("date_"), time_stamp), as_date),
+               date_finished = coalesce(time_stamp, date_finished),
+               time = ifelse(is.na(time), 1, time),
+               version = trimws(version, whitespace = "[\\h\\v]")) %>%
+        fix_item() %>%
+        fix_doe() %>%
+        mutate(across(starts_with("doe_"), function(x) x / 100)) %>%
+        fix_sex() %>%
+        mutate(study = ifelse(is.na(study), "BiLexicon", study)) %>% 
+        fix_id_exp() %>%
+        drop_na(date_finished) %>%
+        get_longitudinal(longitudinal = longitudinal) %>%
+        arrange(desc(date_finished)) %>% 
+        select(id, time, code, study,
+               version, randomisation,
+               starts_with("date_"),
+               item, response, sex,
+               starts_with("doe_"),
+               starts_with("edu_"))
 
     return(responses)
 }
diff --git a/R/utils.R b/R/utils.R
@@ -88,12 +88,19 @@ get_doe <- function(...) {
 #' 
 #' @param x Vector of `code` whose values should be fixed.
 #' @author Gonzalo Garcia-Castro
+#' 
+#' @examples
+#' code_vctr <- c("BL0123", "bl0123", "Bl0123", "BLBL0123",
+#'                 "bi0123", "blo123", "B0123", "BI0123",
+#'                 " BL0123")
+#'                 
+#' fix_code(code_vctr)
 fix_code <- function(x) {
     x <- toupper(trimws(x))
-    x <- gsub(".*BL", "", x) 
     x <- gsub("O", "0", x)
     x <- gsub("I", "L", x)
-    x <- ifelse(!grepl("BL", x), paste0("BL", x), x)
+    x <- gsub("[^\\d]+", "", x, perl=TRUE)
+    x <- paste0("BL", x)
 
     return(x)
 }
@@ -163,24 +170,11 @@ fix_sex <- function(x) { # nocov start
                     "Female",
                     x$sex)
 
-    x$sex <- ifelse(x$id_bvq %in% c("bilexicon_1447"), "Male", x$sex)
+    x$sex <- ifelse(x$id_bvq=="bilexicon_1447", "Male", x$sex)
 
     return(x)
 } # nocov end
 
-#' Fix postcode
-#' 
-#' @param x Vector of `postcode` whose values should be fixed
-#' @author Gonzalo Garcia-Castro
-fix_postcode <- function(x) { 
-
-    pcd <- x$postcode
-    pcd <- ifelse(nchar(pcd) < 5, paste0("0", pcd), pcd)
-    pcd <- ifelse(nchar(pcd) < 5, NA_character_, pcd)
-    x$postcode <- pcd
-
-    return(x)
-}
 
 #' Fix item
 #' 
@@ -232,49 +226,36 @@ fix_id_exp <- function(x) { # nocov start
 #'   * `"first"` returns the first response of each participant (participants with only one appearance are
 #'   included).
 #'   * `"last"` returns the last response from each participant (participants with only one response are included).
-#' @importFrom dplyr group_by
 #' @importFrom dplyr distinct
 #' @importFrom dplyr n
 #' @importFrom dplyr filter
-#' @importFrom dplyr ungroup
 #' @returns A subset of the data frame `x` with only the selected cases,
 #'   according to `longitudinal`.
 #' @author Gonzalo Garcia-Castro
 #' @examples 
 #' id <- c(1, 1, 1, 2, 2, 3, 4, 4, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10)
 #' sums <- rle(sort(id))[["lengths"]]
 #' dat <- data.frame(id, time = unlist(sapply(sums, function(x) seq(1, x))))
+#' 
+#' (dat)
+#' 
 #' get_longitudinal(dat, "first")
 #' get_longitudinal(dat, "only")
 get_longitudinal <- function(x, longitudinal = "all") {
 
     longitudinal_opts <- c("all", "no", "first", "last", "only")
 
     if (!(longitudinal %in% longitudinal_opts) && interactive()) {
-        cli_abort(paste0("longitudinal must be one of: ", 
-                         paste0(longitudinal_opts, collapse = ", ")))
-    }
-
-    repeated <- distinct(x, id, time) %>%
-        group_by(id) %>%
-        filter(n() > 1) %>%
-        ungroup()
-
-    if (longitudinal == "no") x <- x[!(x$id %in% repeated$id), ]
-
-    if (longitudinal == "first") {
-        x <- group_by(x, id) %>%
-            filter(time == min(time, na.rm = TRUE)) %>%
-            ungroup()
+        long_colapsed <- paste0(longitudinal_opts, collapse = ", ")
+        cli_abort(paste0("longitudinal must be one of: ", long_colapsed))
     }
 
-    if (longitudinal == "last") {
-        x <- group_by(x, id) %>%
-            filter(time == max(time, na.rm = TRUE)) %>%
-            ungroup()
-    } 
+    repeated <- filter(distinct(x, id, time), n() > 1, .by = id) 
 
-    if (longitudinal == "only") x <- x[x$id %in% repeated$id, ]
+    if (longitudinal == "no") x <- filter(x, !(id %in% repeated$id))
+    if (longitudinal == "first") x <- filter(x, time==min(time), .by = id)
+    if (longitudinal == "last") x <- filter(x, time==max(time), .by = id) 
+    if (longitudinal == "only") x <- filter(x, id %in% repeated$id)
 
     return(x)
 }

diff --git a/R/vocabulary.R b/R/vocabulary.R
@@ -26,8 +26,7 @@
 #'   `"prop"`. If `"prop"` (default), vocabulary metrics are calculated as
 #'   proportions. If `"count"`, vocabulary metrics are reported as counts
 #'   (number of words). 
-#' @param ... Extra arguments that will be passed to the `bvq_responses`.
-#'   function.
+#' @param ... Extra arguments that will be passed to [bvq::bvq_responses()]`.
 #' @returns A dataset (actually, a [tibble::tibble] with each participant's
 #'   comprehensive and/or vocabulary size in each language. This data frame
 #'   contains the following variables:
@@ -170,8 +169,8 @@ bvq_vocabulary <- function(participants,
                    by = join_by(id, time, type, !!!.by)) %>%
             mutate(across(matches("concept|te"), 
                           function(x) ifelse(is.na(x), as.integer(0), x))) %>%
-            select(any_of(c("id", "time", "age", "type", .by)), matches(.scale)) %>%
-            select(-ends_with(which_col_not))
+            select(any_of(c("id", "time", "age", "type", .by)),
+                   matches(.scale), (-ends_with(which_col_not)))
 
         return(vocabulary)
 

diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R
@@ -41,15 +41,15 @@ test_that("get_doe works", {
                 doe_other = 1-get_doe(matches("cat|spa")),
                 doe_cat = get_doe(doe_cat_1, doe_cat_2),
                 doe_spa = get_doe(matches("spa")))
-                
+
     expect_true(all(vapply(y, class, "character")=="numeric"))
     expect_true(all(sapply(y, \(x) (x>=0) & (x <= 1))))
     expect_equal(y$doe_other, 1-rowSums(x))
     expect_equal(y$doe_cat, rowSums(x[, c("doe_cat_1", "doe_cat_2")]))
     expect_equal(y$doe_spa, rowSums(x[, c("doe_spa_1", "doe_spa_2")]))
 })
 
-test_that("get_longitudinal works correctly", {
+test_that("get_longitudinal works", {
     id <- c(1, 1, 1, 2, 2, 3, 4, 4, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10)
     sums <- rle(sort(id))[["lengths"]]
     dat <- data.frame(id, time = unlist(sapply(sums, function(x) seq(1, x))))
@@ -62,3 +62,13 @@ test_that("get_longitudinal works correctly", {
     expect_identical(get_longitudinal(dat, "last")$id, unique(dat$id))
 
 })
+
+
+test_that("fix_code works", {
+    code_vctr <- c("BL0123", "bl0123", "Bl0123", "BLBL0123",
+                   "bi0123", "blo123", "B0123", "BI0123",
+                   " BL0123")
+
+    expect_equal(unique(fix_code(code_vctr)), "BL0123")
+})
+