R/utils.R

#' Save an R object as data or results as CSV, or RDS files
#'
#' @param x A tabular R object
#' @param folder Folder in which to write the file. If the resulting path does not exist, a new directory will be generated.
#' @param formats Formats in which to write the files. Must be at least one of 'csv', or 'rds'
#' @param .sep Path separator, takes "/" by default
save_files <- function(x,
                       folder,
                       file_name = deparse(substitute(x)),
                       formats = c("csv", "rds"),
                       .sep = "/") {
  # check arguments
  if (!all(formats %in% c("csv", "rds"))) {
    cli::cli_abort("formats must be 'csv' or 'rds'")
  }

  # create directories if missing
  file_paths <- glue::glue("{folder}{.sep}{file_name}.{formats}")

  if (length(formats) > 1) {
    dirs <- glue::glue("{folder}{.sep}{formats}")
    dirs_exist <- dir.exists(dirs)

    if (any(!dirs_exist)) {
      which_missing <- formats[which(!dirs_exist)]
      missing_dir <- glue::glue("{folder}{.sep}{which_missing}{.sep}")

      invisible(purrr::map(missing_dir, dir.create))

      cli::cli_alert_warning("Created {.path {missing_dir}}")

      file_paths <-
        glue::glue("{folder}{.sep}{formats}{.sep}{file_name}.{formats}")
    }
  }

  # save files
  if ("csv" %in% formats) {
    write.csv(flatten_columns(x),
      file_paths[grepl(".csv", file_paths)],
      row.names = FALSE
    )
  }

  if ("rds" %in% formats) {
    saveRDS(x, file_paths[grepl(".rds", file_paths)])
  }

  if (interactive()) cli::cli_alert_success("Saved to {.path {folder}}")
}

#' Transform any list column in a data frame to collapsed character vector
#'
#' @param x A data frame
flatten_columns <- function(x) {
  mutate(
    x,
    across(
      where(is.list),
      function(x) {
        out <- purrr::map(x, function(x) {
          paste0(x, collapse = ", ")
        })
        unlist(out)
      }
    )
  )
}

#' Dictionary for predictor names
#'
#' @param data Dataset of responses as generated by the [get_responses()] function
#' @returns A named character vector
get_vars_dict <- function(responses) {
  var.names <- c(
    "b_Intercept[1]",
    "b_Intercept[2]",
    "b_n_phon_std",
    "b_freq_std",
    "b_age_std",
    "b_exposure_std",
    "b_doe_std",
    "b_lv_std",
    "b_exposure_std:lv_std",
    "b_age_std:exposure_std",
    "b_age_std:lv_std",
    "b_age_std:exposure_std:lv_std",
    "b_age_std:doe_std",
    "b_doe_std:lv_std",
    "b_age_std:doe_std:lv_std"
  )

  var.labels <- c(
    "Comprehension and Production",
    "Comprehension",
    "Length",
    "Frequency",
    "Age",
    "LEWF",
    "Exposure",
    "Cognateness",
    "Exposure \u00d7 Cognateness",
    "Age \u00d7 LEWF",
    "Age \u00d7 Cognateness",
    "Age \u00d7 LEWF \u00d7 Cognateness",
    "Age \u00d7 Exposure",
    "Exposure \u00d7 Cognateness",
    "Age \u00d7 Exposure \u00d7 Cognateness"
  )

  vars_dict <- var.labels
  names(vars_dict) <- var.names

  return(vars_dict)
}

#' Dictionary for predictor names
#'
#' @param data Dataset of responses as generated by the [get_responses()] function
#' @returns A named character vector
get_vars_dict_composite <- function(responses) {
  var.names <- c(
    "b_Intercept[1]",
    "b_Intercept[2]",
    "b_n_phon_std",
    "b_age_std",
    "b_exposure_std",
    "b_lv_std",
    "b_age_std:exposure_std",
    "b_exposure_std:lv_std",
    "b_age_std:lv_std",
    "b_age_std:exposure_std:lv_std"
  )

  var.labels <- c(
    "Comprehension and Production",
    "Comprehension",
    glue::glue("Phonemes (+1 SD, {round(sd(responses$n_phon), 2)} phonemes)"),
    glue::glue("Age (+1 SD, {round(sd(responses$age), 2)}, months)"),
    glue::glue("LEWF (+1 SD, {round(sd(responses$exposure), 2)})"),
    glue::glue("Cognateness (+1 SD, {round(sd(responses$lv), 2)})"),
    "Age \u00d7 LEWF",
    "LEWF \u00d7 Cognateness",
    "Age \u00d7 Cognateness",
    "Age \u00d7 LEWF \u00d7 Cognateness"
  )

  vars_dict <- var.labels
  names(vars_dict) <- var.names

  return(vars_dict)
}

#' Syllabify phonological transcriptions in X-SAMPA formats
#'
#' @param x A character string with a phonological transcription in X-SAMPA.
#' @param .sep A vector of character strings indicating the characters that will be used to separate syllables. Takes `"\\."` and `"\\\""` by default.
#'
#' @return A vector of characters in which each element is a syllable.
syllabify_xsampa <- function(x, .sep = c("\\.", "\\\"")) {
  syll <- strsplit(x, split = paste0(.sep, collapse = "|"))
  syll <- lapply(syll, function(x) x[x != ""])
  return(syll)
}

#' Remove punctuation from X-SAMPA transcriptions.
#'
#' @details Note that this function will effectively remove information about
#'   syllabification and stress from the phonological representations.
#'
#' @param x A character string with a phonological transcription in X-SAMPA format.
#'
#' @return A character string containing a phonological transcription in X-SAMPA format in which punctuation characters
#'   have been removed.
flatten_xsampa <- function(x) {
  str_rm <- c("\\.", "\\\\", ",", "/", "?", "'", '"')
  str <- gsub(paste0(str_rm, collapse = "|"), "", x)
  str <- gsub("\\{", "\\\\{", str)
  return(str)
}


#' Deal with repeated measures
#'
#' @param x A data frame containing a column for participants (each participant
#'   gets a unique ID), and a column for times (a numeric value indicating how
#'   many times each participant appears in the data frame counting this one).
#'   One participant may appear several times in the data frame, with each time
#'   with a unique value of `time`.
#' @param longitudinal A character string indicating what subset of the
#'   participants should be returned:
#'   * `"all"` (default) returns all participants.
#'   * `"no"` remove all participants with more than one response.
#'   * `"only"` returns only participants with more than one response in the
#'   dataset (i.e., longitudinal participants).
#'   * `"first"` returns the first response of each participant (participants with only one appearance are
#'   included).
#'   * `"last"` returns the last response from each participant (participants with only one response are included).
#' @returns A subset of the data frame `x` with only the selected cases,
#'   according to `longitudinal`.
get_longitudinal <- function(x, longitudinal = "all") {
  longitudinal_opts <- c("all", "no", "first", "last", "only")

  if (!(longitudinal %in% longitudinal_opts) && interactive()) {
    long_colapsed <- paste0(longitudinal_opts, collapse = ", ")
    cli_abort(paste0("longitudinal must be one of: ", long_colapsed))
  }

  repeated <- filter(distinct(x, child_id, time), n() > 1, .by = child_id)

  if (longitudinal == "no") {
    x <- filter(x, !(child_id
    %in% repeated$id))
  }
  if (longitudinal == "first") x <- filter(x, time == min(time), .by = child_id)
  if (longitudinal == "last") x <- filter(x, time == max(time), .by = child_id)
  if (longitudinal == "only") {
    x <- filter(x, child_id
    %in% repeated$child_id)
  }

  return(x)
}


#' Get CHILDES lexical frequencies
#'
#' @param collection CHILDES corpora from where to fetch transcriptions. Takes "Eng-NA" (North American English by default). See [CHILDES Index to corpora(https://childes.talkbank.org/access/) to see options
#' @param age_range Numeric vector of length two indicating the minimum and maximum age range of interest for which to compute lexical frequencies in the CHILDES corpora. Frequencies will be summarised across this age range using the mean
#' @paran ... Additional arguments passed to [childesr::get_types()]
get_childes_frequencies <- function(collection = "Eng-NA",
                                    age_range = c(10, 36),
                                    ...) {
  suppressMessages({
    roles <- c(
      "Mother",
      "Father",
      "Investigator",
      "Sibling",
      "Sister",
      "Grandmother",
      "Adult",
      "Friend",
      "Brother",
      "Visitor",
      "Relative",
      "Grandfather",
      "Teacher",
      "Student"
    )

    counts <- childesr::get_types(
      collection = collection,
      role = roles,
      ...
    )

    speaker_ids <- distinct(
      counts,
      collection_id,
      corpus_id,
      transcript_id,
      speaker_id
    )

    speakers <- speaker_ids |>
      left_join(
        childesr::get_speaker_statistics(collection = collection),
        by = c(
          "collection_id",
          "corpus_id",
          "speaker_id",
          "transcript_id"
        )
      ) |>
      select(
        collection_id,
        corpus_id,
        transcript_id,
        speaker_id,
        num_tokens
      )

    childes <- counts |>
      left_join(speakers,
        by = c(
          "collection_id",
          "corpus_id",
          "speaker_id",
          "transcript_id"
        )
      ) |>
      mutate(
        id = as.character(id),
        age_months = target_child_age,
        age_bin = as.integer(floor(age_months / 6) * 6),
        token = tolower(gloss)
      ) |>
      summarise(
        transcript_count = sum(count),
        transcript_num_tokens = sum(num_tokens),
        .by = c(age_bin, token, target_child_id, transcript_id)
      ) |>
      dplyr::filter(between(
        age_bin,
        age_range[1],
        age_range[2]
      )) |>
      summarise(
        freq_count = mean(transcript_count),
        total_count = mean(transcript_num_tokens),
        n_children = length(unique(target_child_id)),
        .by = token
      ) |>
      mutate(
        freq_million = freq_count / total_count * 1e6,
        freq_zipf = log10(freq_million) + 3
      ) |>
      relocate(
        token,
        n_children,
        freq_count,
        freq_million,
        freq_zipf
      )
  })

  return(childes)
}