-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.R
329 lines (296 loc) · 9.89 KB
/
utils.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
#' Save an R object as data or results as CSV, or RDS files
#'
#' @param x A tabular R object
#' @param folder Folder in which to write the file. If the resulting path does not exist, a new directory will be generated.
#' @param formats Formats in which to write the files. Must be at least one of 'csv', or 'rds'
#' @param .sep Path separator, takes "/" by default
save_files <- function(x,
folder,
file_name = deparse(substitute(x)),
formats = c("csv", "rds"),
.sep = "/") {
# check arguments
if (!all(formats %in% c("csv", "rds"))) {
cli::cli_abort("formats must be 'csv' or 'rds'")
}
# create directories if missing
file_paths <- glue::glue("{folder}{.sep}{file_name}.{formats}")
if (length(formats) > 1) {
dirs <- glue::glue("{folder}{.sep}{formats}")
dirs_exist <- dir.exists(dirs)
if (any(!dirs_exist)) {
which_missing <- formats[which(!dirs_exist)]
missing_dir <- glue::glue("{folder}{.sep}{which_missing}{.sep}")
invisible(purrr::map(missing_dir, dir.create))
cli::cli_alert_warning("Created {.path {missing_dir}}")
file_paths <-
glue::glue("{folder}{.sep}{formats}{.sep}{file_name}.{formats}")
}
}
# save files
if ("csv" %in% formats) {
write.csv(flatten_columns(x),
file_paths[grepl(".csv", file_paths)],
row.names = FALSE
)
}
if ("rds" %in% formats) {
saveRDS(x, file_paths[grepl(".rds", file_paths)])
}
if (interactive()) cli::cli_alert_success("Saved to {.path {folder}}")
}
#' Transform any list column in a data frame to collapsed character vector
#'
#' @param x A data frame
flatten_columns <- function(x) {
mutate(
x,
across(
where(is.list),
function(x) {
out <- purrr::map(x, function(x) {
paste0(x, collapse = ", ")
})
unlist(out)
}
)
)
}
#' Dictionary for predictor names
#'
#' @param data Dataset of responses as generated by the [get_responses()] function
#' @returns A named character vector
get_vars_dict <- function(responses) {
var.names <- c(
"b_Intercept[1]",
"b_Intercept[2]",
"b_n_phon_std",
"b_freq_std",
"b_age_std",
"b_exposure_std",
"b_doe_std",
"b_lv_std",
"b_exposure_std:lv_std",
"b_age_std:exposure_std",
"b_age_std:lv_std",
"b_age_std:exposure_std:lv_std",
"b_age_std:doe_std",
"b_doe_std:lv_std",
"b_age_std:doe_std:lv_std"
)
var.labels <- c(
"Comprehension and Production",
"Comprehension",
"Length",
"Frequency",
"Age",
"LEWF",
"Exposure",
"Cognateness",
"Exposure \u00d7 Cognateness",
"Age \u00d7 LEWF",
"Age \u00d7 Cognateness",
"Age \u00d7 LEWF \u00d7 Cognateness",
"Age \u00d7 Exposure",
"Exposure \u00d7 Cognateness",
"Age \u00d7 Exposure \u00d7 Cognateness"
)
vars_dict <- var.labels
names(vars_dict) <- var.names
return(vars_dict)
}
#' Dictionary for predictor names
#'
#' @param data Dataset of responses as generated by the [get_responses()] function
#' @returns A named character vector
get_vars_dict_composite <- function(responses) {
var.names <- c(
"b_Intercept[1]",
"b_Intercept[2]",
"b_n_phon_std",
"b_age_std",
"b_exposure_std",
"b_lv_std",
"b_age_std:exposure_std",
"b_exposure_std:lv_std",
"b_age_std:lv_std",
"b_age_std:exposure_std:lv_std"
)
var.labels <- c(
"Comprehension and Production",
"Comprehension",
glue::glue("Phonemes (+1 SD, {round(sd(responses$n_phon), 2)} phonemes)"),
glue::glue("Age (+1 SD, {round(sd(responses$age), 2)}, months)"),
glue::glue("LEWF (+1 SD, {round(sd(responses$exposure), 2)})"),
glue::glue("Cognateness (+1 SD, {round(sd(responses$lv), 2)})"),
"Age \u00d7 LEWF",
"LEWF \u00d7 Cognateness",
"Age \u00d7 Cognateness",
"Age \u00d7 LEWF \u00d7 Cognateness"
)
vars_dict <- var.labels
names(vars_dict) <- var.names
return(vars_dict)
}
#' Syllabify phonological transcriptions in X-SAMPA formats
#'
#' @param x A character string with a phonological transcription in X-SAMPA.
#' @param .sep A vector of character strings indicating the characters that will be used to separate syllables. Takes `"\\."` and `"\\\""` by default.
#'
#' @return A vector of characters in which each element is a syllable.
syllabify_xsampa <- function(x, .sep = c("\\.", "\\\"")) {
syll <- strsplit(x, split = paste0(.sep, collapse = "|"))
syll <- lapply(syll, function(x) x[x != ""])
return(syll)
}
#' Remove punctuation from X-SAMPA transcriptions.
#'
#' @details Note that this function will effectively remove information about
#' syllabification and stress from the phonological representations.
#'
#' @param x A character string with a phonological transcription in X-SAMPA format.
#'
#' @return A character string containing a phonological transcription in X-SAMPA format in which punctuation characters
#' have been removed.
flatten_xsampa <- function(x) {
str_rm <- c("\\.", "\\\\", ",", "/", "?", "'", '"')
str <- gsub(paste0(str_rm, collapse = "|"), "", x)
str <- gsub("\\{", "\\\\{", str)
return(str)
}
#' Deal with repeated measures
#'
#' @param x A data frame containing a column for participants (each participant
#' gets a unique ID), and a column for times (a numeric value indicating how
#' many times each participant appears in the data frame counting this one).
#' One participant may appear several times in the data frame, with each time
#' with a unique value of `time`.
#' @param longitudinal A character string indicating what subset of the
#' participants should be returned:
#' * `"all"` (default) returns all participants.
#' * `"no"` remove all participants with more than one response.
#' * `"only"` returns only participants with more than one response in the
#' dataset (i.e., longitudinal participants).
#' * `"first"` returns the first response of each participant (participants with only one appearance are
#' included).
#' * `"last"` returns the last response from each participant (participants with only one response are included).
#' @returns A subset of the data frame `x` with only the selected cases,
#' according to `longitudinal`.
get_longitudinal <- function(x, longitudinal = "all") {
longitudinal_opts <- c("all", "no", "first", "last", "only")
if (!(longitudinal %in% longitudinal_opts) && interactive()) {
long_colapsed <- paste0(longitudinal_opts, collapse = ", ")
cli_abort(paste0("longitudinal must be one of: ", long_colapsed))
}
repeated <- filter(distinct(x, child_id, time), n() > 1, .by = child_id)
if (longitudinal == "no") {
x <- filter(x, !(child_id
%in% repeated$id))
}
if (longitudinal == "first") x <- filter(x, time == min(time), .by = child_id)
if (longitudinal == "last") x <- filter(x, time == max(time), .by = child_id)
if (longitudinal == "only") {
x <- filter(x, child_id
%in% repeated$child_id)
}
return(x)
}
#' Get CHILDES lexical frequencies
#'
#' @param collection CHILDES corpora from where to fetch transcriptions. Takes "Eng-NA" (North American English by default). See [CHILDES Index to corpora(https://childes.talkbank.org/access/) to see options
#' @param age_range Numeric vector of length two indicating the minimum and maximum age range of interest for which to compute lexical frequencies in the CHILDES corpora. Frequencies will be summarised across this age range using the mean
#' @paran ... Additional arguments passed to [childesr::get_types()]
get_childes_frequencies <- function(collection = "Eng-NA",
age_range = c(10, 36),
...) {
suppressMessages({
roles <- c(
"Mother",
"Father",
"Investigator",
"Sibling",
"Sister",
"Grandmother",
"Adult",
"Friend",
"Brother",
"Visitor",
"Relative",
"Grandfather",
"Teacher",
"Student"
)
counts <- childesr::get_types(
collection = collection,
role = roles,
...
)
speaker_ids <- distinct(
counts,
collection_id,
corpus_id,
transcript_id,
speaker_id
)
speakers <- speaker_ids |>
left_join(
childesr::get_speaker_statistics(collection = collection),
by = c(
"collection_id",
"corpus_id",
"speaker_id",
"transcript_id"
)
) |>
select(
collection_id,
corpus_id,
transcript_id,
speaker_id,
num_tokens
)
childes <- counts |>
left_join(speakers,
by = c(
"collection_id",
"corpus_id",
"speaker_id",
"transcript_id"
)
) |>
mutate(
id = as.character(id),
age_months = target_child_age,
age_bin = as.integer(floor(age_months / 6) * 6),
token = tolower(gloss)
) |>
summarise(
transcript_count = sum(count),
transcript_num_tokens = sum(num_tokens),
.by = c(age_bin, token, target_child_id, transcript_id)
) |>
dplyr::filter(between(
age_bin,
age_range[1],
age_range[2]
)) |>
summarise(
freq_count = mean(transcript_count),
total_count = mean(transcript_num_tokens),
n_children = length(unique(target_child_id)),
.by = token
) |>
mutate(
freq_million = freq_count / total_count * 1e6,
freq_zipf = log10(freq_million) + 3
) |>
relocate(
token,
n_children,
freq_count,
freq_million,
freq_zipf
)
})
return(childes)
}