Fix IC's email, add again the model diagnostics appendix, and add not…

…es in formula
gongcastro · Aug 13, 2023 · d0a26d4 · d0a26d4
1 parent 816d424
commit d0a26d4
Show file tree

Hide file tree

Showing 12 changed files with 261 additions and 39 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -1,13 +1,12 @@
 FROM rocker/rstudio:4.2.2
 
-
 LABEL "about" = "A Docker container for the cognate-begininings study" \
     "author" = "Gonzalo Garcia-Castro <[email protected]>"\
     "github" = "https://github.com/gongcastro/cognate-beginnings" \
     "osf" = "https://osf.io/hy984/" \
     "source"="https://github.com/gongcastro/cognate-beginnings/blob/main/Dockerfile"
 
-# add C++ dependencies
+# add system dependencies
 RUN apt-get update && \
     apt-get install -y libxml2-dev \
     libglpk-dev \
@@ -34,9 +33,7 @@ RUN apt-get update && \
     libicu-dev
 
 # copy the whole directory to /rstudio (working directory in Posit Cloud)
-#RUN mkdir /cognate-beginnings/ && chown -c rstudio /cognate-beginnings/
 COPY . /home/rstudio/
-#RUN cd /cognate-beginnings/
 WORKDIR /home/rstudio/
 
 # install Quarto

diff --git a/_targets/meta/meta b/_targets/meta/meta
@@ -1,6 +1,6 @@
 name|type|data|command|depend|seed|path|time|size|bytes|format|repository|iteration|parent|children|seconds|warnings|error
-.Random.seed|object|f2008ece88cd2775|||||||||||||||
-appendix|stem|357f2e4532634282|46650b2562dabf32|a2f54ca013ed64d9|1554002692|manuscript/appendix.pdf*manuscript/appendix.qmd|t19564.6388121597s|f9b4fad96ede03f0|72265|file|local|vector|||31.82||
+.Random.seed|object|b848b32295dd6844|||||||||||||||
+appendix|stem|b6d1ae6ace9fe9b2|d394e6929dfd1cc7|32d8fd80fb90a66d|1554002692|manuscript/appendix.pdf*manuscript/appendix.qmd|t19582.6784998602s|0b8fef27a334cf5e|766008|file|local|vector|||69.28||
 bvq_data|stem|b9cdbf2e4a893895|1713fb13bcb0ffed|abbee844f3596b03|-1902070830||t19559.5893925943s|63253b11449ae909|1233401|rds|local|vector|||1.84||
 bvq_data_file|stem|2eeaa6c449fa1e44|48eaf072f2413071|ef46db3751d8e999|1011699987||t19559.5893425132s|d586f792a395948c|68|rds|local|vector|||0||
 childes|stem|637fa6be8f7bc26f|257adfb4dcdab016|55115efc18bbf37d|-1780705379||t19559.5893422066s|a003be010e86b4f4|534735|rds|local|vector|||71.14||
@@ -25,7 +25,7 @@ get_vars_dict|function|10a30763d4598b29|||||||||||||||
 get_vars_dict_doe|function|f54423dd2f43698b|||||||||||||||
 items|stem|02e41dc21548abba|49b6cd25c41a1b0f|794808e876deba6b|-986180865||t19559.5934274803s|8a1a7c406d603fbb|26069|rds|local|vector|||0.6||
 items_test|stem|32af597cbdb2a163|762d2cbc9b05fe3c|740911248080b911|883923261||t19559.5934444377s|ac45463db19e2cbe|48|rds|local|vector|||1.26||
-manuscript|stem|7db995dfb8441cad|e8f10768ffdf98ee|c1b03181b33425b9|615889701|manuscript/manuscript.pdf*manuscript/manuscript.qmd|t19581.6983569696s|dd278c33f1016749|284060|file|local|vector|||38.88||
+manuscript|stem|79d510624c960174|e8f10768ffdf98ee|c1b03181b33425b9|615889701|manuscript/manuscript.pdf*manuscript/manuscript.qmd|t19582.6846384484s|b7441086a0c6e44c|284360|file|local|vector|||36.95||
 model_convergence|stem|ca2f52d5efb4f3c0|68cbac5a8705de12|5ca4901a79f93b0d|1264401096||t19564.3860637151s|40173b4f8b5e89ec|124886|rds|local|vector|||53.46||
 model_doe|stem|9bc65508804e8e95|24a8cbde6537d6ff|0c1b3778fcb624c4|1345599899||t19564.3851348444s|2279300c61bae421|150172129|rds|local|vector|||2.03||
 model_draws|stem|eeb914dd253556f7|1fff6399ba521c80|ab23309017e56441|-665592718||t19564.3899257505s|43497b512cb14b56|273272|rds|local|vector|||1.09||

diff --git a/docs/sitemap.xml b/docs/sitemap.xml
@@ -2,14 +2,14 @@
 <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
   <url>
     <loc>https://github.com/gongcastro/cognate-beginnings/index.html</loc>
-    <lastmod>2023-08-12T17:25:42.299Z</lastmod>
+    <lastmod>2023-08-13T16:27:31.113Z</lastmod>
   </url>
   <url>
     <loc>https://github.com/gongcastro/cognate-beginnings/docs/_repro.html</loc>
-    <lastmod>2023-08-12T17:25:43.916Z</lastmod>
+    <lastmod>2023-08-13T16:27:32.655Z</lastmod>
   </url>
   <url>
     <loc>https://github.com/gongcastro/cognate-beginnings/docs/_data-dictionary.html</loc>
-    <lastmod>2023-08-12T17:25:53.443Z</lastmod>
+    <lastmod>2023-08-13T16:27:34.690Z</lastmod>
   </url>
 </urlset>
diff --git a/manuscript/_appendix/_model-diagnostics.qmd b/manuscript/_appendix/_model-diagnostics.qmd
@@ -0,0 +1,153 @@
+We used Stan [@carpenter2017stan] as the probabilistic language behind the estimation of our Bayesian models in this study, with `brms` as its R interface [@burkner2017brms]. This language implements the Markov Chain Monte-Carlo (MCMC) algorithm to explore the posterior distribution of the model. Broadly, this algorithm is used to iteratively sample the joint sampling space of the parameters to be estimated in the model, and compute, for each value sampled, its likelihood under some probability distribution previously defined. We ran `r dim(model_fit$fit)[2]` MCMC chains, each `r dim(model_fit$fit)[1]` iterations long. The correct performance of this algorithm is critical to the quality of the statistical evidence to which the outcomes of the model lead.
+
+One way to diagnose the behaviour of MCMC is to inspect whether the different MCMC chains (if more than one) have converged to a similar region of the posterior. The Gelman-Rubin diagnostic [$\hat{R}$ or R-hat @gelman1992inference] provides a measure of chain convergence by comparing the variance within each chain *versus* the variance between chains. Both are expected to be identical when chains have perfectly converged, so that $\hat{R} = 1$. Values lower than 1.01 are recommended, while values higher than 1.05 indicate that chains might have trouble converging and therefore the estimated parameters must be taken with caution. @fig-rhats-neffs (A) shows the distribution of $\hat{R}$ values for the coefficients of the fixed effects of our models, which we used for statistical inference. Most values are lower than 1.01, and never higher than 1.05, which provides evidence of successful MCMC convergence.
+
+Another diagnostic of good MCMC convergence is the ratio of effective sample size to total sample size ($N_{eff}/N$), which indicates the proportion of samples in the chain that resulted from a non-divergent transition. Values closer to 1 are ideal, as they indicate that all posterior samples from the MCMC were used to estimate the posterior distribution of the parameter. Values larger than 0.1 are recommended. @fig-rhats-neffs (B) shows the distribution of the effective sample sizes of the coefficients of the fixed effects in our models. Most values are larger than 0.1, although model 0 ($\mathcal{M}_0$) accumulates most effective sample sizes close to 0.1.
+
+```{r}
+#| label: fig-rhats-neffs
+#| fig-cap: "MCMC convergence diagnostic of all parameters in the model. Each dot represents the score of one parameter. (A) Distribution of the Gelman-Rubin (R-hat) scores. (B) Distribution of the ratio of effective sample size."
+#| echo: false
+#| message: false
+#| warning: false
+#| fig-height: 3
+#| fig-width: 6
+#| out-width: 100%
+# Two stacked dot plots built from `model_convergence` and combined with
+# patchwork: (A) R-hat scores, (B) effective sample size ratios.
+# The chunk label is set once, through `#| label`, to avoid a duplicated
+# label between the chunk header and the pipe comments.
+model_convergence |>
+    ggplot(aes(.rhat)) +
+    geom_dots(colour = "black") +
+    # dashed line: recommended upper bound for R-hat
+    geom_vline(xintercept = 1.01, linetype = "dashed") +
+    labs(
+        x = "R-hat",
+        y = "MCMC samples"
+    ) +
+    model_convergence |>
+    ggplot(aes(.neff)) +
+    geom_dots(colour = "black") +
+    # dashed line: recommended lower bound for the effective sample size
+    # ratio (0.1); the R-hat threshold of 1.01 does not apply to this panel
+    geom_vline(xintercept = 0.1, linetype = "dashed") +
+    labs(
+        x = "Effective sample size ratio",
+        y = "MCMC samples"
+    ) +
+    plot_layout(ncol = 1) &
+    plot_annotation(tag_levels = "A") &
+    theme(
+        legend.position = "none",
+        panel.grid = element_blank()
+    )
+```
+
+{{< pagebreak >}}
+
+### Posterior draws and bi-variate scatterplots {.unnumbered}
+
+```{r}
+#| label: fig-model-pairs
+#| fig-cap: "Marginal distribution and bi-variate scatterplot of posterior samples for the fixed regression coefficients in Model 3."
+#| fig-height: 8
+#| fig-width: 10
+#| out-width: 100%
+#| out-height: 100%
+#| echo: false
+#| warning: false
+#| message: false
+# NOTE(review): `titles` is not used anywhere in this chunk (the pairs plot
+# takes its labels from `columnLabels` below) -- confirm whether a later
+# chunk reads it before removing it.
+titles <- c(
+    "Comprehension",
+    "Production",
+    "Age",
+    "Length",
+    "Exposure",
+    "Cognateness",
+    "Age × Exposure",
+    "Age × Cognateness",
+    "Exposure × Cognateness",
+    "Age × Exposure × Cognateness"
+)
+
+# Posterior draws of the coefficients, from the first intercept through the
+# three-way interaction (column range as stored in the draws data frame).
+posterior <- as_draws_df(model_fit) |>
+    select(`b_Intercept[1]`:`b_age_std:exposure_std:lv_std`)
+
+# Diagonal panels: density slab of a single coefficient.
+# `data` and `mapping` are supplied by GGally::ggpairs(); `...` is ignored.
+my_diag <- function(data, mapping, ...) {
+    ggplot(data = data, mapping = mapping) +
+        stat_slab(
+            colour = "white",
+            fill = "grey15"
+        )
+}
+
+# Upper-triangle panels: 2D density raster of a pair of coefficients.
+# `data` and `mapping` are supplied by GGally::ggpairs(); `...` is ignored.
+my_upper <- function(data, mapping, ...) {
+    ggplot(data = data, mapping = mapping) +
+        # after_stat() replaces the deprecated `..density..` notation
+        stat_density_2d(aes(fill = after_stat(density)),
+            geom = "raster",
+            contour = FALSE
+        ) +
+        scale_fill_distiller(
+            palette = "Greys",
+            direction = 1
+        )
+}
+
+posterior |>
+    GGally::ggpairs(
+        upper = list(continuous = my_upper),
+        diag = list(continuous = my_diag),
+        lower = NULL,
+        columnLabels = c(
+            "Intercept\n(Comprehension)",
+            "Intercept\n(Production)",
+            "Age", "Length", "Exposure",
+            "Levenshtein", "Age × Exposure",
+            "Age × Levenshtein", "Exposure × Levenshtein",
+            "Age × Exposure ×\nLevenshtein"
+        ),
+        axisLabels = "none"
+    ) +
+    theme(
+        strip.text = element_text(size = 5),
+        panel.grid = element_blank(),
+        panel.border = element_blank()
+    )
+
+```
+
+{{< pagebreak >}}
+
+### Posterior-predictive checks {.unnumbered}
+
+```{r}
+#| label: fig-ppc
+#| fig-cap: "Model posterior predictive checks (PPC). Bars indicate the observed proportion of responses to each category (No, Understands, and Understands and Says). Black dots and error bars represent the mean proportion of responses simulated from the posterior for each category, and its 95\\% interval."
+#| eval: true
+#| echo: false
+#| warning: false
+#| message: false
+#| fig-width: 6
+#| fig-height: 3
+#| out-width: 80%
+#| out-height: 80%
+# The chunk label is set once, through `#| label: fig-ppc`; the chunk header
+# previously carried a conflicting `tbl-ppc` label.
+
+# Observed responses as integer category codes, as ppc_bars() expects.
+y_int <- as.integer(model_fit$data$response)
+# Grey/black colour scheme (two light levels grey, the remaining four black).
+bayesplot::color_scheme_set(c(rep("grey60", 2), rep("black", 4)))
+
+# Observed proportions (bars) against the posterior predictions in
+# `model_ppcs`, with 95% intervals.
+bayesplot::ppc_bars(
+    y_int,
+    model_ppcs,
+    prob = 0.95,
+    linewidth = 1,
+    size = 1.25,
+    fatten = 2,
+    freq = FALSE
+) +
+    scale_x_continuous(
+        breaks = 1:3,
+        labels = c(
+            "No",
+            "Understands",
+            "Understands and Says"
+        )
+    ) +
+    theme(
+        legend.position = "none",
+        axis.text = element_text(size = 11),
+        panel.grid = element_blank()
+    )
+```
diff --git a/manuscript/_freeze/manuscript/execute-results/tex.json b/manuscript/_freeze/manuscript/execute-results/tex.json
diff --git a/manuscript/_freeze/manuscript/figure-pdf/fig-marginal-1.pdf b/manuscript/_freeze/manuscript/figure-pdf/fig-marginal-1.pdf
diff --git a/manuscript/_quarto.yaml b/manuscript/_quarto.yaml
@@ -43,7 +43,7 @@ author:
         city: Lisbon, 1600-214
   - name: Ignacio Castillejo
     orcid: 0000-0001-7445-0416
-    email: [email protected]
+    email: [email protected]
     affiliations: 
       - id: uam 
         name: Universidad Autónoma de Madrid

diff --git a/manuscript/appendix.pdf b/manuscript/appendix.pdf
diff --git a/manuscript/appendix.qmd b/manuscript/appendix.qmd
@@ -30,38 +30,55 @@ format:
 library(knitr)
 library(kableExtra)
 library(ggplot2)
+library(ggdist)
+library(patchwork)
 
-targets::tar_config_set(store = here::here("_targets"),
-                        script = here::here("_targets.R"))
+targets::tar_config_set(
+    store = here::here("_targets"),
+    script = here::here("_targets.R")
+)
 
 targets::tar_load_globals()
 
-targets::tar_load(c(bvq_data, childes, items, model_summary,
-                    participants, responses))
+targets::tar_load(c(
+    bvq_data, childes, items, model_summary,
+    participants, responses
+))
+
+targets::tar_load(c(model_fit, model_convergence, model_ppcs))
 
 targets::tar_load(c(posterior_doe_summary, posterior_doe_draws))
 
-targets::tar_load(c(syllables_data, model_fit_syllables, post_syllables,
-                    posterior_syllables_summary))
+targets::tar_load(c(syllables_data, model_fit_syllables, post_syllables, posterior_syllables_summary))
 
 my_theme <- theme_minimal() +
-    theme(panel.grid = element_line(colour = "grey",
-                                    linetype = "dotted"),
-          axis.line = element_line(colour = "black"),
-          text = element_text(size = 12, colour = "black"),
-          axis.text = element_text(colour = "black"))
+    theme(
+        panel.grid = element_line(
+            colour = "grey",
+            linetype = "dotted"
+        ),
+        axis.line = element_line(colour = "black"),
+        text = element_text(size = 12, colour = "black"),
+        axis.text = element_text(colour = "black")
+    )
 
 theme_set(my_theme)
 ```
 
+## Appendix A: model diagnostics {.appendix .unnumbered}
+
+{{< include _appendix/_model-diagnostics.qmd >}}
 
-## Appendix A: frequency and language exposure as separate predictors {.appendix .unnumbered}
+{{< pagebreak >}}
+
+## Appendix B: frequency and language exposure as separate predictors {.appendix .unnumbered}
 
 {{< include _appendix/_model-doe.qmd >}}
 
 {{< pagebreak >}}
 
-## Appendix B: frequency and language exposure as separate predictors {.appendix .unnumbered}
+
+## Appendix C: syllable frequency {.appendix .unnumbered}
 
 {{< include _appendix/_syllable-frequency.qmd >}}
 

diff --git a/manuscript/manuscript.pdf b/manuscript/manuscript.pdf