-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmgcv_phid_code.R
408 lines (298 loc) · 14.9 KB
/
mgcv_phid_code.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
######################################################
#### ####
#### Empirical Bayesian spatial models using mgcv ####
#### ####
######################################################
## This tutorial gives a brief introduction to empirical Bayesian model-fitting using mgcv.
# We will apply spatio-temporal models to a binary dengue outbreak indicator created
# using data from 2010 - 2020 from Rio de Janeiro state. Model estimation will be
# carried out using INLA (a fully Bayesian approach) and mgcv (an EB allternative).
# We will compare the predicted probabilities and random effect estimates produced
# using these methods. The models' predictive abilities will also be compared using
# the Brier score and a receiver operator curve (ROC) analysis.
## Why empirical Bayes?
# Unlike fully Bayesian methods (e.g. INLA/MCMC), EB estimates priors from the data
# rather than specifying them in advance. This can provide more flexibility, particularly
# in spatial models where the underlying structure may not be known.
#### Load packages and data from R script ####
source("load_data_packages.R")
#### Explore data ####
## Combine case data with the shapefile to plot the data
df_shp <- full_join(df, shp,
by = "municip_code_ibge") %>%
st_as_sf()
# Plot time series of dengue incidence rate (cases per 100,000 residents) per municipality
# There is a wide range of values, with some explosive outbreaks, particularly 2013, 2015 and 2016.
ggplot(data = df) +
geom_line(aes(x = year, y = DIR, colour = municip_code_ibge,
group = municip_code_ibge)) +
scale_colour_viridis_c(name = "Municipality") +
scale_x_continuous(name = "Year", breaks = seq(2010, 2020, by = 2)) +
theme_bw()
# Let's look at DIR on a map to identify potential spatial patterns in the data
# Seem to be spatial clusters of high DIR in the northeast of the state and lower rates
# in the centre of the state. This spatial pattern should be incorporated into our model.
ggplot(data = df_shp) +
geom_sf(aes(fill = DIR), lwd = .05) +
# Produce a map per year
facet_wrap(~ year) +
# Transform onto a log scale to make patterns easier to identify
scale_fill_viridis_c(name = "DIR", trans = "log1p",
breaks = c(0, 100, 300, 1000, 5000),
direction = 1) +
# Remove coordinate grid
theme_void()
## Let's look at the socioeconomic variables to build hypotheses about potential
# driver of these outbreaks. First, let's create a df with just these variables:
df_socio <- df %>%
# Select just a single year as these are stationary across the time period
filter(year == 2010) %>%
dplyr::select(municip_code_ibge, urban10:level18_num) %>%
# Join to shapefile to plot
full_join(., shp, by = "municip_code_ibge") %>%
st_as_sf()
# % urbanisation
urban_map <- ggplot(data = df_socio) +
geom_sf(aes(fill = urban10)) +
scale_fill_distiller(name = "% urban", palette = "BuPu", direction = 1) +
theme_void()
# % piped water
water_map <- ggplot(data = df_socio) +
geom_sf(aes(fill = water_network)) +
scale_fill_distiller(name = "% piped water", palette = "BuPu", direction = 1) +
theme_void()
# % refuse collected
refuse_map <- ggplot(data = df_socio) +
geom_sf(aes(fill = total_collected)) +
scale_fill_distiller(name = "% refuse collect", palette = "BuPu", direction = 1) +
theme_void()
# Levels of influence
connect_cols <- c("#355070", "#6d597a", "#b56576", "#e56b6f", "#eaac8b")
regic_map <- ggplot(data = df_socio) +
geom_sf(aes(fill = regic18)) +
scale_fill_manual(name = "Level of \ninfluence", values = connect_cols) +
theme_void()
## In Rio these socioeconomic variables are highly correlated. To avoid multicollinearity
# we will just include urbanisation as a measure of access to basic services and influence
socio_maps <- plot_grid(urban_map, water_map, refuse_map, regic_map, nrow = 2)
#### Fit an empirical Bayesian model (using mgcv) ####
# Include urbanisation and year as linear covariates, and a 2-d
# thin plate spline applied to coordinates of the centroid of municipalities to
# produce a spatial smooth surface
gam_model <- gam(outbreak ~ urban10 + fyear +
# spatial plane (using thin-plate spline)
s(lon, lat, bs = "tp"),
family = "binomial", data = st_drop_geometry(df_shp),
# fit using restricted maximum likelihood
method = "REML")
summary(gam_model)
# Generic plot.gam function does not look good, use mgcViz package
plot(gam_model)
# Create mgcViz object
gam_viz <- getViz(gam_model)
# Plot the smoothed functions, apply exp transformation to show on the aOR scale
plot(sm(gam_viz, 1), tran = exp)
#### Fit a fully Bayesian model (in INLA) ####
## The most common spatial modelling approach in INLA assumes a Gaussian Markov random field structure
# (This means that regions are only expected to be correlated with neighbours)
# First, we must create a neighbourhood object
# (any municipalities sharing a border are considered neighbours)
nb.rj <- poly2nb(shp, row.names = shp$municip_index)
# Attach index to nb object to allow matching between this and the df
names(nb.rj) <- attr(nb.rj, "region.id")
# Plot the neighbourhood object (lines indicate municipalities are connected)
nb_lines <- as(nb2lines(nb.rj, coords = st_centroid(shp)$geom), 'sf')
ggplot() +
geom_sf(data = shp) +
geom_sf(data = nb_lines)+
theme_void()
# Convert into an INLA nb object
nb_inla <- nb2INLA("rj.graph", nb.rj)
# As INLA is a fully Bayesian approach, we can specify priors before model fitting
# We will use penalised complexity (PC) priors as these allow us to set meaningful,
# interpretable priors
# Prior for phi (proportion of variance explained by the spatial structure)
phi.prior = list(prior = "pc",
param = c(0.5, 2/3)) # P(phi < 0.5) = 2/3
# Prior for random effect precision
prec.prior = list(prior = "pc.prec",
param = c(1, 0.01)) # P(st. dev > 1) = 0.01
# We are assuming a BYM2 spatial structure in our data, with the structured element
# defined by our neighbourhood matrix.
inla_model <- inla(outbreak ~ urban10 + fyear +
# spatial random effect
f(municip_index, model = "bym2",
graph = "rj.graph",
# prior specification
hyper = list(prec = prec.prior,
phi = phi.prior)),
data = st_drop_geometry(df_shp), family = "binomial",
# as we will be sampling from the posterior, include the following
# to ensure these results are returned
control.compute = list(config = T))
summary(inla_model)
#### Compare estimated probabilities ####
# We can obtain predicted probabilities from the INLA model object, however for the
# GAM, we have to explicitly sample from the posterior distributions to obtain the
# predicted response.
## Step 1: Estimate the regression coefficients (betas):
# Function to obtain MVN random variates
rmvn <- function(n, mu, sig) {
L <- mroot(sig); m <- ncol(L);
t(mu + L%*%matrix(rnorm(m*n),m,n))
}
# Specify number of simulations
n.sims <- 1000
# Simulate from beta posterior distributions
# Mean = coefficient estimates
# Variance = covariance matrix of the predictors (model$Vp)
betas_gam <- rmvn(n.sims, coef(gam_model), gam_model$Vp)
## Step 2: Extract linear predictors from the model
model_mat <- predict(object = gam_model,
# Feed through data used to fit the model
newdata = gam_model$model,
# Return a matrix of linear predictors
type = "lpmatrix")
## Step 3: Multiple the simulated betas with linear predictors to produce an estimate
# per municipality per year (per simulation)
# Apply probit transformation to convert this onto the probability scale:
# g(prob) = log(prob / (1 - prob)) = beta0 + beta1 + f(year) + f(space)
# prob = exp[g(prob)] / 1 + exp[g(prob)]
mean_prob_gam <- exp(model_mat %*% t(betas_gam)) /
(1 + exp(model_mat %*% t(betas_gam)))
# Step 4: Take the mean of the 1000 simulations to return the predicted probability
pred_prob_gam <- apply(mean_prob_gam, 1, mean)
#### Compare predicted probabilities ####
# Combine predictions from INLA and mgcv into the same df
prob_pred <- data.table(municip_code_ibge = df$municip_code_ibge,
year = df$year,
# Keep observed outbreak variable
outbreak_obs = df$outbreak,
# Return predicted probability from the INLA model
pred_inla = inla_model$summary.fitted.values$mean,
# Predicted probability of an outbreak from the GAM model
pred_gam = pred_prob_gam)
# Plot predicted probabilities for INLA and GAM against each other
# Both models produce similar estimates, randomly scattered around the y = x line
ggplot(data = prob_pred) +
geom_point(aes(x = pred_gam, y = pred_inla)) +
# Add reference line (if models predictions were equal)
geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
theme_bw()
#### Compare spatial random effects ####
# As with predicted probabilities, we can obtain spatial random effects from the INLA
# model object, however mgcv requires simulation from the posterior distribution.
## INLA
# Extract mean of posterior for spatial random effects (the first 92 rows relate to
# the spatially structured random effect, whereas the next 92 are a combination of
# the structured and unstructured with the phi parameter)
inla_spat <- inla_model$summary.random$municip_index[(nrow(shp)+1):(nrow(shp)*2), 1:2] %>%
# Transform onto odds scale for easier interpretation
transmute(municip_index = ID - 92,
spat_re = exp(mean))
## mgcv
# Follow the same process as before but just selecting beta and linear predictors
# asociated with the spatial smooth
betas_gam_spat <- betas_gam[ ,substr(names(coef(gam_model)), 1, 5) == "s(lon"]
model_mat_spat <- model_mat[ ,substr(names(coef(gam_model)), 1, 5) == "s(lon"]
# Estimate spatial random effects + transform from log scale
spat_est_gam <- exp(model_mat_spat %*% t(betas_gam_spat))
# As all years are equal, just keep one of them (2010)
spat_est_gam <- spat_est_gam[gam_model$model$fyear == 2010, ]
# Take average of simulations
mean_spat_gam <- apply(spat_est_gam, 1, mean)
# Join random effects to dataset, include INLA estimates
spat_re <- data.table(municip_code_ibge = df[df$year == 2010,]$municip_code_ibge,
spat_re_gam = mean_spat_gam,
spat_re_inla = inla_spat$spat_re) %>%
left_join(., shp, by = "municip_code_ibge") %>%
st_as_sf()
## Plot random effects on a map to compare
# Return max random effect to set legend to same limits
max_re <- max(c(spat_re$spat_re_gam, spat_re$spat_re_inla))
# INLA spatial effects
inla_spat_map <- ggplot(data = spat_re) +
geom_sf(aes(fill = spat_re_inla), lwd = .05) +
scale_fill_viridis_c(name = "Spatial random \neffect") +
expand_limits(fill = c(0, max_re)) +
theme_void()
# GAM spatial effects
gam_spat_map <- ggplot(data = spat_re) +
geom_sf(aes(fill = spat_re_gam), lwd = .05) +
scale_fill_viridis_c(name = "Spatial random \neffect") +
expand_limits(fill = c(0, max_re)) +
theme_void()
# Obtain legend (to plot maps on the same grid)
spat_re_leg <- get_legend(inla_spat_map)
# Combine map onto the same grid
spat_re_maps <- plot_grid(inla_spat_map + theme(legend.position = "none"),
gam_spat_map + theme(legend.position = "none"),
labels = c("INLA", "GAM"))
# Add the legend
spat_re_maps <- plot_grid(spat_re_maps, spat_re_leg, rel_widths = c(4, .3))
# There appears to be a similar pattern in the random effects, however the GAM
# is smoother
## Compare the spatial random effect estimates using a scatterplot
# Generally similar results, however INLA estimates tend to be higher for larger
# random effects - potentially a result of the smoothness of the GAM model
ggplot(data = spat_re) +
geom_point(aes(x = spat_re_gam, y = spat_re_inla)) +
# Add a reference line showing equality
geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
labs(x = "GAM", y = "INLA") +
theme_bw()
#### Compare Brier score ####
## Calculate the Brier score (mean squared difference) for each model.
# The smaller the score, the better the model is doing of predicting the outcome.
brier_scores <- prob_pred %>%
mutate(sqdiff_inla = (pred_inla - outbreak_obs)^2,
sqdiff_gam = (pred_gam - outbreak_obs)^2) %>%
group_by(municip_code_ibge) %>%
summarise(brier_inla = sum(sqdiff_inla),
brier_gam = sum(sqdiff_gam)) %>%
ungroup() %>%
mutate(brier_diff = brier_inla - brier_gam) %>%
full_join(., shp, by = "municip_code_ibge") %>%
st_as_sf()
# Plot these Brier scores on a map
max_brier <- max(c(brier_scores$brier_inla, brier_scores$brier_gam))
brier_inla <- ggplot(data = brier_scores) +
geom_sf(aes(fill = brier_inla), lwd = .05) +
scale_fill_viridis_c(name = "Brier score") +
expand_limits(fill = c(0, max_brier)) +
theme_void()
brier_gam <- ggplot(data = brier_scores) +
geom_sf(aes(fill = brier_gam), lwd = .05) +
scale_fill_viridis_c(name = "Brier score") +
expand_limits(fill = c(0, max_brier)) +
theme_void()
# Plot maps on the same grid for easier comparison
brier_comp <- plot_grid(brier_inla + theme(legend.position = "none"),
brier_gam + theme(legend.position = "bottom"),
labels = c("INLA", "GAM"))
# Plot the difference in Brier scores per municipality
# In general, INLA performs slightly better
brier_diff <- ggplot(data = brier_scores) +
geom_sf(aes(fill = brier_diff), lwd = .05) +
scale_fill_viridis_c(name = "Brier score") +
theme_void()
#### Compare ROC curves ####
roc_obj_inla <- roc(prob_pred$outbreak_obs,
prob_pred$pred_inla,
auc = T, ci = T, plot = T)
roc_obj_gam <- roc(prob_pred$outbreak_obs,
prob_pred$pred_gam,
auc = T, ci = T, plot = T)
roc_obj <- data.table(roc_inla_sens = roc_obj_inla$sensitivities,
roc_inla_spec = 1 - roc_obj_inla$specificities,
roc_gam_sens = roc_obj_gam$sensitivities,
roc_gam_spec = 1 - roc_obj_gam$specificities)
roc_curve <- ggplot(data = roc_obj) +
# Add a line per threshold
geom_line(aes(x = roc_inla_spec, y = roc_inla_sens)) +
geom_line(aes(x = roc_gam_spec, y = roc_gam_sens),
linetype = "dashed", col = "red") +
# Add reference line (y = x line represents chance)
geom_abline(intercept = 0, slope = 1, linetype = "dashed") +
labs(x = "True negative rate", y = "True positive rate") +
theme_light()