-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSimulation-based calibration.R
207 lines (162 loc) · 8.18 KB
/
Simulation-based calibration.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
source("ML functions.R")
library(rstan)
library(MCMCprecision)
library(ggplot2)
library(reshape2)
library(posterior)
rstan_options(auto_write = T)
ncores <- 4 #Set this to less than 4 if there are less than 4 cores available on machine
options(mc.cores = ncores)
set.seed(5)
##Simulation-based calibration function (used in loop further down)
mcmc_calib <- function(mu, P, C, l, st_model, nsamps, thin_period, seed) {
P_real <- P
mu_real <- mu
#Simulate data according to above parameters
distribution <- rep("Poisson", 3)
Xobs <- observedW.gen.multi(n = l, mu = mu_real, pmat = P_real,
distribution = distribution)
xmeans <- colMeans(Xobs)
alpha_P <- matrix(1, nrow = 3, ncol = 3) #Specify Dirichlet hyperparameters for each category
for (i in 1:3) {
alpha_P[i,i] <- 15
}
w_dat <- list(L = l, M = 3, w = Xobs, C = C, alpha_P = alpha_P)
mcmc.fit <- sampling(st_model, data = w_dat, iter = nsamps, thin = thin_period, seed = seed) #Fit stan model
arr.fit <- as.array(mcmc.fit)
Rh <- max(apply(arr.fit, 3, Rhat))
ess <- min(apply(arr.fit, c(2, 3), ess_basic))
posterior <- as.matrix(mcmc.fit)
return(list(samples = posterior, Rh = Rh, ess = ess, xmeans = xmeans))
}
#Setting initial variables and loading model
nsites <- 15 #Number of locations
s <- c(50, 200, 100) #Totals of each category used to generate confusion matrix
st_model <- stan_model(file = "poisson_model.stan")
#Estimating thinning period to achieve desired effective sample size
target_ess <- 2000
actual_ess <- 0
nIter <- 500
mu <- rgamma(3, shape = 1, rate = 0.001)
nmu <- length(mu)
alpha_P <- matrix(1, nrow = nmu, ncol = nmu)
for (i in 1:nmu) {
alpha_P[i,i] <- nsites
}
P <- t(sapply(1:nmu, function(i) {rdirichlet(1, alpha_P[i,])}))
C <- Cgen(s, P)
while (actual_ess < target_ess) {
posterior <- mcmc_calib(mu, P, C, nsites, st_model, nIter, thin = 1, seed = 2)$samples
npars <- dim(posterior)[2]
actual_ess <- min(sapply(1:npars, function(i) {ess_basic(posterior[,i])}))
nIter <- nIter*2 #Double number of iterations if effective sample size is too low
}
nsamps <- nIter #Number of non-burn-in samples to draw during each MCMC run
#Initialising variables for main calibration routine
#Users can ignore WARNING bits about effective sample sizes - because
#the calibration doesn't actually happen until the main loop below.
## Used nsamples = 1000 for actual run
nsamples <- 100 #Number of prior draws to make (also the number of independent MCMC runs)
thin <- ceiling(nsamps*2 / target_ess) #Keep 1 out of every 'thin' samples - multiplying nsamps by 2 in expression due to burn-in taking up the first half of all MCMC samples (stan default)
draws <- data.frame(matrix(0, nrow = nsamples, ncol = 3)) #Data frame to store rank statistics of category mean parameters (the mus) calculated from each posterior sample
names(draws) <- c("mu[1]", "mu[2]", "mu[3]")
P_draws <- data.frame(matrix(0, nrow = nsamples, ncol = 9)) #Data frame to store rank statistics of P entries calculated from each posterior sample
P_labs <- expand.grid(1:3, 1:3)
names(P_draws) <- sapply(1:9, function(x) {paste0("P[", P_labs[x,1],",", P_labs[x,2],"]")})
emp_quants <- data.frame(matrix(0, nrow = nsamples, ncol = 3)) #Data frame to store the empirical quantiles of the column means of the simulated data
names(emp_quants) <- c("mu[1]", "mu[2]", "mu[3]")
nchains <- 4 #Number of chains per MCMC run
target_chain_ess <- target_ess/nchains #Target per-chain effective sample size
stan_seed <- 5
#Main simulation-based calibration loop
# This takes a while to run
for (sample in 1:nsamples) {
#Draw parameter values from priors
mu <- rgamma(3, shape = 1, rate = 0.001)
P <- t(sapply(1:3, function(i) {rdirichlet(1, alpha_P[i,])}))
C <- Cgen(s, P)
#Run calibration
n_mixed <- 0
while (n_mixed < 1) {
fit_obj <- mcmc_calib(mu, P, C, nsites, st_model, nsamps, thin, stan_seed)
if (fit_obj$Rh < 1.05) {
n_mixed = n_mixed + 1 #Keep sample only if Rhat is acceptable
}
}
ess <- fit_obj$ess
#Run for more samples (nsamps_new) if effective sample size is too low
if (ess < target_chain_ess) {
thin_new <- ceiling(nsamps*2/ess)
nsamps_new <- target_chain_ess*thin_new
while (n_mixed < 1) {
fit_obj <- mcmc_calib(mu, P, C, nsites, st_model, nsamps_new, thin_new, stan_seed)
if (fit_obj$Rh < 1.05) { #Keep sample only if Rhat is acceptable
n_mixed = n_mixed + 1
}
}
}
posterior <- fit_obj$samples
wmeans <- fit_obj$xmeans
#Calculate rank statistics for the mus and update draws
draws[sample,] <- sapply(1:3, function(x){sum(posterior[,x] < mu[x])})
#Calculate rank statistics for P entries and update P_draws
P_draws[sample,] <- sapply(1:9, function(x){sum(posterior[,x+3] < P[as.numeric(P_labs[x,1]),as.numeric(P_labs[x,2])])})
#Calculate quantile of w mean
emp_quants[sample,] <- sapply(1:3, function(i) {mean(wmeans[i] < posterior[,i])})
}
##Quantiles of uncorrected counts in mean posteriors
#Code to create histogram of empirical quantiles of the w means in the posterior draws of the mus
quantpltdat <- melt(emp_quants)
names(quantpltdat)[2] <- "Quantile"
# Figure S2 in supporting information
emp_quant_plt <- ggplot(quantpltdat) + geom_histogram(aes(x = Quantile), binwidth = 0.05, color = "#005b96", fill = '#6497b1', linewidth = 0.25) + facet_wrap(vars(variable), nrow = 1, labeller = label_parsed) + ylab("Count")
#jpeg("empquantplot1.jpg", height=10, width=30, res=600, units="cm")
emp_quant_plt
#dev.off()
##Diagnosing uniformity
#Code to create empirical cdf plot for ranks of posterior draws of the mus
## Ignore R comment
draws_stacked <- melt(draws)
names(draws_stacked)[2] <- "Rank"
npars <- ncol(draws)
draws_prob <- data.frame(matrix(0, nrow = target_ess*npars, ncol = 2))
draws_prob[,1] <- melt(sapply(1:npars, function(x) {rep(paste0("mu[",x,"]"), target_ess)}))[,3]
draws_prob[,2] <- melt(sapply(1:npars, function(x) {sapply(1:target_ess, function(y) {length(which(draws_stacked[((x-1)*nsamples + 1):(x*nsamples),2] <= y))/nsamples})}))[,3]
true_cdf <- melt(replicate(npars, sapply(1:target_ess, function(x) {x/target_ess})))[,3]
upper_ecdf <- sapply(true_cdf, function(x) {qbinom(0.95, nsamples, x)})/nsamples
lower_ecdf <- sapply(true_cdf, function(x) {qbinom(0.05, nsamples, x)})/nsamples
row_inds <- melt(replicate(npars, seq(1, target_ess)))[,3]
draws_with_actual <- cbind(draws_prob, true_cdf, upper_ecdf, lower_ecdf, row_inds)
names(draws_with_actual) <- c("variable", "Value", "true", "upper", "lower", "Rank")
ecdf_mu_general <- ggplot(draws_with_actual, aes(x = Rank)) + geom_line(aes(y = Value), color = "black") + geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.1, fill = "green", linetype = 0) + facet_wrap(vars(variable), nrow = 1, labeller = label_parsed)
# Figure S1 in supporting information
#jpeg("sbc_mu_ecdf90_5.jpg", height=10, width=30, res=600, pointsize=1, units="cm")
ecdf_mu_general
#dev.off()
##Bias plots
set.seed(5)
nsamples <- 1000
draws <- data.frame(matrix(0, nrow = nsamples, ncol = 3))
names(draws) <- c("mu[1]", "mu[2]", "mu[3]")
alpha_P <- matrix(1, nrow = 3, ncol = 3)
for (i in 1:3) {
alpha_P[i,i] <- 15
}
for (i in 1:nsamples){
mu <- rgamma(3, shape = 1, rate = 0.001)
Pt <- sapply(1:3, function(i) {rdirichlet(1, alpha_P[i,])})
biases <- mu - Pt%*%mu
draws[i,] <- biases
}
biasdat <- melt(draws)
names(biasdat)[2] <- "Bias"
# Figure S3 in supporting information
bias_plt <- ggplot(biasdat) + geom_histogram(aes(x = Bias), binwidth = 100, color = "#005b96", fill = '#6497b1', linewidth = 0.25) + facet_wrap(vars(variable), nrow = 1, labeller = label_parsed) + ylab("Count")
#jpeg("biasplot3.jpg", height=10, width=30, res=600, units="cm")
bias_plt
#dev.off()
#Trimmed bias plot
bias_plt_tr <- ggplot(biasdat[which(abs(biasdat$Bias) < 100),]) + geom_histogram(aes(x = Bias), binwidth = 10, color = "#005b96", fill = '#6497b1', linewidth = 0.25) + facet_wrap(vars(variable), nrow = 1, labeller = label_parsed) + ylab("Count")
#jpeg("biasplottr2.jpg", height=10, width=30, res=600, units="cm")
bias_plt_tr
#dev.off()