-
Notifications
You must be signed in to change notification settings - Fork 0
/
2_model_data_processing.R
196 lines (146 loc) · 6.85 KB
/
2_model_data_processing.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
#John Spaw
#Create data for input into JAGS
#Combining data from 2010 census, 2016 CCES survey, and 2016 IAT Responses
library(mrpdata) #loading census data
library(reshape) #combine_factor
library(dplyr)
#### DATA LOAD ####
#Load state level IAT data
state_IAT <- readRDS("Data/state_IAT.rds")
state_IAT <- subset(state_IAT, state_IAT$state_code != "DC") #remove DC
#Load CCES survey data
load("Data/CCES16_Common_OUTPUT_Feb2018_VV.RData") #loads as "x"
CCES.complete <- x
rm("x") #remove initial load to save memory
#Load census datasets for other US applications of MRP:
data(mrp.census) # census with common demo strata
mrp.census <- na.omit(mrp.census)
##State code/name key table
state_codes <- read.csv("Data/state_codes.csv")
############################
### Clean data - make variables consistent ###
# Need to match strata between CCES poll, census population, and State IAT data
# Convert variables to factors with identical levels
# Include: Education, Gender, Race, State
###Education
education_levels <- c("No High School", "High School", "Some College", "College Graduate", "Post Graduate")
##CCES
#Cleaning
CCES.complete$educ[CCES.complete$educ == "2-year"] <- "Some college" #combine some college and 2 - year
CCES.complete <- subset(CCES.complete, CCES.complete$educ != "Skipped" & CCES.complete$educ != "Not Asked") #remove NAs
#Combine and rename levels
CCES.complete$education <- combine_factor(CCES.complete$educ, c(1,2,3,3,4,5,1,1))
levels(CCES.complete$education) <- education_levels
##Census
#No cleaning necessary... just rename levels
levels(mrp.census$education) <- education_levels
###Race
race_levels <- c("White", "Black", "Hispanic", "Other")
##CCES
#Combine and rename levels
CCES.complete$race <- combine_factor(CCES.complete$race, c(1,2,3,4,4,4,4,4,4,4))
levels(CCES.complete$race) <- race_levels
##Census
#Rename levels
levels(mrp.census$race) <- race_levels
## To ensure matching of strata between poll and population,
## both should be factors with identical names and levels.
CCES.complete$sex <- CCES.complete$gender
CCES.complete <- within (CCES.complete, {
female <- factor(sex=="Female", labels=c("Male","Female"), exclude=NA)
race <- factor(race,exclude=NA)
f.race <- interaction(female,race)
})
## Poll has four levels of education, so we need to combine
## the top two levels in the census data. We'll also go ahead
## and trim it down to just the variables used here.
mrp.census <- within(mrp.census,{
state <- factor(state,exclude=NA)
race <- factor(race,exclude=NA)
f.race <- interaction(sex,race)
})
###State
CCES.complete$inputstate <- as.character(CCES.complete$inputstate)
state_codes$State <- as.character(state_codes$State)
state_codes$Abbreviation <- as.character(state_codes$Abbreviation)
##CCES
CCES.complete$State <- factor(CCES.complete$inputstate)
##Census
#change data type for merge
mrp.census <- left_join(mrp.census, state_codes, by = c("state" = "Abbreviation"))
mrp.census$State <- factor(mrp.census$State)
##IAT
state_IAT <- left_join(state_IAT, state_codes, by = c("state_code" = "Abbreviation"))
state_IAT$State <- factor(state_IAT$State)
##Remove District of Columbia from data
mrp.census <- subset(mrp.census, mrp.census$State != "District of Columbia")
CCES.complete <- subset(CCES.complete, CCES.complete$State != "District of Columbia")
#Remove District of Columbia from levels
mrp.census$State <- factor(mrp.census$State)
CCES.complete$State <- factor(CCES.complete$State)
###Model responses
#Help dreamers
CCES.complete$issue_help_dreamer <- as.factor(CCES.complete$CC16_331_3)
CCES.complete$issue_help_dreamer <- combine_factor(CCES.complete$issue_help_dreamer, c(1,2,1,1))
levels(CCES.complete$issue_help_dreamer) <- c("Help Dreamers", "Don't Help Dreamers")
table(CCES.complete$issue_help_dreamer)
#Deport undocumented immigrants
CCES.complete$issue_deport_undocumented <- as.factor(CCES.complete$CC16_331_7)
CCES.complete$issue_deport_undocumented <- combine_factor(CCES.complete$issue_deport_undocumented, c(1,2,1,1))
levels(CCES.complete$issue_deport_undocumented) <- c("Deport Undocumented Immigrants", "Don't Deport Undocumented Immigrants")
table(CCES.complete$issue_deport_undocumented)
#######Check that all levels match
levels(mrp.census$State) == levels(CCES.complete$State) & levels(mrp.census$State) == levels(state_IAT$State)
levels(mrp.census$f.race) == levels(CCES.complete$f.race)
levels(mrp.census$education) == levels(CCES.complete$education)
############################
### CREATE JAGS DATA ###
#individual level formula and model frame
mrp.formula <- issue_help_dreamer ~ f.race + education
mrp.frame <- model.frame(mrp.formula, strata=State, data=CCES.complete)
#input data for jags model as list
data <- list()
data$y <- model.response(mrp.frame) #response
data$x <- model.matrix(mrp.frame, mrp.frame) #design matrix
data$state <- model.extract(mrp.frame, "strata") #states as strata
data$num_x <- ncol(data$x) #predictor columns
data$num_y <- length(data$y) #number of individual level responses
predictors <- c("State", attr(terms(mrp.frame), "term.labels"))
data.census <- aggregate(list(weights=mrp.census$weighted2008),
as.list(mrp.census[,predictors]),
sum)
#data for post-stratification adjustments
data.census <- reshape(data.census,
varying=list(State=levels(data$state)),
v.names="weights",
timevar="State",
times=levels(data$state),
idvar=setdiff(predictors,"State"),
direction="wide")
data$pred_x <- model.matrix(update(mrp.formula, NULL ~ .), data.census)
data$census_count <- as.matrix(data.census[,levels(data$state)]) #counts for each factor combination level by state
data$num_cells <- nrow(data$census_count) #number of factor combination levels
data$num_state <- ncol(data$census_count) #number of states
#Additional state level data - election data (dem vote share, south in civil war indicator), IAT
library(pscl)
#Only include 2012 election results and exclude DC
pres2012 <- subset(presidentialElections, year==2012 & state != "DC", select=c("state","demVote","south"))
#add state level IAT data
pres2012$iat_prop <- state_IAT$IAT_proportion
#add state level data to jags data list
data$state_x <- t(as.matrix(cbind(Intercept=1,
pres2012[match(levels(data$state),pres2012$state), c("demVote", "south","iat_prop")]
)
)
)
#number of state level predictors
data$num_state_x <- nrow(data$state_x)
#Set model priors
data$gamma_mean <- rep(0, nrow(data$state_x))
data$gamma_prec <- diag(0.01, data$num_state_x, data$num_state_x)
data$tau_k <- data$num_x + 1
data$tau_R <- diag(1/data$tau_k, data$num_x, data$num_x)
#export
saveRDS(data, "Data/model_data.rds")
#clean environment
rm(list=ls())