-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.R
539 lines (455 loc) · 21.9 KB
/
main.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
#---------------------------------------------------------------------------------------#
# modif: modify a dataset based on pointed variable, transformation type and parameters #
# trial: update a model based on pointed variables, transformation type and parameters #
# recom: test transformation and provide recommendation for trans method and parameters #
# rebuild: rebuild a model based on parameter testing history from last modeling result #
# warn: pop up warning messages if sign of coef changes or p-value of a predictor > 0.2 #
#---------------------------------------------------------------------------------------#
modif <- function(pred, type, data, co.r=NULL, sc.1=NULL, sc.2=NULL, pc.r=NULL, object=NULL, obj.name=obj.name){
#---------------------------------
# transform specified variable with(out) parameters
# this function returns **a dataset** with selected variable transformed
# make sure the raw data is already loaded into global environment
#---------------------------------
# pred: the name of variable to be transormed of class "character"
# type: transformation method:
# 0: no transformation
# 1.1~1.2 media: carryover + s-curve / carryover + power curve
# 2.1.1~2.1.4 basic: +,-,*,/; corresponding to parameter "object"
# 2.2 basic: logarithm
# 2.3 basic: root
# 2.4 basic: exponent
# 2.5 basic: reciprocal
# 2.6 basic: time lag
# the input for type should be strictly controlled to the list of option number above!!!
# data: data frame to store the variable transformed (original var. will be covered)
#---------------------------------
trans.collection <- c("co","sc","pc","bt")
if(!sum(sapply(trans.collection, existsFunction))==4){
# check if the functions co(), sc(), pc() & bt() exist or not, if not, source "odds.R"
if(file.exists("odds.R")){
source("odds.R")
}else{
source("https://raw.githubusercontent.com/elliott828/boulot-test/master/odds.R")
}
}
df <- data
x <- data[[pred]]
if(as.character(type) == "1.1"){
df[[pred]] <- cs(data[[pred]], co.r, sc.1, sc.2)
}else if(as.character(type) == "1.2"){
df[[pred]] <- cp(data[[pred]], co.r, pc.r)
}else if(as.character(type) == "0"){
df[[pred]] <- data[[pred]]
}else{
opt <- as.numeric(substr(as.character(type),3,nchar(as.character(type))))
# turn 2.1.1~2.1.4 and 2.2~2.6 to 1.1~1.4 and 2~6
i.colname <- colnames(df)
df <- cbind(df, bt(data[[pred]], opt, object))
full.opt <- c("2.1.1","2.1.2","2.1.3","2.1.4","2.2","2.3","2.4","2.5","2.6")
opt <- letters[which(full.opt == as.character(type))]
object.bis <- if(length(object)>1){obj.name}else{gsub("-", "_", object)}
pred.name <<- switch(opt,
a = paste(pred, ".plus.", object.bis, sep = ""),
b = paste(pred, ".mnus.", object.bis, sep = ""),
c = paste(pred, ".mult.", object.bis, sep = ""),
d = paste(pred, ".divi.", object.bis, sep = ""),
e = paste(pred, ".loga.", sep = ""),
f = paste(pred, ".pore.", object.bis, ")", sep = ""),
g = paste(pred, ".powr.", object.bis, sep = ""),
h = paste(pred, ".reci.", sep =""),
i = paste(pred, ".tlag.", object.bis, sep = ""))
colnames(df) <- c(i.colname, pred.name)
message('The transformed variable called "', pred.name, '" has been added to the dataset!', sep = "")
cat("\n")
}
return(df)
}
#----------------------------------------------------------------------------------
trial <- function(data, resp, fit = NULL, action = 2, pred = NULL) {
# Build the model
# based on NULL or fit1, add/remove a predictor/the intercept, output summary
# action: -1 means delete the Intercept
# 1 means add the Intercept
# 2 means add a predictor
# -2 means delete a predictor
if(action == -1 | action == 1) pred <- "(Intercept)"
if(is.null(fit)){ # if(exists("fit", mode = "list"))
if(action == 1) {
fit.new <- lm(as.formula(sprintf('%s ~ 1', resp)), data = data, na.action = na.exclude)
} else if(action == 2) {
fit.new <- lm(as.formula(sprintf('%s ~ %s + 0', resp, pred)), data = data, na.action = na.exclude)
} else if(action == -1) {
fit.new <- lm(as.formula(sprintf('%s ~ -1', resp)), data = data, na.action = na.exclude)
} else { # if(action == -2)
message("There's no existed model to let you delete ", pred, " from!", sep = "")
cat("\n")
fit.new <- fit
}
} else { #if(!is.null(fit))
if(pred %in% names(coef(fit))) {
if(action == -1) {
fit.new <- update(fit, ~. -1, data = data)
} else if(action == -2) {
fit.new <- update(fit, as.formula(sprintf('~. - %s', pred)), data = data)
} else {
message("The ", pred, " is already in the model!", sep = "")
cat("\n")
fit.new <- fit
}
} else {
if(action == 1){
fit.new <- update(fit, ~. + 1, data = data)
} else if(action == 2){
fit.new <- update(fit, as.formula(sprintf('~. + %s', pred)), data = data)
} else {
message("The ", pred, " isn't in the model!", sep = "")
cat("\n")
fit.new <- fit
}
}
}
return(fit.new)
}
#----------------------------------------------------------------------------------
recom <- function(pred, resp, data, type, fit = NULL, object = NULL, obj.name, st.row){
#---------------------------------
# pred: predictor to be inserted to the model, to be quoted. i.e.: "cly" (mtcars)
# resp: response variable in the model, to be quoted. i.e.: "mpg" (mtcars)
# data: data base for the model
# type: transformation method for the predictor
# fit: default value is NULL; if a model exists, then put the name of fit list
#---------------------------------
# When type = 1.1: test all combinations of parameters of carryover & s-curve
# When type = 1.2: test all combinations of parameters of carryover & power curve
# When type = 1.3: do both of the actions above
# When type >= 2: call bt()
# Check the best R-square / p-value of coef, give recommendations
#---------------------------------
#----------------------#
# PART I - Preparation #
#----------------------#
# Check if the function trial() & the package exists
if(!existsFunction("bt") == T){
if(file.exists("odds.R")){
source("odds.R")
}else{
source("https://raw.githubusercontent.com/elliott828/boulot-test/master/odds.R")
}
}
if(!"plyr" %in% installed.packages()){
install.packages("plyr")
require(plyr)
}
#---------------------------------
# Load the data
df0 <- data
df1 <- df0[st.row:dim(df0)[1], ]
x0 <- df0[[pred]]
y0 <- df0[[resp]]
x1 <- x0[st.row:length(x0)]
y1 <- y0[st.row:length(y0)]
# The default parameter ranges are as below:
# (To be modified on demand)
co.range <- seq(0.05, 0.95, 0.05) #
lamda1.range <- seq(0.0001, 0.0007, 0.0001)
lamda2.range <- seq(1.1, 1.7, 0.1)
pc.range <- seq(0.4, 0.95, 0.05)
#---------------------------------#
# PART II - Fundamental functions #
#---------------------------------#
# Function 1.1
#---------------------------------
# Set sub functions (cs.trans & cp.trans) for generating predictor matrix
# (transformed by different group of parameters)
cstrans <- function(x){
cs.mat <- t(mdply(expand.grid(co.rate = co.range,
lamda1 = lamda1.range,
lamda2 = lamda2.range),
cs, x))
return(cs.mat)
}
# Function 1.2
#---------------------------------
cptrans <- function(x){
cp.mat <- t(mdply(expand.grid(co.rate = co.range,
exponent = pc.range),
cp, x))
return(cp.mat)
}
# Function 2
#---------------------------------
# Check if a fit exists, update the fit if it does, otherwise create one
fit.update <- function(resp, x, fit.check = NULL, pred, data){
df <- data
df[[pred]] <- x
fit.new <- if (is.null(fit.check)){
lm(as.formula(sprintf("%s ~ %s", resp, pred)), data = df)
}else{
update(fit.check, as.formula(sprintf("~.+ %s", pred)), data = df)
}
return(fit.new)
}
# Function 3
#---------------------------------
# Extract the key modeling summary statatistics
# Call Function 2: fit.update()
summary.stats <- function(resp, x, fit.coef = fit, pred, data){
smr <- summary(fit.update(resp, x, fit.coef, pred, data))
smr.coef <- subset(smr$coefficients, rownames(smr$coefficients)==pred)[c(1,4)]
smr.key <- c(smr.coef, smr$r.squared, smr$adj.r.squared)
names(smr.key) <- c("Coefficient", "P.value", "R.squared", "Adj.R.squared")
# to generate a group of key statistics, how many variables corresponding to how many groups
return(t(smr.key))
}
# Function 4
#---------------------------------
# Test all the combinations of parameters & the respective modeling result
testall <- function(resp, x, pred, fit.coef = fit, type, data){
opt <- LETTERS[as.numeric(substr(as.character(type),3,3))]
#both <- function(x)list(cstrans(x),cptrans(x))
mat <- switch(opt,
A = cstrans(x),
B = cptrans(x))
len <- switch(opt,
A = 3,
B = 2)
nam <- switch(opt,
A = "cs",
B = "cp")
met <- switch(opt,
A = "Carryover + S-curve",
B = "Carryover + Power curve")
prmt <- mat[1:len,] # capture the parameter combinations
var0 <- as.data.frame(mat[(len+1):nrow(mat),]) # capture the transfored variables
#colnames(var) <- paste(pred, 1:ncol(var), sep = "")
var <- var0[st.row:dim(var0)[1], ]
test.stats <- t(sapply(var, summary.stats, resp = resp, pred = pred, fit.coef = fit.coef, data = data))
# coef <- test.stats[1:2]
# rsq <- test.stats[3]
# adj.rsq <- test.stats[4]
prmt.all <- as.data.frame(cbind(t(prmt), test.stats))
prmt.all <- prmt.all[complete.cases(prmt.all), ]
if(nrow(prmt.all) == 0){
op.recom <- as.data.frame(matrix(1, nrow = 1, ncol = 9))
message("This variable cannot get into the model!\n")
message("Please switch another variable!\n")
} else {
curve.prmt <- if(len == 2){"exponent"}else{c("lamda1","lamda2")}
colnames(prmt.all) <- c("carryover.rate",curve.prmt,"coef","p-value","r.squared","adjusted.r.squared")
rownames(prmt.all) <- NULL
# export all parameters and related model statistics to local working directory
write.csv(prmt.all, paste("prmt", nam, pred, "csv",sep = "."))
cat("\n")
message(paste("The parameter reference: 'prmt", nam, pred, "csv' is exported!",sep = "."))
# capture the best group of parameters
# get the best adj. r squared first then allocate the position
best <- max(as.numeric(prmt.all[, ncol(prmt.all)]))
posi <- which(round(prmt.all$adjusted.r.squared,6)==round(best,6))
if (length(posi)>1)posi <- sample(posi,1)
best.stats <- prmt.all[posi,]
rownames(best.stats) <- NULL # remove the row index assigned automatically by the program
# print(paste("the size of prmt.all is ", paste(dim(prmt.all),collapse=" and "),sep=""))
# print(paste("the value of best is ", best,sep=""))
# print(paste("the best row number is ", which(round(prmt.all$adjusted.r.squared,6)==round(best,6)), sep =""))
# print(paste("the size of best.stats is ", paste(dim(best.stats),collapse=" and "),sep=""))
# print the message indicating the best transformation parameters and results
cat("",
paste(" For the transformation method ", met, ":", sep=""),
paste(" ", paste(rep("-",40), collapse = ""),sep = ""),
paste(" - The recommended carryover rate is ", best.stats[1], sep=""),
sep = "\n")
if(as.character(type)=="1.1"){
cat(paste(" - The recommended lamda1(S-curve) is ", best.stats[2], sep = ""),
paste(" - The recommended lamda2(S-curve) is ", best.stats[3], sep = ""),
sep = "\n")
curve.prmt <- c(best.stats[1], best.stats[2], best.stats[3], NA)
}else if(as.character(type)=="1.2"){
cat(paste(" - The recommended power rate is ", best.stats[2], sep = ""),
sep = "\n")
curve.prmt <- c(best.stats[1], NA, NA, best.stats[2])
}
cat(paste(" ", paste(rep("-",40), collapse = ""),sep = ""),"\n")
cat(paste(" - The coefficient of ", pred, " in this model is ", round(best.stats[ncol(prmt.all)-3],4)),
paste(" - The p-value of the coefficient is ", as.numeric(format(best.stats[ncol(prmt.all)-2],scientific=T))),
paste(" - The r-squared of the model is ", round(best.stats[ncol(prmt.all)-1],4)),
paste(" - The adjusted r-squared of the model is ", round(best.stats[ncol(prmt.all)],4)),
paste(" ", paste(rep("-",40), collapse = ""),sep = ""),"", sep = "\n")
if(is.na(best.stats[ncol(prmt.all)-2])){
message("This variable is not advised to be added to the model!\n")
}else if (best.stats[ncol(prmt.all)-2] > 0.2){
message("Please be aware that the p-value of predictor coefficient is larger than 0.2!")
message("The estimate of coefficient is not significant!")
cat("\n")
}
# return(list(prmt.all, best.stats)) return the variable transformed by all combination of parameters
op.recom <- c(type, best.stats[ncol(prmt.all)-3], best.stats[ncol(prmt.all)-2],
best.stats[ncol(prmt.all)-1], best.stats[ncol(prmt.all)], curve.prmt)
names(op.recom) <- c("Trans.Type","Coef","P.Value","R.Sq","Adj.R.Sq","Carryover.Rate",
"Lamda1","Lamda2","Power.Rate")
}
return(op.recom)
# write.csv(var, paste("transformation",pred,"csv",sep="."))
}
# Function 5
#---------------------------------
# Test other transformation methods (or no transformation)
# Here the transformation type could be:
# - 2.1.1~2.1.4 & 2.2~2.6: basic transformation
# - 0: no transformation
testoth <- function(resp, x, pred, fit.coef = fit, type, data, object = object, obj.name = obj.name){
if (as.character(type) == "0"){
x.new <- x
}else{
type.new <- as.numeric(substr(as.character(type),3,nchar(as.character(type))))
x.new <- bt(x, type.new, object)
}
fit.new <- fit.update(resp = resp, x = x.new, fit.check = fit.coef, pred = pred, data = data)
full.opt <- c("0","2.1.1","2.1.2","2.1.3","2.1.4","2.2","2.3","2.4","2.5","2.6")
opt <- letters[which(full.opt == as.character(type))]
object.bis <- if(length(object)>1){obj.name}else{object}
trans.opt <- switch(opt,
a = "No transformation",
b = paste(pred, " plus ", object.bis, sep = ""),
c = paste(pred, " minus ", object.bis, sep = ""),
d = paste(pred, " times ", object.bis, sep = ""),
e = paste(pred, " divided by ", object.bis, sep = ""),
f = paste("Logarithm on ", pred, sep = ""),
g = paste(pred, " to the power of 1/", object.bis, sep = ""),
h = paste(pred, " to the power of ", object.bis, sep = ""),
i = paste("Reciprocal of ", pred, sep =""),
j = paste("Time lag for ", pred, " by ", object.bis, " time unit(s)", sep = ""))
coef.new <- c(summary(fit.new)$coefficients[length(coef(fit.new)),1],summary(fit.new)$coefficients[length(coef(fit.new)),4],
summary(fit.new)$r.squared, summary(fit.new)$adj.r.squared)
names(coef.new) <- c("coef", "p-value", "r.squared", "adjusted.r.squared")
cat(
paste("You choose to do: ", trans.opt, sep = ""),
paste(paste(rep("-",40),collapse = ""), sep = ""),
paste("The coefficient of ", pred, " in this model is ", round(coef.new[1],4), sep = ""),
paste("The p-value of the coefficient is ", as.numeric(format(coef.new[2],scientific=T)), sep = ""),
paste("The r-squared of the model is ", round(coef.new[3],4), sep = ""),
paste("The adjusted r-squared of the model is ", round(coef.new[4],4), sep = ""),
"", sep = "\n")
# return(list(coef.new, c(object, type)))
op.recom <- c(type, coef.new[1], coef.new[2], coef.new[3], coef.new[4],
rep(NA, 4))
names(op.recom) <- c("Trans.Type","Coef","P.Value","R.Sq","Adj.R.Sq","Carryover.Rate",
"Lamda1","Lamda2","Power.Rate")
return(op.recom)
}
#----------------------------------#
# PART III - Fundamental functions #
#----------------------------------#
if(as.character(type) == "1.1"){
# carryover + s-curve only
prmt.rec <- testall(resp, x0, pred, fit.coef = fit, type = 1.1, data = df1)
}else if(as.character(type) == "1.2"){
# carryover + power curve only
prmt.rec <- testall(resp, x0, pred, fit.coef = fit, type = 1.2, data = df1)
}else if(as.character(type) == "1.3"){
# compary carryover + s-cu4ve and carryover + power curve
prmt.cs <- testall(resp, x0, pred, fit.coef = fit, type = 1.1, data = df1)
prmt.cp <- testall(resp, x0, pred, fit.coef = fit, type = 1.2, data = df1)
if(is.na(prmt.cs[2])|is.na(prmt.cs[2])){
prmt.rec <- as.list(rep(NA,9))
message("There is no recommendation!\n")
}else if(as.numeric(prmt.cs[5]) > as.numeric(prmt.cp[5])){
prmt.rec <- prmt.cs
message("Concerning r-squared, the method **CARRY-OVER + S-CURVE** is preferred.")
cat("\n")
}else if(as.numeric(prmt.cs[5]) < as.numeric(prmt.cp[5])){
prmt.rec <- prmt.cp
message("Concerning r-squared, the method **CARRY-OVER + POWER CURVE** is preferred.")
cat("\n")
}else if(as.numeric(prmt.cs[5]) == as.numeric(prmt.cp[5])){
prmt.rec <- prmt.cp
message("Both transformation methods are OK for the model.")
cat("\n")
}
}else{
# modeling result of other transformation
# caliberate the option number for type > 2
prmt.rec <- testoth(resp = resp, x1, pred = pred, fit.coef = fit, type = type, data = df1, object = object, obj.name = obj.name)
}
return(prmt.rec)
}
#----------------------------------------------------------------------------------
rebuild <- function(resp, data, prmt.name, st.row) {
# resp and data is already in the global environment
# data is raw without modification
# source(modif)
# need one step to confirm the model, then
# fit <- fit.temp; df <- df.temp
df.history0 <<- data
prmt.history <<- as.data.frame(read.csv(paste(getwd(), "/", prmt.name, sep = "")),
stringsAsFactors = F)[,-1]
prmt.alive <<- prmt.history[which(prmt.history[[8]] == "alive"),]
for(i in 1:nrow(prmt.alive)) {
pred <- as.character(prmt.alive[[1]][i])
type <- prmt.alive[[2]][i]
co.r <- prmt.alive[[3]][i]
sc.1 <- prmt.alive[[4]][i]
sc.2 <- prmt.alive[[5]][i]
pc.r <- prmt.alive[[6]][i]
# object
obj.pre <- prmt.alive[[7]][i]
if(obj.pre %in% c("min", "max", "mean")){
if(obj.pre == "min"){
object <- min(data[[pred]])
} else if(obj.pre == "max"){
object <- max(data[[pred]])
} else if(obj.pre == "mean"){
object <- mean(data[[pred]])
}
} else if(obj.pre %in% names(data)) {
object <- data[[obj.pre]]
} else {
object <- as.numeric(obj.pre)
}
df.history0 <<- modif(pred, type, df.history0, co.r, sc.1, sc.2, pc.r, object)
# type: character
}
df.history <<- df.history0[st.row:dim(df.history0)[1], ]
fit.history <<- lm(as.formula(paste(paste(c(resp, paste(prmt.alive[[9]], collapse = " + ")), collapse = " ~ "), " - 1", sep = "")),
data = df.history, na.action = na.exclude)
}
#----------------------------------------------------------------------------------
warn <- function(fit1, fit2, p.cons = 0.2) {
# 1. warn user when there is some coefficient which changes **sign** between two models
# 2. warn user when there is big gap between **p-value** of one pred in two models
if(!is.null(names(coef(fit2)))) {
coef2 <- coef(fit2)
name2 <- names(coef2)
p.value2 <- coef(summary(fit2))[, 4]
if(sum(p.value2 > p.cons) > 0){
message("P-value of the following predictor",
if(sum(p.value2 > p.cons) > 1) "s are" else " is",
" larger than ", as.character(p.cons), sep = "")
cat(paste(names(p.value2)[p.value2 > p.cons], collapse = ", "), "", sep = "\n")
cat(paste(rep("-", 40), collapse = ""), "", sep = "\n")
}
if(!is.null(names(coef(fit1)))){
coef1 <- coef(fit1)
name1 <- names(coef1)
p.value1 <- coef(summary(fit1))[, 4]
name <- intersect(name1, name2)
sign1 <- sign(coef1[which(name1 %in% name)])
sign1 <- sign1[order(names(sign1))]
sign2 <- sign(coef2[which(name2 %in% name)])
sign2 <- sign2[order(names(sign2))]
if(any(sign1 * sign2 == -1)) {
message("Sign of the following predictor's coefficient has changed!", sep = "")
cat(paste(names(sign1)[sign1 * sign2 == -1], collapse = ", "), "", sep = "\n")
cat(paste(rep("-", 40), collapse = ""), "", sep = "\n")
}
}
}
} # end function warn()
#----------------------------------------------------------------------------------
# modif: created 11/12/2014 by Elliott
# trial: created 11/12/2014 by Katherine
# recom: created 11/13/2014 and modified 11/17/2014 by Elliott
# rebuild: created 11/12/2014 by Katherine
# updated 12/19/2014: change fit.history into with no intercept to avoid annoyed message
# warn: created 11/17/2014 by Katherine
#----------------------------------------------------------------------------------