##==== Logistic Regression ====##
# Logistic regression predicts a binary outcome (classification).
# Let's try to predict whether an individual earns more than $50K, using logistic regression on the demographic variables available in the adult dataset.
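# The fitted model estimates P(Y = 1 | X) through the logistic (sigmoid) link:
#   P(Y = 1 | X) = 1 / (1 + exp(-(b0 + b1*x1 + ... + bk*xk)))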
inputData <- read.csv("http://rstatistics.net/wp-content/uploads/2015/09/adult.csv")
head(inputData)
## Check class bias
table(inputData$ABOVE50K)
# There is a class bias: the proportion of events (1's) is much smaller than the proportion of non-events (0's).
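# A quick way to see the imbalance as proportions (a small check, not in the original script):
prop.table(table(inputData$ABOVE50K))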
# So, downsample so that the training set contains equal numbers of both classes.
# Create Training/Test sets
input_ones <- inputData[which(inputData$ABOVE50K==1),]
input_zeros <- inputData[which(inputData$ABOVE50K==0),]
set.seed(0623)
input_ones_training_rows <- sample(1:nrow(input_ones), 0.7*nrow(input_ones))
input_zeros_training_rows <- sample(1:nrow(input_zeros), 0.7*nrow(input_ones)) # 0's for training. Pick as many 0's as 1's
training_ones <- input_ones[input_ones_training_rows, ]
training_zeros <- input_zeros[input_zeros_training_rows, ]
trainingData <- rbind(training_ones, training_zeros) # row bind the 1's and 0's
test_ones <- input_ones[-input_ones_training_rows, ]
test_zeros <- input_zeros[-input_zeros_training_rows, ]
testData <- rbind(test_ones, test_zeros) # row bind the 1's and 0's
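# Sanity check (optional): the training set should now be balanced, while the test set keeps the original skew.
table(trainingData$ABOVE50K)
table(testData$ABOVE50K)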
##++++ Optional ++++ ##
# Compute Information Values (IV), a measure of each variable's predictive strength.
# The smbinning::smbinning function bins a continuous variable into categories using recursive partitioning and reports its IV.
library(smbinning)
# segregate continuous and factor variables
factor_vars <- c("WORKCLASS", "EDUCATION", "MARITALSTATUS", "OCCUPATION", "RELATIONSHIP", "RACE", "SEX", "NATIVECOUNTRY")
continuous_vars <- c("AGE", "FNLWGT","EDUCATIONNUM", "HOURSPERWEEK", "CAPITALGAIN", "CAPITALLOSS")
iv_df <- data.frame(VARS=c(factor_vars, continuous_vars), IV=numeric(length(factor_vars) + length(continuous_vars))) # init for IV results
# compute IV for categoricals
for(factor_var in factor_vars){
smb <- smbinning.factor(trainingData, y="ABOVE50K", x=factor_var) # WOE table
if(class(smb) != "character"){ # check if an error occurred (smbinning returns a character message on failure)
iv_df[iv_df$VARS == factor_var, "IV"] <- smb$iv
}
}
# compute IV for continuous vars
for(continuous_var in continuous_vars){
smb <- smbinning(trainingData, y="ABOVE50K", x=continuous_var) # WOE table
if(class(smb) != "character"){ # check for any error while calculating scores
iv_df[iv_df$VARS == continuous_var, "IV"] <- smb$iv
}
}
iv_df <- iv_df[order(-iv_df$IV), ] # sort
iv_df
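# A common rule of thumb for reading IV (a credit-scoring heuristic, not part of the original script):
#   IV < 0.02 not predictive | 0.02-0.1 weak | 0.1-0.3 medium | > 0.3 strong
iv_df[iv_df$IV > 0.3, ] # candidate strong predictors under that heuristic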
##++++ Optional ++++ ##
# Build Logistic Regression Models and Predict
logitMod <- glm(ABOVE50K ~ RELATIONSHIP + AGE + CAPITALGAIN + OCCUPATION + EDUCATIONNUM, data=trainingData, family=binomial)
predicted <- predict(logitMod, testData) # predicted scores in log-odds
new_predicted <- plogis(predicted)
head(new_predicted) # inspect the first few predicted probabilities
# The glm() procedure with family=binomial builds the logistic regression model for the given formula.
# By default, predict() on this model returns the log(odds) of the Y variable.
# This is not what we ultimately want, because those values are not bounded between 0 and 1 as probabilities should be.
# So, to convert them into probability scores bounded between 0 and 1, we apply plogis().
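# Equivalently (a quick consistency check, not in the original script), predict() can
# return probabilities directly on the response scale; the two routes should agree:
prob_direct <- predict(logitMod, testData, type = "response")
all.equal(as.numeric(new_predicted), as.numeric(prob_direct)) # expect TRUE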
## Model Diagnostics
library(InformationValue)
optCutOff <- optimalCutoff(testData$ABOVE50K, new_predicted)[1] # optimalCutoff() expects probability scores, not log-odds
summary(logitMod)
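# Coefficients are on the log-odds scale; exponentiating gives odds ratios, which are
# often easier to interpret (a small addition, not in the original script):
exp(coef(logitMod))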
car::vif(logitMod) # variance inflation factors (vif() comes from the car package); should be less than 4
# Misclassification Error
misClassError(testData$ABOVE50K, new_predicted, threshold = optCutOff)
# Misclassification error is the percentage mismatch of predicted vs. actuals, irrespective of 1's or 0's. The lower the misclassification error, the better your model.
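# A few more diagnostics available in the InformationValue package (a sketch; uses the
# same actuals and predicted probabilities as above):
plotROC(testData$ABOVE50K, new_predicted)     # ROC curve with the AUROC in the title
Concordance(testData$ABOVE50K, new_predicted) # share of concordant vs. discordant pairs
confusionMatrix(testData$ABOVE50K, new_predicted, threshold = optCutOff)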