-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathClassify.r
120 lines (91 loc) · 2.79 KB
/
Classify.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
library(xgboost)
library(Matrix)
# Seed R's random number generator so repeated runs start from the same state.
set.seed(1340)

# Read the training and test tables from the sibling input directory.
train <- read.csv(file = "../input/train.csv")
test <- read.csv(file = "../input/test.csv")
cat('Length: ', nrow(train), sep = " ")

# Set aside the test identifiers and the training label first ...
test.id <- test$ID
train.y <- train$TARGET

# ... then remove those non-feature columns from the frames.
train[["ID"]] <- NULL
test[["ID"]] <- NULL
train[["TARGET"]] <- NULL
##### 0 count per line
# Count the entries of `x` that are equal to zero.
#   x: a vector comparable with 0.
# Returns a single integer count. Because sum() is called without na.rm,
# the result is NA when any element of `x` is NA.
count0 <- function(x) {
  is_zero <- x == 0
  sum(is_zero)
}
# Add `n0`, the number of zero-valued cells in each row, as an extra feature.
# `df == 0` compares column by column and rowSums() counts in one vectorised
# pass. The previous apply(df, 1, ...) first coerced the whole data frame to a
# matrix and then invoked an R function once per row.
# as.integer() keeps the column an unnamed integer vector, as before.
train$n0 <- as.integer(rowSums(train == 0))
test$n0 <- as.integer(rowSums(test == 0))
##### Removing constant features
cat("\n## Removing the constants features.\n")
# Flag every training column that contains exactly one distinct value.
is_constant <- vapply(
  train,
  function(column) length(unique(column)) == 1,
  logical(1)
)
constant_cols <- names(train)[is_constant]
# Constancy is decided on train only; the same columns are then dropped from
# both frames so they keep matching layouts.
for (f in constant_cols) {
  train[[f]] <- NULL
  test[[f]] <- NULL
}
##### Removing identical features
# Walk the column pairs in column order and mark the later column of a pair
# for removal when it holds the same values as an earlier column that is still
# kept. Pairs are generated by two nested index loops, so nothing is
# materialised up front and a frame with fewer than two columns is simply a
# no-op.
all_cols <- names(train)
toRemove <- character(0)
for (i in seq_along(all_cols)) {
  f1 <- all_cols[i]
  if (f1 %in% toRemove) {
    next
  }
  col1 <- train[[f1]]
  # Indices strictly after i; empty when i is the last column.
  for (j in seq_len(length(all_cols) - i) + i) {
    f2 <- all_cols[j]
    if (f2 %in% toRemove) {
      next
    }
    col2 <- train[[f2]]
    # NA-aware equality: missing values must sit in the same positions and
    # every other element must compare equal. A bare all(col1 == col2) can
    # evaluate to NA when a column contains NA, and `if (NA)` is an error.
    if (all(is.na(col1) == is.na(col2)) && all(col1 == col2, na.rm = TRUE)) {
      # cat(f1, "and", f2, "are equals.\n")
      toRemove <- c(toRemove, f2)
    }
  }
}
# Surviving feature names, in their original column order.
feature.names <- setdiff(names(train), toRemove)
# Apply the same natural-log transform to var38 in both sets.
train$var38 <- log(train$var38)
test$var38 <- log(test$var38)
# Keep only the de-duplicated features. drop = FALSE guarantees the result is
# still a data frame even when a single feature survives; without it the
# subset would collapse to a bare vector and the column-wise code that
# follows would no longer see any column names.
train <- train[, feature.names, drop = FALSE]
test <- test[, feature.names, drop = FALSE]
# Copy of the test features taken before clipping; it is not read again in
# this script.
tc <- test
#---limit vars in test based on min and max vals of train
print('Setting min-max lims on test data')
for (f in colnames(train)) {
  # Smallest and largest value seen in training for this feature.
  bounds <- range(train[[f]], na.rm = TRUE)
  # Clamp the test column into [min, max] in one vectorised step. Unlike
  # assigning through a logical subscript, pmin/pmax leave NA entries as NA
  # instead of failing on them.
  test[[f]] <- pmin(pmax(test[[f]], bounds[1]), bounds[2])
}
#---
# Re-attach the label so the formula interface can build the sparse design
# matrix, then wrap it (with the labels) in an xgboost DMatrix.
train$TARGET <- train.y
train <- sparse.model.matrix(TARGET ~ ., data = train)
dtrain <- xgb.DMatrix(data = train, label = train.y)
# Only the training set is monitored, so the AUC printed during training is
# an in-sample figure.
watchlist <- list(train = dtrain)
param <- list(
  objective = "binary:logistic",
  booster = "gbtree",
  eval_metric = "auc",
  eta = 0.0202048,
  max_depth = 5,
  subsample = 0.6815,
  colsample_bytree = 0.701
)
clf <- xgb.train(
  params = param,
  data = dtrain,
  nrounds = 560,
  verbose = 1,
  watchlist = watchlist,
  # AUC is a higher-is-better metric. No early stopping is configured here, so
  # this flag is currently inert, but it must be TRUE for early stopping to
  # pick the best round if it is ever enabled.
  maximize = TRUE
)
#######actual variables
feature.names
# The formula needs a response column, so give test a placeholder TARGET and
# build its sparse design matrix with the same formula used for training.
test[["TARGET"]] <- -1
test <- sparse.model.matrix(TARGET ~ ., data = test)
# `preds` holds the scores for the test set; `pred` holds the scores for the
# training matrix itself (used for the in-sample AUC).
preds <- predict(object = clf, newdata = test)
pred <- predict(object = clf, newdata = train)
# Area under the ROC curve for a set of predictions.
#   actual:    true class labels; coerced with as.numeric().
#   predicted: predicted scores; coerced with as.numeric().
# Returns the value produced by pROC::auc(response, predictor).
AUC <- function(actual, predicted) {
  # Namespace-qualified call: the function no longer runs library() on every
  # invocation, so calling it does not attach pROC to the search path as a
  # side effect, and no local variable shadows the `auc` function.
  pROC::auc(as.numeric(actual), as.numeric(predicted))
}
# In-sample AUC: `pred` was computed on the training matrix.
AUC(train.y, pred)
# One row per test ID paired with its predicted score.
submission <- data.frame(ID = test.id, TARGET = preds)
cat("saving the submission file\n")
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned elsewhere in a session.
write.csv(submission, "submission.csv", row.names = FALSE)