-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtrain.py
67 lines (38 loc) · 1.67 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# Import dependencies
import pickle
from pathlib import Path

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
# Training script: reads the CSV at `input_data`, fits a
# DictVectorizer + RandomForestClassifier pipeline on the training split,
# prints the validation AUC, and pickles the fitted pipeline to `output_model`.

# --- Configuration ---
input_data = './datasets/Churn_Modelling.csv'
output_model = './models/pipeline.bin'

# --- Load data ---
print(f'Reading data from {input_data}...')
df = pd.read_csv(input_data)
# Drop columns that are used neither as features nor as the target below.
df = df.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1)

categorical = ['Geography', 'Gender']
numerical = [
    'CreditScore',
    'Age',
    'Tenure',
    'Balance',
    'NumOfProducts',
    'HasCrCard',
    'IsActiveMember',
    'EstimatedSalary',
]
# Column order fed to the vectorizer; built once and reused for every split.
features = categorical + numerical

# --- Split ---
# 70% / 30% split into df_train_all and df_test.
# NOTE: df_test is held out but never evaluated in this script; the split is
# kept so that the rows that end up in df_train / df_val stay the same.
df_train_all, df_test = train_test_split(df, test_size=0.3, random_state=0)
# Second 70% / 30% split of the remainder into training and validation sets.
df_train, df_val = train_test_split(df_train_all, test_size=0.3, random_state=0)

# Target column `Exited`, cast to int arrays.
y_train = df_train.Exited.astype(int).values
y_val = df_val.Exited.astype(int).values

# --- Features ---
# DictVectorizer consumes one dict per row (feature name -> value).
train_dicts = df_train[features].to_dict(orient='records')
val_dicts = df_val[features].to_dict(orient='records')

# --- Model ---
# The vectorizer and the classifier are fitted together so the saved artifact
# accepts raw feature dicts directly.
# NOTE: the module-level name `Pipeline` is kept unchanged for compatibility.
Pipeline = make_pipeline(
    DictVectorizer(),
    RandomForestClassifier(n_estimators=100, random_state=0),
)
Pipeline.fit(train_dicts, y_train)

# --- Evaluate on the validation split ---
# Column 1 of predict_proba is the probability of the positive class.
y_pred = Pipeline.predict_proba(val_dicts)[:, 1]
auc = roc_auc_score(y_val, y_pred)
print(f'AUC value: {auc:0.3f}')

# --- Persist ---
print(f'Saving model into {output_model}...')
# Create the target directory first so a missing folder does not throw away
# the model that was just trained.
Path(output_model).parent.mkdir(parents=True, exist_ok=True)
with open(output_model, 'wb') as f_out:
    pickle.dump(Pipeline, f_out)