forked from bnsreenu/python_for_microscopists
-
Notifications
You must be signed in to change notification settings - Fork 0
/
060-RandomForest1.py
117 lines (78 loc) · 3.79 KB
/
060-RandomForest1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#!/usr/bin/env python
__author__ = "Sreenivas Bhattiprolu"
__license__ = "Feel free to copy, I appreciate if you acknowledge Python for Microscopists"
# https://www.youtube.com/watch?v=YYjvkSJoui4
"""
@author: Sreenivas Bhattiprolu
First part fo code: Same code as logistic regression.
"""
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
#STEP 1: DATA READING AND UNDERSTANDING
df = pd.read_csv("other_files/images_analyzed_productivity1.csv")
print(df.head())
#Count productivity values to see the split between Good and Bad
sizes = df['Productivity'].value_counts(sort = 1)
print(sizes)
#plt.pie(sizes, autopct='%1.1f%%')
#Good to know so we know the proportion of each label
#STEP 2: DROP IRRELEVANT DATA
#In our example, Images_Analyzed reflects whether it is good analysis or bad
#so should not include it. ALso, User number is just a number and has no inflence
#on the productivity, so we can drop it.
df.drop(['Images_Analyzed'], axis=1, inplace=True)
df.drop(['User'], axis=1, inplace=True)
#STEP 3: Handle missing values, if needed
#df = df.dropna() #Drops all rows with at least one null value.
#STEP 4: Convert non-numeric to numeric, if needed.
#Sometimes we may have non-numeric data, for example batch name, user name, city name, etc.
#e.g. if data is in the form of YES and NO then convert to 1 and 2
df.Productivity[df.Productivity == 'Good'] = 1
df.Productivity[df.Productivity == 'Bad'] = 2
print(df.head())
#STEP 5: PREPARE THE DATA.
#Y is the data with dependent variable, this is the Productivity column
Y = df["Productivity"].values #At this point Y is an object not of type int
#Convert Y to int
Y=Y.astype('int')
#X is data with independent variables, everything except Productivity column
# Drop label column from X as you don't want that included as one of the features
X = df.drop(labels = ["Productivity"], axis=1)
#print(X.head())
#STEP 6: SPLIT THE DATA into TRAIN AND TEST data.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=20)
#random_state can be any integer and it is used as a seed to randomly split dataset.
#By doing this we work with same test dataset evry time, if this is important.
#random_state=None splits dataset randomly every time
#print(X_train)
#STEP 7: Defining the model and training.
# Import the model we are using
# Import the model we are using
#RandomForestRegressor is for regression type of problems.
#For classification we use RandomForestClassifier.
#Both yield similar results except for regressor the result is float
#and for classifier it is an integer.
#Let us use classifier since this is a classification problem
from sklearn.ensemble import RandomForestClassifier
#from sklearn.ensemble import RandomForestRegressor
# Instantiate model with 10 decision trees
model = RandomForestClassifier(n_estimators = 10, random_state = 30)
# Train the model on training data
model.fit(X_train, y_train)
#STEP 8: TESTING THE MODEL BY PREDICTING ON TEST DATA
#AND CALCULATE THE ACCURACY SCORE
prediction_test = model.predict(X_test)
#print(y_test, prediction_test)
from sklearn import metrics
#Print the prediction accuracy
print ("Accuracy = ", metrics.accuracy_score(y_test, prediction_test))
#Test accuracy for various test sizes and see how it gets better with more training data
#One amazing feature of Random forest is that it provides us info on feature importances
# Get numerical feature importances
#importances = list(model.feature_importances_)
#Let us print them into a nice format.
feature_list = list(X.columns)
feature_imp = pd.Series(model.feature_importances_,index=feature_list).sort_values(ascending=False)
print(feature_imp)