Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Cross Validation Added #407

Open
wants to merge 13 commits into
base: master
Choose a base branch
from
24 changes: 21 additions & 3 deletions gramex/handlers/mlhandler.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@
from slugify import slugify
from tornado.gen import coroutine
from tornado.web import HTTPError
from sklearn.metrics import get_scorer
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.model_selection import cross_val_predict, cross_val_score
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This line appears twice.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This extra line is unnecessary.


op = os.path
MLCLASS_MODULES = [
Expand All @@ -39,7 +42,7 @@
'pipeline': True,
'nums': [],
'cats': [],
'target_col': None,
'target_col': None
}
ACTIONS = ['predict', 'score', 'append', 'train', 'retrain']
DEFAULT_TEMPLATE = op.join(op.dirname(__file__), '..', 'apps', 'mlhandler', 'template.html')
Expand Down Expand Up @@ -103,7 +106,6 @@ def setup(cls, data=None, model={}, config_dir='', **kwargs):

cls.set_opt('class', model.get('class'))
cls.set_opt('params', model.get('params', {}))

if op.exists(cls.model_path): # If the pkl exists, load it
cls.model = joblib.load(cls.model_path)
elif data is not None:
Expand All @@ -112,14 +114,26 @@ def setup(cls, data=None, model={}, config_dir='', **kwargs):
data = cls._filtercols(data)
data = cls._filterrows(data)
cls.model = cls._assemble_pipeline(data, mclass=mclass, params=params)

# train the model
target = data[target_col]
train = data[[c for c in data if c != target_col]]
# cross validation
mod = cls.modelFunction()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not required. The model is already present as cls.model, see line no: 116.

CVscore = cross_val_score(mod, train, target)
CV = sum(CVscore)/len(CVscore)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. Use CVscore.mean()
  2. Variable naming has to follow a specified style - do pip install flake8 and run the flake8 command against this file, i.e. flake8 mlhandler.py, and check the output.

print('CV score: ', CV)
gramex.service.threadpool.submit(
_fit, cls.model, train, target, cls.model_path, cls.name)
cls.config_store.flush()

@classmethod
def modelFunction(cls, mclass=''):
    """Instantiate the estimator described by the stored model config.

    Loads the ``model`` entry from ``cls.config_store`` and, if it names a
    model class, resolves that class via ``search_modelclass`` and
    instantiates it with the stored ``params``.

    Parameters
    ----------
    mclass : str, optional
        Ignored — the value is unconditionally overwritten by the
        ``'class'`` key loaded from the config store. Kept only for
        backward compatibility with existing callers.

    Returns
    -------
    object or None
        The instantiated model, or ``None`` when no ``'class'`` is
        configured. (The original fell through with ``model`` unbound in
        that case, raising ``UnboundLocalError``.)
    """
    model_kwargs = cls.config_store.load('model', {})
    # The parameter is clobbered here; the stored config always wins.
    mclass = model_kwargs.get('class', False)
    if not mclass:
        # Explicitly signal "no model configured" instead of crashing.
        return None
    return search_modelclass(mclass)(**model_kwargs.get('params', {}))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function is not required.


@classmethod
def load_data(cls, default=pd.DataFrame()):
try:
Expand Down Expand Up @@ -268,6 +282,10 @@ def _predict(self, data=None, score_col=''):
self.model = cache.open(self.model_path, joblib.load)
try:
target = data.pop(score_col)
metric = self.get_argument('_metric', False)
if metric:
scorer = get_scorer(metric)
return scorer(self.model, data, target)
return self.model.score(data, target)
except KeyError:
# Set data in the same order as the transformer requests
Expand Down
5 changes: 5 additions & 0 deletions tests/test_mlhandler.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,11 @@ def test_get_bulk_score(self):
data=self.df.to_json(orient='records'),
headers={'Content-Type': 'application/json'})
self.assertGreaterEqual(resp.json()['score'], self.ACC_TOL)
resp = self.get(
'/mlhandler?_action=score&_metric=f1_weighted', method='post',
data=self.df.to_json(orient='records'),
headers={'Content-Type': 'application/json'})
self.assertGreaterEqual(resp.json()['score'], self.ACC_TOL)

def test_get_cache(self):
df = pd.DataFrame.from_records(self.get('/mlhandler?_cache=true').json())
Expand Down