Skip to content


nested k-fold cross-validation with grid search, see #285 & allow kwargs in CLI & refactoring
Browse files Browse the repository at this point in the history
  • Loading branch information
Marco Fossati committed May 2, 2019
1 parent 14c990c commit b03970e
Showing 1 changed file with 134 additions and 44 deletions.
178 changes: 134 additions & 44 deletions soweego/linker/
Original file line number Diff line number Diff line change
Expand Up @@ -9,55 +9,112 @@
__license__ = 'GPL-3.0'
__copyright__ = 'Copyleft 2019, Hjfocs'

import json
import logging
import os
from collections import defaultdict

import click
import recordlinkage as rl
from numpy import mean, std
from pandas import concat
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.externals import joblib
from sklearn.model_selection import (GridSearchCV, StratifiedKFold,

from soweego.commons import constants, target_database
from soweego.commons import constants, target_database, utils
from soweego.linker import train, workflow

LOGGER = logging.getLogger(__name__)

@click.command(context_settings={'ignore_unknown_options': True, 'allow_extra_args': True})
@click.argument('classifier', type=click.Choice(constants.CLASSIFIERS))
@click.argument('target', type=click.Choice(target_database.available_targets()))
@click.argument('target_type', type=click.Choice(target_database.available_types()))
@click.option('--nested', is_flag=True, help='Compute a nested cross-validation with hyperparameters tuning via grid search.')
@click.option('--single', is_flag=True, help='Compute a single evaluation over all k folds, instead of k evaluations.')
@click.option('-k', '--k-folds', default=5, help="Number of folds, default: 5.")
@click.option('-b', '--binarize', default=0.1, help="Default: 0.1.")
@click.option('-m', '--metric', type=click.Choice(constants.PERFORMANCE_METRICS),
help="Performance metric for nested cross-validation. Implies '--nested'. Default: F1.")
@click.option('-d', '--dir-io', type=click.Path(file_okay=False), default=constants.SHARED_FOLDER, help="Input/output directory, default: '%s'." % constants.SHARED_FOLDER)
def cli(classifier, target, target_type, single, k_folds, binarize, dir_io):
def cli(ctx, classifier, target, target_type, nested, single, k_folds, metric, dir_io):
"""Evaluate the performance of a probabilistic linker."""
if not single:
predictions, p_mean, p_std, r_mean, r_std, fscore_mean, fscore_std = average_k_fold(
constants.CLASSIFIERS[classifier], target, target_type, binarize, dir_io, k=k_folds)'Precision: mean = %s; std = %s; Recall: mean = %s; std = %s; F-score: mean = %s; std = %s',
p_mean, p_std, r_mean, r_std, fscore_mean, fscore_std)
kwargs = utils.handle_extra_cli_args(ctx.args)
if kwargs is None:
return 1

performance_fileout = os.path.join(dir_io, constants.LINKER_PERFORMANCE %
(target, target_type, classifier))
predictions_fileout = os.path.join(dir_io, constants.LINKER_EVALUATION_PREDICTIONS %
(target, target_type, classifier))

if nested:
'You have opted for the slowest evaluation option, please be patient ...')
'Starting nested %d-fold cross validation with hyperparameters tuning via grid search ...', k_folds)

clf = constants.CLASSIFIERS[classifier]
param_grid = constants.PARAMETER_GRIDS[clf]
result = nested_k_fold_with_grid_search(
clf, param_grid, target, target_type, k_folds, metric, dir_io, **kwargs)'Evaluation done: %s', result)

# Persist best models
for k, model in enumerate(result.pop('best_models'), 1):
model_fileout = os.path.join(dir_io, constants.LINKER_NESTED_CV_BEST_MODEL % (
target, target_type, classifier, k), )

joblib.dump(model, model_fileout)"Best model for fold %d dumped to '%s'",
k, model_fileout)

performance_fileout = performance_fileout.replace('txt', 'json')
with open(performance_fileout, 'w') as fout:
json.dump(result, fout, indent=2)"%s performance dumped to '%s'",
metric, performance_fileout)
return 0

if single:'Starting single evaluation over %d folds ...', k_folds)

predictions.to_series().to_csv(os.path.join(dir_io, constants.LINKER_EVALUATION_PREDICTIONS %
(target, target_type, classifier)), columns=[], header=True)

with open(os.path.join(dir_io, constants.LINKER_PERFORMANCE % (target, target_type, classifier)), 'w') as fileout:
f'Precision:\n\tmean = {p_mean}\n\tstandard deviation = {p_std}\nRecall:\n\tmean = {r_mean}\n\tstandard deviation = {r_std}\nF-score:\n\tmean = {fscore_mean}\n\tstandard deviation = {fscore_std}\n')

predictions, (precision, recall, fscore, confusion_matrix) = single_k_fold(
constants.CLASSIFIERS[classifier], target, target_type, binarize, dir_io, k=k_folds)
constants.CLASSIFIERS[classifier], target, target_type, k_folds, dir_io, **kwargs)

predictions.to_series().to_csv(os.path.join(dir_io, constants.LINKER_EVALUATION_PREDICTIONS %
(target, target_type, classifier)), columns=[], header=True)
with open(os.path.join(dir_io, constants.LINKER_PERFORMANCE % (target, target_type, classifier)), 'w') as fileout:
fileout.write('Evaluation done')

with open(performance_fileout, 'w') as fout:
f'Precision: {precision}\nRecall: {recall}\nF-score: {fscore}\nConfusion matrix:\n{confusion_matrix}\n')"Predictions dumped to '%s', Performance dumped to '%s'",
predictions_fileout, performance_fileout)
return 0

# Default: average evaluation over k-fold'Starting average evaluation over %d folds ...', k_folds)

predictions, p_mean, p_std, r_mean, r_std, fscore_mean, fscore_std = average_k_fold(
constants.CLASSIFIERS[classifier], target, target_type, k_folds, dir_io, **kwargs)'Evaluation done. Precision: mean = %s; std = %s; Recall: mean = %s; std = %s; F-score: mean = %s; std = %s',
p_mean, p_std, r_mean, r_std, fscore_mean, fscore_std)

with open(performance_fileout, 'w') as fout:
f'Precision:\n\tmean = {p_mean}\n\tstandard deviation = {p_std}\nRecall:\n\tmean = {r_mean}\n\tstandard deviation = {r_std}\nF-score:\n\tmean = {fscore_mean}\n\tstandard deviation = {fscore_std}\n')"Predictions dumped to '%s', Performance dumped to '%s'",
predictions_fileout, performance_fileout)

def _compute_performance(test_index, predictions, test_vectors_size):'Running performance evaluation ...')
Expand All @@ -75,58 +132,91 @@ def _compute_performance(test_index, predictions, test_vectors_size):
return precision, recall, fscore, confusion_matrix

def average_k_fold(classifier, catalog, entity, binarize, dir_io, k=5):
def nested_k_fold_with_grid_search(classifier, param_grid, catalog, entity, k, scoring, dir_io, **kwargs):
if classifier is constants.SINGLE_LAYER_PERCEPTRON:
# TODO make Keras work with GridSearchCV
raise NotImplementedError()

result = defaultdict(list)

dataset, positive_samples_index = train.build_dataset(
'training', catalog, entity, dir_io)
model = workflow.init_model(classifier, **kwargs).kernel

inner_k_fold, target = utils.prepare_stratified_k_fold(
k, dataset, positive_samples_index)
outer_k_fold = StratifiedKFold(n_splits=k, shuffle=True, random_state=1269)
grid_search = GridSearchCV(
model, param_grid, scoring=scoring, n_jobs=-1, cv=inner_k_fold, verbose=2)
dataset = dataset.to_numpy()

for train_index, test_index in outer_k_fold.split(dataset, target):
# Run grid search[train_index], target[train_index])
# Grid search best score is the train score
# Let grid search compute the test score
test_score = grid_search.score(dataset[test_index], target[test_index])
best_model = grid_search.best_estimator_

return result

def average_k_fold(classifier, catalog, entity, k, dir_io, **kwargs):
predictions, precisions, recalls, fscores = None, [], [], []
dataset, positive_samples_index = train.build_dataset(
'training', catalog, entity, dir_io)
k_fold = StratifiedKFold(n_splits=k, shuffle=True)
# scikit's stratified k-fold no longer supports multi-label data representation.
# It expects a binary array instead, so build it based on the positive samples index
binary_target_variables =
lambda x: 1 if x in positive_samples_index else 0)
k_fold, binary_target_variables = utils.prepare_stratified_k_fold(
k, dataset, positive_samples_index)

if classifier is constants.SINGLE_LAYER_PERCEPTRON:
model = workflow.init_model(classifier, dataset.shape[1], **kwargs)
model = workflow.init_model(classifier, **kwargs)

for train_index, test_index in k_fold.split(dataset, binary_target_variables):
training, test = dataset.iloc[train_index], dataset.iloc[test_index]

model = workflow.init_model(classifier, binarize, training.shape[1])
model = workflow.init_model(classifier, **kwargs), positive_samples_index & training.index)

preds = model.predict(test)

p, r, f, _ = _compute_performance(
positive_samples_index & test.index, preds, len(test))

if predictions is None:
predictions = preds
predictions |= preds


return predictions, mean(precisions), std(precisions), mean(recalls), std(recalls), mean(fscores), std(fscores)

def single_k_fold(classifier, catalog, entity, binarize, dir_io, k=5):
def single_k_fold(classifier, catalog, entity, k, dir_io, **kwargs):
predictions, test_set = None, []
dataset, positive_samples_index = train.build_dataset(
'training', catalog, entity, dir_io)
k_fold, binary_target_variables = utils.prepare_stratified_k_fold(
k, dataset, positive_samples_index)

k_fold = StratifiedKFold(n_splits=k, shuffle=True)

# scikit's stratified k-fold no longer supports multi-label data representation.
# It expects a binary array instead, so build it based on the positive samples index
binary_target_variables =
lambda x: 1 if x in positive_samples_index else 0)
if classifier is constants.SINGLE_LAYER_PERCEPTRON:
model = workflow.init_model(classifier, dataset.shape[1], **kwargs)
model = workflow.init_model(classifier, **kwargs)

for train_index, test_index in k_fold.split(dataset, binary_target_variables):

training, test = dataset.iloc[train_index], dataset.iloc[test_index]

model = workflow.init_model(classifier, binarize)

model = workflow.init_model(classifier, **kwargs), positive_samples_index & training.index)

if predictions is None:
Expand All @@ -138,7 +228,7 @@ def single_k_fold(classifier, catalog, entity, binarize, dir_io, k=5):
return predictions, _compute_performance(positive_samples_index & test_set.index, predictions, len(test_set))

def _random_split(wd_chunk, target_chunk):
def random_split(wd_chunk, target_chunk):
wd_train, wd_test = train_test_split(wd_chunk, test_size=0.33)
target_train, target_test = train_test_split(
target_chunk, test_size=0.33)
Expand Down

0 comments on commit b03970e

Please sign in to comment.