From cf27323a056a57b02a39765f6fbf5433fea82976 Mon Sep 17 00:00:00 2001 From: Francisco Rivera Valverde <44504424+franchuterivera@users.noreply.github.com> Date: Tue, 16 Feb 2021 07:25:42 +0100 Subject: [PATCH] Calculate loss support (#1075) * Calculate loss support * Relaxed log loss test for individual models * Feedback from #1075 * Missing loss in comment * Revert back test as well --- autosklearn/ensemble_builder.py | 220 ++++++++++--------- autosklearn/ensembles/ensemble_selection.py | 32 ++- autosklearn/evaluation/abstract_evaluator.py | 40 ++-- autosklearn/evaluation/test_evaluator.py | 17 +- autosklearn/metrics/__init__.py | 73 +++++- test/test_automl/test_automl.py | 5 +- test/test_ensemble_builder/test_ensemble.py | 84 +++---- test/test_metric/test_metrics.py | 70 +++++- 8 files changed, 329 insertions(+), 212 deletions(-) diff --git a/autosklearn/ensemble_builder.py b/autosklearn/ensemble_builder.py index 56db171197..84e19ee397 100644 --- a/autosklearn/ensemble_builder.py +++ b/autosklearn/ensemble_builder.py @@ -27,7 +27,7 @@ from autosklearn.util.backend import Backend from autosklearn.constants import BINARY_CLASSIFICATION -from autosklearn.metrics import calculate_score, Scorer +from autosklearn.metrics import calculate_score, calculate_loss, Scorer from autosklearn.ensembles.ensemble_selection import EnsembleSelection from autosklearn.ensembles.abstract_ensemble import AbstractEnsemble from autosklearn.util.logging_ import get_named_client_logger @@ -76,7 +76,7 @@ def __init__( task_type: int type of ML task metric: str - name of metric to score predictions + name of metric to compute the loss of the given predictions ensemble_size: int maximal size of ensemble (passed to autosklearn.ensemble.ensemble_selection) ensemble_nbest: int/float @@ -294,7 +294,7 @@ def fit_and_return_ensemble( dataset_name: str name of dataset metric: str - name of metric to score predictions + name of metric to compute the loss of the given predictions task_type: int type of ML task ensemble_size: int @@ -399,7 +399,7 @@ def __init__( task_type: int type of ML task metric: str - name of metric to score predictions + name of metric to compute the loss of the given predictions ensemble_size: int maximal size of ensemble (passed to autosklearn.ensemble.ensemble_selection) ensemble_nbest: int/float @@ -498,14 +498,14 @@ def __init__( # already read prediction files # {"file name": { - # "ens_score": float + # "ens_loss": float # "mtime_ens": str, # "mtime_valid": str, # "mtime_test": str, # "seed": int, # "num_run": int, # }} - self.read_scores = {} + self.read_losses = {} # {"file_name": { # Y_ENSEMBLE: np.ndarray # Y_VALID: np.ndarray @@ -516,7 +516,7 @@ def __init__( # Depending on the dataset dimensions, # regenerating every iteration, the predictions - # scores for self.read_preds + # losses for self.read_preds # is too computationally expensive # As the ensemble builder is stateless # (every time the ensemble builder gets resources @@ -539,17 +539,17 @@ def __init__( traceback.format_exc(), ) ) - self.ensemble_score_file = os.path.join( + self.ensemble_loss_file = os.path.join( self.backend.internals_directory, - 'ensemble_read_scores.pkl' + 'ensemble_read_losses.pkl' ) - if os.path.exists(self.ensemble_score_file): + if os.path.exists(self.ensemble_loss_file): try: - with (open(self.ensemble_score_file, "rb")) as memory: - self.read_scores = pickle.load(memory) + with (open(self.ensemble_loss_file, "rb")) as memory: + self.read_losses = pickle.load(memory) except Exception as e: 
self.logger.warning( - "Could not load the previous iterations of ensemble_builder scores." + "Could not load the previous iterations of ensemble_builder losses." "This might impact the quality of the run. Exception={} {}".format( e, traceback.format_exc(), @@ -677,8 +677,8 @@ def main(self, time_left, iteration, return_predictions): time_left - used_time, ) - # populates self.read_preds and self.read_scores - if not self.score_ensemble_preds(): + # populates self.read_preds and self.read_losses + if not self.compute_loss_per_model(): if return_predictions: return self.ensemble_history, self.ensemble_nbest, train_pred, valid_pred, test_pred else: @@ -745,9 +745,9 @@ def main(self, time_left, iteration, return_predictions): if self.max_resident_models is not None: self._delete_excess_models(selected_keys=candidate_models) - # Save the read scores status for the next iteration - with open(self.ensemble_score_file, "wb") as memory: - pickle.dump(self.read_scores, memory) + # Save the read losses status for the next iteration + with open(self.ensemble_loss_file, "wb") as memory: + pickle.dump(self.read_losses, memory) if ensemble is not None: train_pred = self.predict(set_="train", @@ -808,10 +808,10 @@ def get_disk_consumption(self, pred_path): # get the megabytes return round(this_model_cost / math.pow(1024, 2), 2) - def score_ensemble_preds(self): + def compute_loss_per_model(self): """ - score predictions on ensemble building data set; - populates self.read_preds and self.read_scores + Compute the loss of the predictions on ensemble building data set; + populates self.read_preds and self.read_losses """ self.logger.debug("Read ensemble data set predictions") @@ -865,9 +865,9 @@ def score_ensemble_preds(self): self.logger.info('Error loading file (not .npy or .npy.gz): %s', y_ens_fn) continue - if not self.read_scores.get(y_ens_fn): - self.read_scores[y_ens_fn] = { - "ens_score": -np.inf, + if not self.read_losses.get(y_ens_fn): + self.read_losses[y_ens_fn] = { + "ens_loss": np.inf, "mtime_ens": 0, "mtime_valid": 0, "mtime_test": 0, @@ -889,37 +889,37 @@ def score_ensemble_preds(self): Y_TEST: None, } - if self.read_scores[y_ens_fn]["mtime_ens"] == mtime: + if self.read_losses[y_ens_fn]["mtime_ens"] == mtime: # same time stamp; nothing changed; continue - # actually read the predictions and score them + # actually read the predictions and compute their respective loss try: y_ensemble = self._read_np_fn(y_ens_fn) - score = calculate_score(solution=self.y_true_ensemble, - prediction=y_ensemble, - task_type=self.task_type, - metric=self.metric, - scoring_functions=None) + loss = calculate_loss(solution=self.y_true_ensemble, + prediction=y_ensemble, + task_type=self.task_type, + metric=self.metric, + scoring_functions=None) - if np.isfinite(self.read_scores[y_ens_fn]["ens_score"]): + if np.isfinite(self.read_losses[y_ens_fn]["ens_loss"]): self.logger.debug( - 'Changing ensemble score for file %s from %f to %f ' + 'Changing ensemble loss for file %s from %f to %f ' 'because file modification time changed? %f - %f', y_ens_fn, - self.read_scores[y_ens_fn]["ens_score"], - score, - self.read_scores[y_ens_fn]["mtime_ens"], + self.read_losses[y_ens_fn]["ens_loss"], + loss, + self.read_losses[y_ens_fn]["mtime_ens"], os.path.getmtime(y_ens_fn), ) - self.read_scores[y_ens_fn]["ens_score"] = score + self.read_losses[y_ens_fn]["ens_loss"] = loss # It is not needed to create the object here - # To save memory, we just score the object. 
- self.read_scores[y_ens_fn]["mtime_ens"] = os.path.getmtime(y_ens_fn) - self.read_scores[y_ens_fn]["loaded"] = 2 - self.read_scores[y_ens_fn]["disc_space_cost_mb"] = self.get_disk_consumption( + # To save memory, we just compute the loss. + self.read_losses[y_ens_fn]["mtime_ens"] = os.path.getmtime(y_ens_fn) + self.read_losses[y_ens_fn]["loaded"] = 2 + self.read_losses[y_ens_fn]["disc_space_cost_mb"] = self.get_disk_consumption( y_ens_fn ) @@ -931,20 +931,20 @@ def score_ensemble_preds(self): y_ens_fn, traceback.format_exc(), ) - self.read_scores[y_ens_fn]["ens_score"] = -np.inf + self.read_losses[y_ens_fn]["ens_loss"] = np.inf self.logger.debug( 'Done reading %d new prediction files. Loaded %d predictions in ' 'total.', n_read_files, - np.sum([pred["loaded"] > 0 for pred in self.read_scores.values()]) + np.sum([pred["loaded"] > 0 for pred in self.read_losses.values()]) ) return True def get_n_best_preds(self): """ - get best n predictions (i.e., keys of self.read_scores) - according to score on "ensemble set" + get best n predictions (i.e., keys of self.read_losses) + according to the loss on the "ensemble set" n: self.ensemble_nbest Side effects: @@ -960,16 +960,22 @@ def get_n_best_preds(self): num_keys = len(sorted_keys) # remove all that are at most as good as random # note: dummy model must have run_id=1 (there is no run_id=0) - dummy_scores = list(filter(lambda x: x[2] == 1, sorted_keys)) + dummy_losses = list(filter(lambda x: x[2] == 1, sorted_keys)) # number of dummy models - num_dummy = len(dummy_scores) - dummy_score = dummy_scores[0] - self.logger.debug("Use %f as dummy score" % dummy_score[1]) - sorted_keys = filter(lambda x: x[1] > dummy_score[1], sorted_keys) + num_dummy = len(dummy_losses) + dummy_loss = dummy_losses[0] + self.logger.debug("Use %f as dummy loss" % dummy_loss[1]) + + # sorted_keys looks like: (k, v["ens_loss"], v["num_run"]) + # On position 1 we have the loss of a minimization problem. + # keep only the predictions with a loss smaller than the dummy + # prediction + sorted_keys = filter(lambda x: x[1] < dummy_loss[1], sorted_keys) + # remove Dummy Classifier sorted_keys = list(filter(lambda x: x[2] > 1, sorted_keys)) if not sorted_keys: - # no model left; try to use dummy score (num_run==0) + # no model left; try to use dummy loss (num_run==0) # log warning when there are other models but not better than dummy model if num_keys > num_dummy: self.logger.warning("No models better than random - using Dummy Score!" @@ -978,10 +984,10 @@ def get_n_best_preds(self): num_keys - 1, num_dummy) sorted_keys = [ - (k, v["ens_score"], v["num_run"]) for k, v in self.read_scores.items() + (k, v["ens_loss"], v["num_run"]) for k, v in self.read_losses.items() if v["seed"] == self.seed and v["num_run"] == 1 ] - # reload predictions if scores changed over time and a model is + # reload predictions if losses changed over time and a model is # considered to be in the top models again! if not isinstance(self.ensemble_nbest, numbers.Integral): # Transform to number of models to keep. 
Keep at least one @@ -1004,9 +1010,9 @@ def get_n_best_preds(self): if not isinstance(self.max_models_on_disc, numbers.Integral): consumption = [ [ - v["ens_score"], + v["ens_loss"], v["disc_space_cost_mb"], - ] for v in self.read_scores.values() if v["disc_space_cost_mb"] is not None + ] for v in self.read_losses.values() if v["disc_space_cost_mb"] is not None ] max_consumption = max(c[1] for c in consumption) @@ -1015,10 +1021,10 @@ def get_n_best_preds(self): # max_consumption megabytes if (sum(c[1] for c in consumption) + max_consumption) > self.max_models_on_disc: - # just leave the best -- higher is better! + # just leave the best -- smaller is better! # This list is in descending order, to preserve the best models sorted_cum_consumption = np.cumsum([ - c[1] for c in list(reversed(sorted(consumption))) + c[1] for c in list(sorted(consumption)) ]) + max_consumption max_models = np.argmax(sorted_cum_consumption > self.max_models_on_disc) @@ -1048,17 +1054,17 @@ def get_n_best_preds(self): # consider performance_range_threshold if self.performance_range_threshold > 0: - best_score = sorted_keys[0][1] - min_score = dummy_score[1] - min_score += (best_score - min_score) * self.performance_range_threshold - if sorted_keys[keep_nbest - 1][1] < min_score: + best_loss = sorted_keys[0][1] + worst_loss = dummy_loss[1] + worst_loss -= (worst_loss - best_loss) * self.performance_range_threshold + if sorted_keys[keep_nbest - 1][1] > worst_loss: # We can further reduce number of models # since worst model is worse than thresh for i in range(0, keep_nbest): # Look at most at keep_nbest models, # but always keep at least one model - current_score = sorted_keys[i][1] - if current_score <= min_score: + current_loss = sorted_keys[i][1] + if current_loss >= worst_loss: self.logger.debug("Dynamic Performance range: " "Further reduce from %d to %d models", keep_nbest, max(1, i)) @@ -1075,15 +1081,15 @@ def get_n_best_preds(self): self.read_preds[k][Y_ENSEMBLE] = None self.read_preds[k][Y_VALID] = None self.read_preds[k][Y_TEST] = None - if self.read_scores[k]['loaded'] == 1: + if self.read_losses[k]['loaded'] == 1: self.logger.debug( - 'Dropping model %s (%d,%d) with score %f.', + 'Dropping model %s (%d,%d) with loss %f.', k, - self.read_scores[k]['seed'], - self.read_scores[k]['num_run'], - self.read_scores[k]['ens_score'], + self.read_losses[k]['seed'], + self.read_losses[k]['num_run'], + self.read_losses[k]['ens_loss'], ) - self.read_scores[k]['loaded'] = 2 + self.read_losses[k]['loaded'] = 2 # Load the predictions for the winning for k in sorted_keys[:ensemble_n_best]: @@ -1092,14 +1098,15 @@ def get_n_best_preds(self): k not in self.read_preds or self.read_preds[k][Y_ENSEMBLE] is None ) - and self.read_scores[k]['loaded'] != 3 + and self.read_losses[k]['loaded'] != 3 ): self.read_preds[k][Y_ENSEMBLE] = self._read_np_fn(k) # No need to load valid and test here because they are loaded # only if the model ends up in the ensemble - self.read_scores[k]['loaded'] = 1 + self.read_losses[k]['loaded'] = 1 - # return best scored keys of self.read_scores + # return best scored keys of self.read_losses + # That is, the one with the lowest loss return sorted_keys[:ensemble_n_best] def get_valid_test_preds(self, selected_keys: List[str]) -> Tuple[List[str], List[str]]: @@ -1126,14 +1133,14 @@ def get_valid_test_preds(self, selected_keys: List[str]) -> Tuple[List[str], Lis os.path.join( glob.escape(self.backend.get_runs_directory()), '%d_%d_%s' % ( - self.read_scores[k]["seed"], - self.read_scores[k]["num_run"], - 
self.read_scores[k]["budget"], + self.read_losses[k]["seed"], + self.read_losses[k]["num_run"], + self.read_losses[k]["budget"], ), 'predictions_valid_%d_%d_%s.npy*' % ( - self.read_scores[k]["seed"], - self.read_scores[k]["num_run"], - self.read_scores[k]["budget"], + self.read_losses[k]["seed"], + self.read_losses[k]["num_run"], + self.read_losses[k]["budget"], ) ) ) @@ -1142,14 +1149,14 @@ def get_valid_test_preds(self, selected_keys: List[str]) -> Tuple[List[str], Lis os.path.join( glob.escape(self.backend.get_runs_directory()), '%d_%d_%s' % ( - self.read_scores[k]["seed"], - self.read_scores[k]["num_run"], - self.read_scores[k]["budget"], + self.read_losses[k]["seed"], + self.read_losses[k]["num_run"], + self.read_losses[k]["budget"], ), 'predictions_test_%d_%d_%s.npy*' % ( - self.read_scores[k]["seed"], - self.read_scores[k]["num_run"], - self.read_scores[k]["budget"] + self.read_losses[k]["seed"], + self.read_losses[k]["num_run"], + self.read_losses[k]["budget"] ) ) ) @@ -1163,7 +1170,7 @@ def get_valid_test_preds(self, selected_keys: List[str]) -> Tuple[List[str], Lis else: valid_fn = valid_fn[0] if ( - self.read_scores[k]["mtime_valid"] == os.path.getmtime(valid_fn) + self.read_losses[k]["mtime_valid"] == os.path.getmtime(valid_fn) and k in self.read_preds and self.read_preds[k][Y_VALID] is not None ): @@ -1173,7 +1180,7 @@ def get_valid_test_preds(self, selected_keys: List[str]) -> Tuple[List[str], Lis y_valid = self._read_np_fn(valid_fn) self.read_preds[k][Y_VALID] = y_valid success_keys_valid.append(k) - self.read_scores[k]["mtime_valid"] = os.path.getmtime(valid_fn) + self.read_losses[k]["mtime_valid"] = os.path.getmtime(valid_fn) except Exception: self.logger.warning('Error loading %s: %s', valid_fn, traceback.format_exc()) @@ -1186,7 +1193,7 @@ def get_valid_test_preds(self, selected_keys: List[str]) -> Tuple[List[str], Lis else: test_fn = test_fn[0] if ( - self.read_scores[k]["mtime_test"] == os.path.getmtime(test_fn) + self.read_losses[k]["mtime_test"] == os.path.getmtime(test_fn) and k in self.read_preds and self.read_preds[k][Y_TEST] is not None ): @@ -1196,7 +1203,7 @@ def get_valid_test_preds(self, selected_keys: List[str]) -> Tuple[List[str], Lis y_test = self._read_np_fn(test_fn) self.read_preds[k][Y_TEST] = y_test success_keys_test.append(k) - self.read_scores[k]["mtime_test"] = os.path.getmtime(test_fn) + self.read_losses[k]["mtime_test"] = os.path.getmtime(test_fn) except Exception: self.logger.warning('Error loading %s: %s', test_fn, traceback.format_exc()) @@ -1210,7 +1217,7 @@ def fit_ensemble(self, selected_keys: list): Parameters --------- selected_keys: list - list of selected keys of self.read_scores + list of selected keys of self.read_losses Returns ------- @@ -1224,9 +1231,9 @@ def fit_ensemble(self, selected_keys: list): predictions_train = [self.read_preds[k][Y_ENSEMBLE] for k in selected_keys] include_num_runs = [ ( - self.read_scores[k]["seed"], - self.read_scores[k]["num_run"], - self.read_scores[k]["budget"], + self.read_losses[k]["seed"], + self.read_losses[k]["num_run"], + self.read_losses[k]["budget"], ) for k in selected_keys] @@ -1298,7 +1305,7 @@ def predict(self, set_: str, ensemble: EnsembleSelection trained Ensemble selected_keys: list - list of selected keys of self.read_scores + list of selected keys of self.read_losses n_preds: int number of prediction models used for ensemble building same number of predictions on valid and test are necessary @@ -1407,7 +1414,7 @@ def _add_ensemble_trajectory(self, train_pred, valid_pred, test_pred): 
def _get_list_of_sorted_preds(self): """ Returns a list of sorted predictions in descending order - Scores are taken from self.read_scores. + Losses are taken from self.read_losses. Parameters ---------- @@ -1417,24 +1424,23 @@ def _get_list_of_sorted_preds(self): ------ sorted_keys: list """ - # Sort by score - higher is better! - # First sort by num_run - sorted_keys = list(reversed(sorted( + # Sort by loss - smaller is better! + sorted_keys = list(sorted( [ - (k, v["ens_score"], v["num_run"]) - for k, v in self.read_scores.items() + (k, v["ens_loss"], v["num_run"]) + for k, v in self.read_losses.items() ], - key=lambda x: x[2], - ))) - # Then by score - sorted_keys = list(reversed(sorted(sorted_keys, key=lambda x: x[1]))) + # Sort by loss as priority 1 and then by num_run on a ascending order + # We want small num_run first + key=lambda x: (x[1], x[2]), + )) return sorted_keys def _delete_excess_models(self, selected_keys: List[str]): """ Deletes models excess models on disc. self.max_models_on_disc defines the upper limit on how many models to keep. - Any additional model with a worst score than the top + Any additional model with a worst loss than the top self.max_models_on_disc is deleted. """ @@ -1463,9 +1469,9 @@ def _delete_excess_models(self, selected_keys: List[str]): os.rename(numrun_dir, numrun_dir + '.old') shutil.rmtree(numrun_dir + '.old') self.logger.info("Deleted files of non-candidate model %s", pred_path) - self.read_scores[pred_path]["disc_space_cost_mb"] = None - self.read_scores[pred_path]["loaded"] = 3 - self.read_scores[pred_path]["ens_score"] = -np.inf + self.read_losses[pred_path]["disc_space_cost_mb"] = None + self.read_losses[pred_path]["loaded"] = 3 + self.read_losses[pred_path]["ens_loss"] = np.inf except Exception as e: self.logger.error( "Failed to delete files of non-candidate model %s due" diff --git a/autosklearn/ensembles/ensemble_selection.py b/autosklearn/ensembles/ensemble_selection.py index 8a3afb157f..363bf000ac 100644 --- a/autosklearn/ensembles/ensemble_selection.py +++ b/autosklearn/ensembles/ensemble_selection.py @@ -6,7 +6,7 @@ from autosklearn.constants import TASK_TYPES from autosklearn.ensembles.abstract_ensemble import AbstractEnsemble -from autosklearn.metrics import Scorer, calculate_score +from autosklearn.metrics import Scorer, calculate_loss from autosklearn.pipeline.base import BasePipeline @@ -100,7 +100,7 @@ def _fast( dtype=np.float64, ) for i in range(ensemble_size): - scores = np.zeros( + losses = np.zeros( (len(predictions)), dtype=np.float64, ) @@ -129,11 +129,11 @@ def _fast( out=fant_ensemble_prediction ) - # Calculate score is versatile and can return a dict of score + # calculate_loss is versatile and can return a dict of losses # when scoring_functions=None, we know it will be a float - calculated_score = cast( + losses[j] = cast( float, - calculate_score( + calculate_loss( solution=labels, prediction=fant_ensemble_prediction, task_type=self.task_type, @@ -141,12 +141,11 @@ def _fast( scoring_functions=None ) ) - scores[j] = self.metric._optimum - calculated_score - all_best = np.argwhere(scores == np.nanmin(scores)).flatten() + all_best = np.argwhere(losses == np.nanmin(losses)).flatten() best = self.random_state.choice(all_best) ensemble.append(predictions[best]) - trajectory.append(scores[best]) + trajectory.append(losses[best]) order.append(best) # Handle special case @@ -155,7 +154,7 @@ def _fast( self.indices_ = order self.trajectory_ = trajectory - self.train_score_ = trajectory[-1] + self.train_loss_ = 
trajectory[-1] def _slow( self, @@ -172,18 +171,18 @@ def _slow( ensemble_size = self.ensemble_size for i in range(ensemble_size): - scores = np.zeros( + losses = np.zeros( [np.shape(predictions)[0]], dtype=np.float64, ) for j, pred in enumerate(predictions): ensemble.append(pred) ensemble_prediction = np.mean(np.array(ensemble), axis=0) - # Calculate score is versatile and can return a dict of score + # calculate_loss is versatile and can return a dict of losses # when scoring_functions=None, we know it will be a float - calculated_score = cast( + losses[j] = cast( float, - calculate_score( + calculate_loss( solution=labels, prediction=ensemble_prediction, task_type=self.task_type, @@ -191,11 +190,10 @@ def _slow( scoring_functions=None ) ) - scores[j] = self.metric._optimum - calculated_score ensemble.pop() - best = np.nanargmin(scores) + best = np.nanargmin(losses) ensemble.append(predictions[best]) - trajectory.append(scores[best]) + trajectory.append(losses[best]) order.append(best) # Handle special case @@ -210,7 +208,7 @@ def _slow( trajectory, dtype=np.float64, ) - self.train_score_ = trajectory[-1] + self.train_loss_ = trajectory[-1] def _calculate_weights(self) -> None: ensemble_members = Counter(self.indices_).most_common() diff --git a/autosklearn/evaluation/abstract_evaluator.py b/autosklearn/evaluation/abstract_evaluator.py index 18054ce36d..fb3cb49eb0 100644 --- a/autosklearn/evaluation/abstract_evaluator.py +++ b/autosklearn/evaluation/abstract_evaluator.py @@ -19,7 +19,7 @@ from autosklearn.pipeline.implementations.util import ( convert_multioutput_multiclass_to_multilabel ) -from autosklearn.metrics import calculate_score +from autosklearn.metrics import calculate_loss, Scorer from autosklearn.util.logging_ import get_named_client_logger from ConfigSpace import Configuration @@ -236,17 +236,18 @@ def _get_model(self): init_params=self._init_params) return model - def _loss(self, y_true, y_hat, scoring_functions=None): - """Auto-sklearn follows a minimization goal, so the make_scorer - sign is used as a guide to obtain the value to reduce. - - On this regard, to optimize a metric: - 1- score is calculared with calculate_score, with the caveat, that if - for the metric greater is not better, a negative score is returned. - 2- the err (the optimization goal) is then: - optimum - (metric.sign * actual_score) - For accuracy for example: optimum(1) - (+1 * actual score) - For logloss for example: optimum(0) - (-1 * actual score) + def _loss(self, y_true: np.ndarray, y_hat: np.ndarray, + scoring_functions: typing.Optional[typing.List[Scorer]] = None + ) -> typing.Union[float, typing.Dict[str, float]]: + """Auto-sklearn follows a minimization goal. + The calculate_loss internally translate a score function to + a minimization problem. + + For a dummy prediction, the worst result is assumed. 
+ + Parameters + ---------- + y_true """ scoring_functions = ( self.scoring_functions @@ -255,23 +256,14 @@ def _loss(self, y_true, y_hat, scoring_functions=None): ) if not isinstance(self.configuration, Configuration): if scoring_functions: - return {self.metric.name: 1.0} + return {self.metric.name: self.metric._worst_possible_result} else: - return 1.0 + return self.metric._worst_possible_result - score = calculate_score( + return calculate_loss( y_true, y_hat, self.task_type, self.metric, scoring_functions=scoring_functions) - if hasattr(score, '__len__'): - err = {metric.name: metric._optimum - score[metric.name] - for metric in scoring_functions} - err[self.metric.name] = self.metric._optimum - score[self.metric.name] - else: - err = self.metric._optimum - score - - return err - def finish_up(self, loss, train_loss, opt_pred, valid_pred, test_pred, additional_run_info, file_output, final_call, status): """This function does everything necessary after the fitting is done: diff --git a/autosklearn/evaluation/test_evaluator.py b/autosklearn/evaluation/test_evaluator.py index 23c101bfa4..b4c685b4f8 100644 --- a/autosklearn/evaluation/test_evaluator.py +++ b/autosklearn/evaluation/test_evaluator.py @@ -5,8 +5,7 @@ AbstractEvaluator, _fit_and_suppress_warnings, ) -from autosklearn.metrics import calculate_score, CLASSIFICATION_METRICS, REGRESSION_METRICS -from autosklearn.constants import CLASSIFICATION_TASKS +from autosklearn.metrics import calculate_loss __all__ = [ @@ -71,7 +70,7 @@ def predict_and_loss(self, train=False): if train: Y_pred = self.predict_function(self.X_train, self.model, self.task_type, self.Y_train) - score = calculate_score( + err = calculate_loss( solution=self.Y_train, prediction=Y_pred, task_type=self.task_type, @@ -80,23 +79,13 @@ def predict_and_loss(self, train=False): else: Y_pred = self.predict_function(self.X_test, self.model, self.task_type, self.Y_train) - score = calculate_score( + err = calculate_loss( solution=self.Y_test, prediction=Y_pred, task_type=self.task_type, metric=self.metric, scoring_functions=self.scoring_functions) - if hasattr(score, '__len__'): - if self.task_type in CLASSIFICATION_TASKS: - err = {key: metric._optimum - score[key] for key, metric in - CLASSIFICATION_METRICS.items() if key in score} - else: - err = {key: metric._optimum - score[key] for key, metric in - REGRESSION_METRICS.items() if key in score} - else: - err = self.metric._optimum - score - return err, Y_pred, None, None diff --git a/autosklearn/metrics/__init__.py b/autosklearn/metrics/__init__.py index e85dffc2f2..89be6ab8c5 100644 --- a/autosklearn/metrics/__init__.py +++ b/autosklearn/metrics/__init__.py @@ -1,6 +1,6 @@ from abc import ABCMeta, abstractmethod from functools import partial -from typing import Any, Callable, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Union, cast import numpy as np @@ -353,7 +353,7 @@ def calculate_score( for metric_ in scoring_functions: try: - score_dict[metric_.name] = metric_(solution, cprediction) + score_dict[metric_.name] = metric_._sign * metric_(solution, cprediction) except ValueError as e: print(e, e.args[0]) if e.args[0] == "Mean Squared Logarithmic Error cannot be used when " \ @@ -369,7 +369,7 @@ def calculate_score( # handle? 
try: - score_dict[metric_.name] = metric_(solution, prediction) + score_dict[metric_.name] = metric_._sign * metric_(solution, prediction) except ValueError as e: if e.args[0] == 'multiclass format is not supported': continue @@ -397,10 +397,73 @@ def get_metric_score( solution: np.ndarray, task_type: int ) -> float: + # We match the behaviour of GridSearchCV + # In scikit learn, the exact value of the score_func + # is returned (not that of the 'Scorer' which might be + # negative in functions like mse, as scikit learn + # maximizes.) If an user wants to use GridSearchCV + # They are expected to pass neg_mean_squared_error + # For this reason we multiply back by metric_._sign if task_type in REGRESSION_TASKS: # TODO put this into the regression metric itself cprediction = sanitize_array(prediction) - score = metric_(solution, cprediction) + score = metric_._sign * metric_(solution, cprediction) else: - score = metric_(solution, prediction) + score = metric_._sign * metric_(solution, prediction) return score + + +def calculate_loss( + solution: np.ndarray, + prediction: np.ndarray, + task_type: int, + metric: Scorer, + scoring_functions: Optional[List[Scorer]] = None +) -> Union[float, Dict[str, float]]: + """ + Returns a loss (a magnitude that allows casting the + optimization problem, as a minimization one) for the + given Auto-Sklearn Scorer object + Parameters + ---------- + solution: np.ndarray + The ground truth of the targets + prediction: np.ndarray + The best estimate from the model, of the given targets + task_type: int + To understand if the problem task is classification + or regression + metric: Scorer + Object that host a function to calculate how good the + prediction is according to the solution. + scoring_functions: List[Scorer] + A list of metrics to calculate multiple losses + Returns + ------- + float or Dict[str, float] + A loss function for each of the provided scorer objects + """ + score = calculate_score( + solution=solution, + prediction=prediction, + task_type=task_type, + metric=metric, + scoring_functions=scoring_functions, + ) + + if scoring_functions: + score = cast(Dict, score) + # we expect a dict() object for which we should calculate the loss + loss_dict = dict() + for metric_ in scoring_functions + [metric]: + # TODO: When metrics are annotated with type_of_target support + # we can remove this check + if metric_.name not in score: + continue + # maybe metric argument is not in scoring_functions + # so append it to the list. 
Rather than check if such + # is the case, redefining loss_dict[metric] is less expensive + loss_dict[metric_.name] = metric_._optimum - metric_._sign * score[metric_.name] + return loss_dict + else: + return metric._optimum - metric._sign * cast(float, score) diff --git a/test/test_automl/test_automl.py b/test/test_automl/test_automl.py index 023954a42d..c358ac9553 100644 --- a/test/test_automl/test_automl.py +++ b/test/test_automl/test_automl.py @@ -308,7 +308,7 @@ def test_automl_outputs(backend, dask_client): 'start_time_100', 'datamanager.pkl', 'ensemble_read_preds.pkl', - 'ensemble_read_scores.pkl', + 'ensemble_read_losses.pkl', 'runs', 'ensembles', 'ensemble_history.json', @@ -625,7 +625,8 @@ def test_load_best_individual_model(metric, backend, dask_client): if metric.name == 'balanced_accuracy': assert automl.score(X_test, Y_test) > 0.9 elif metric.name == 'log_loss': - assert automl.score(X_test, Y_test) <= 0.2 + # Seen values in github actions of 0.6978304740364537 + assert automl.score(X_test, Y_test) <= 0.72 else: raise ValueError(metric.name) diff --git a/test/test_ensemble_builder/test_ensemble.py b/test/test_ensemble_builder/test_ensemble.py index e23de18a3c..335c07eca2 100644 --- a/test/test_ensemble_builder/test_ensemble.py +++ b/test/test_ensemble_builder/test_ensemble.py @@ -111,22 +111,22 @@ def testRead(ensemble_backend): seed=0, # important to find the test files ) - success = ensbuilder.score_ensemble_preds() + success = ensbuilder.compute_loss_per_model() assert success, str(ensbuilder.read_preds) assert len(ensbuilder.read_preds) == 3, ensbuilder.read_preds.keys() - assert len(ensbuilder.read_scores) == 3, ensbuilder.read_scores.keys() + assert len(ensbuilder.read_losses) == 3, ensbuilder.read_losses.keys() filename = os.path.join( ensemble_backend.temporary_directory, ".auto-sklearn/runs/0_1_0.0/predictions_ensemble_0_1_0.0.npy" ) - assert ensbuilder.read_scores[filename]["ens_score"] == 0.5 + assert ensbuilder.read_losses[filename]["ens_loss"] == 0.5 filename = os.path.join( ensemble_backend.temporary_directory, ".auto-sklearn/runs/0_2_0.0/predictions_ensemble_0_2_0.0.npy" ) - assert ensbuilder.read_scores[filename]["ens_score"] == 1.0 + assert ensbuilder.read_losses[filename]["ens_loss"] == 0.0 @pytest.mark.parametrize( @@ -151,7 +151,7 @@ def testNBest(ensemble_backend, ensemble_nbest, max_models_on_disc, exp): max_models_on_disc=max_models_on_disc, ) - ensbuilder.score_ensemble_preds() + ensbuilder.compute_loss_per_model() sel_keys = ensbuilder.get_n_best_preds() assert len(sel_keys) == exp @@ -192,7 +192,7 @@ def testMaxModelsOnDisc(ensemble_backend, test_case, exp): with unittest.mock.patch('os.path.getsize') as mock: mock.return_value = 100*1024*1024 - ensbuilder.score_ensemble_preds() + ensbuilder.compute_loss_per_model() sel_keys = ensbuilder.get_n_best_preds() assert len(sel_keys) == exp, test_case @@ -211,8 +211,8 @@ def testMaxModelsOnDisc2(ensemble_backend): ) ensbuilder.read_preds = {} for i in range(50): - ensbuilder.read_scores['pred'+str(i)] = { - 'ens_score': i*10, + ensbuilder.read_losses['pred'+str(i)] = { + 'ens_loss': -i*10, 'num_run': i, 'loaded': 1, "seed": 1, @@ -242,16 +242,16 @@ def testPerformanceRangeThreshold(ensemble_backend, performance_range_threshold, ensemble_nbest=100, performance_range_threshold=performance_range_threshold ) - ensbuilder.read_scores = { - 'A': {'ens_score': 1, 'num_run': 1, 'loaded': -1, "seed": 1}, - 'B': {'ens_score': 2, 'num_run': 2, 'loaded': -1, "seed": 1}, - 'C': {'ens_score': 3, 'num_run': 3, 'loaded': 
-1, "seed": 1}, - 'D': {'ens_score': 4, 'num_run': 4, 'loaded': -1, "seed": 1}, - 'E': {'ens_score': 5, 'num_run': 5, 'loaded': -1, "seed": 1}, + ensbuilder.read_losses = { + 'A': {'ens_loss': -1, 'num_run': 1, 'loaded': -1, "seed": 1}, + 'B': {'ens_loss': -2, 'num_run': 2, 'loaded': -1, "seed": 1}, + 'C': {'ens_loss': -3, 'num_run': 3, 'loaded': -1, "seed": 1}, + 'D': {'ens_loss': -4, 'num_run': 4, 'loaded': -1, "seed": 1}, + 'E': {'ens_loss': -5, 'num_run': 5, 'loaded': -1, "seed": 1}, } ensbuilder.read_preds = { key: {key_2: True for key_2 in (Y_ENSEMBLE, Y_VALID, Y_TEST)} - for key in ensbuilder.read_scores + for key in ensbuilder.read_losses } sel_keys = ensbuilder.get_n_best_preds() @@ -277,16 +277,16 @@ def testPerformanceRangeThresholdMaxBest(ensemble_backend, performance_range_thr performance_range_threshold=performance_range_threshold, max_models_on_disc=None, ) - ensbuilder.read_scores = { - 'A': {'ens_score': 1, 'num_run': 1, 'loaded': -1, "seed": 1}, - 'B': {'ens_score': 2, 'num_run': 2, 'loaded': -1, "seed": 1}, - 'C': {'ens_score': 3, 'num_run': 3, 'loaded': -1, "seed": 1}, - 'D': {'ens_score': 4, 'num_run': 4, 'loaded': -1, "seed": 1}, - 'E': {'ens_score': 5, 'num_run': 5, 'loaded': -1, "seed": 1}, + ensbuilder.read_losses = { + 'A': {'ens_loss': -1, 'num_run': 1, 'loaded': -1, "seed": 1}, + 'B': {'ens_loss': -2, 'num_run': 2, 'loaded': -1, "seed": 1}, + 'C': {'ens_loss': -3, 'num_run': 3, 'loaded': -1, "seed": 1}, + 'D': {'ens_loss': -4, 'num_run': 4, 'loaded': -1, "seed": 1}, + 'E': {'ens_loss': -5, 'num_run': 5, 'loaded': -1, "seed": 1}, } ensbuilder.read_preds = { key: {key_2: True for key_2 in (Y_ENSEMBLE, Y_VALID, Y_TEST)} - for key in ensbuilder.read_scores + for key in ensbuilder.read_losses } sel_keys = ensbuilder.get_n_best_preds() @@ -303,29 +303,29 @@ def testFallBackNBest(ensemble_backend): ensemble_nbest=1 ) - ensbuilder.score_ensemble_preds() + ensbuilder.compute_loss_per_model() print() print(ensbuilder.read_preds.keys()) - print(ensbuilder.read_scores.keys()) + print(ensbuilder.read_losses.keys()) print(ensemble_backend.temporary_directory) filename = os.path.join( ensemble_backend.temporary_directory, ".auto-sklearn/runs/0_2_0.0/predictions_ensemble_0_2_0.0.npy" ) - ensbuilder.read_scores[filename]["ens_score"] = -1 + ensbuilder.read_losses[filename]["ens_loss"] = -1 filename = os.path.join( ensemble_backend.temporary_directory, ".auto-sklearn/runs/0_3_100.0/predictions_ensemble_0_3_100.0.npy" ) - ensbuilder.read_scores[filename]["ens_score"] = -1 + ensbuilder.read_losses[filename]["ens_loss"] = -1 filename = os.path.join( ensemble_backend.temporary_directory, ".auto-sklearn/runs/0_1_0.0/predictions_ensemble_0_1_0.0.npy" ) - ensbuilder.read_scores[filename]["ens_score"] = -1 + ensbuilder.read_losses[filename]["ens_loss"] = -1 sel_keys = ensbuilder.get_n_best_preds() @@ -347,8 +347,10 @@ def testGetValidTestPreds(ensemble_backend): ensemble_nbest=1 ) - ensbuilder.score_ensemble_preds() + ensbuilder.compute_loss_per_model() + # d1 is a dummt prediction. d2 and d3 have the same prediction with + # different name. 
num_run=2 is selected when doing sorted() d1 = os.path.join( ensemble_backend.temporary_directory, ".auto-sklearn/runs/0_1_0.0/predictions_ensemble_0_1_0.0.npy" @@ -397,7 +399,7 @@ def testEntireEnsembleBuilder(ensemble_backend): ) ensbuilder.SAVE2DISC = False - ensbuilder.score_ensemble_preds() + ensbuilder.compute_loss_per_model() d2 = os.path.join( ensemble_backend.temporary_directory, @@ -485,7 +487,7 @@ def test_main(ensemble_backend): os.path.join(ensemble_backend.internals_directory, 'ensemble_read_preds.pkl') ), os.listdir(ensemble_backend.internals_directory) assert os.path.exists( - os.path.join(ensemble_backend.internals_directory, 'ensemble_read_scores.pkl') + os.path.join(ensemble_backend.internals_directory, 'ensemble_read_losses.pkl') ), os.listdir(ensemble_backend.internals_directory) @@ -522,9 +524,9 @@ def testLimit(ensemble_backend): ) ensbuilder.SAVE2DISC = False - read_scores_file = os.path.join( + read_losses_file = os.path.join( ensemble_backend.internals_directory, - 'ensemble_read_scores.pkl' + 'ensemble_read_losses.pkl' ) read_preds_file = os.path.join( ensemble_backend.internals_directory, @@ -554,15 +556,15 @@ def mtime_mock(filename): mtime.side_effect = mtime_mock ensbuilder.run(time_left=1000, iteration=0, pynisher_context='fork') - assert os.path.exists(read_scores_file) + assert os.path.exists(read_losses_file) assert not os.path.exists(read_preds_file) assert logger_mock.warning.call_count == 1 ensbuilder.run(time_left=1000, iteration=0, pynisher_context='fork') - assert os.path.exists(read_scores_file) + assert os.path.exists(read_losses_file) assert not os.path.exists(read_preds_file) assert logger_mock.warning.call_count == 2 ensbuilder.run(time_left=1000, iteration=0, pynisher_context='fork') - assert os.path.exists(read_scores_file) + assert os.path.exists(read_losses_file) assert not os.path.exists(read_preds_file) assert logger_mock.warning.call_count == 3 @@ -570,7 +572,7 @@ def mtime_mock(filename): assert ensbuilder.ensemble_nbest == 1 ensbuilder.run(time_left=1000, iteration=0, pynisher_context='fork') - assert os.path.exists(read_scores_file) + assert os.path.exists(read_losses_file) assert not os.path.exists(read_preds_file) assert logger_mock.warning.call_count == 4 @@ -580,7 +582,7 @@ def mtime_mock(filename): # And then it still runs, but basically won't do anything any more except for raising error # messages via the logger ensbuilder.run(time_left=1000, iteration=0, pynisher_context='fork') - assert os.path.exists(read_scores_file) + assert os.path.exists(read_losses_file) assert not os.path.exists(read_preds_file) assert logger_mock.warning.call_count == 4 @@ -629,15 +631,15 @@ def test_read_pickle_read_preds(ensemble_backend): ensemble_memory_file = os.path.join( ensemble_backend.internals_directory, - 'ensemble_read_scores.pkl' + 'ensemble_read_losses.pkl' ) assert os.path.exists(ensemble_memory_file) # Make sure we pickle the correct read scores with (open(ensemble_memory_file, "rb")) as memory: - read_scores = pickle.load(memory) + read_losses = pickle.load(memory) - compare_read_preds(read_scores, ensbuilder.read_scores) + compare_read_preds(read_losses, ensbuilder.read_losses) # Then create a new instance, which should automatically read this file ensbuilder2 = EnsembleBuilder( @@ -650,7 +652,7 @@ def test_read_pickle_read_preds(ensemble_backend): max_models_on_disc=None, ) compare_read_preds(ensbuilder2.read_preds, ensbuilder.read_preds) - compare_read_preds(ensbuilder2.read_scores, ensbuilder.read_scores) + 
compare_read_preds(ensbuilder2.read_losses, ensbuilder.read_losses) assert ensbuilder2.last_hash == ensbuilder.last_hash diff --git a/test/test_metric/test_metrics.py b/test/test_metric/test_metrics.py index 449466ed97..0f4a57e07c 100644 --- a/test/test_metric/test_metrics.py +++ b/test/test_metric/test_metrics.py @@ -1,11 +1,13 @@ import unittest +import pytest + import numpy as np import sklearn.metrics import autosklearn.metrics -from autosklearn.metrics import calculate_score +from autosklearn.metrics import calculate_score, calculate_loss from autosklearn.constants import BINARY_CLASSIFICATION, REGRESSION from smac.utils.constants import MAXINT @@ -39,7 +41,8 @@ def test_predict_scorer_binary(self): self.assertAlmostEqual(score, 0.5) scorer = autosklearn.metrics._PredictScorer( - 'accuracy', sklearn.metrics.accuracy_score, 1, 0, -1, {}) + name='accuracy', score_func=sklearn.metrics.accuracy_score, + optimum=1, worst_possible_result=0, sign=-1, kwargs={}) y_pred = np.array([[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]]) score = scorer(y_true, y_pred) @@ -529,3 +532,66 @@ def test_regression_only_metric(self): score = calculate_score(y_true, y_pred, REGRESSION, scorer) previous_score = scorer._optimum self.assertAlmostEqual(score, previous_score) + + +def test_calculate_loss(): + # In a 0-1 ranged scorer, make sure that the loss + # has a expected positive value + y_pred = np.array([0, 1, 0, 1, 1, 1, 0, 0, 0, 0]) + y_true = np.array([0, 1, 0, 1, 1, 0, 0, 0, 0, 0]) + score = sklearn.metrics.accuracy_score(y_true, y_pred) + assert pytest.approx(score) == calculate_score( + solution=y_true, + prediction=y_pred, + task_type=BINARY_CLASSIFICATION, + metric=autosklearn.metrics.accuracy, + ) + loss = 1.0 - score + assert pytest.approx(loss) == calculate_loss( + solution=y_true, + prediction=y_pred, + task_type=BINARY_CLASSIFICATION, + metric=autosklearn.metrics.accuracy, + ) + + # Test the dictionary case + score_dict = calculate_score( + solution=y_true, + prediction=y_pred, + task_type=BINARY_CLASSIFICATION, + metric=autosklearn.metrics.accuracy, + scoring_functions=[autosklearn.metrics.accuracy, autosklearn.metrics.balanced_accuracy] + ) + expected_score_dict = { + 'accuracy': 0.9, + 'balanced_accuracy': 0.9285714285714286, + } + loss_dict = calculate_loss( + solution=y_true, + prediction=y_pred, + task_type=BINARY_CLASSIFICATION, + metric=autosklearn.metrics.accuracy, + scoring_functions=[autosklearn.metrics.accuracy, autosklearn.metrics.balanced_accuracy] + ) + for expected_metric, expected_score in expected_score_dict.items(): + assert pytest.approx(expected_score) == score_dict[expected_metric] + assert pytest.approx(1-expected_score) == loss_dict[expected_metric] + + # Lastly make sure that metrics whose optimum is zero + # are also properly working + y_true = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6]) + y_pred = np.array([0.11, 0.22, 0.33, 0.44, 0.55, 0.66]) + score = sklearn.metrics.mean_squared_error(y_true, y_pred) + assert pytest.approx(score) == calculate_score( + solution=y_true, + prediction=y_pred, + task_type=REGRESSION, + metric=autosklearn.metrics.mean_squared_error, + ) + loss = score + assert pytest.approx(loss) == calculate_loss( + solution=y_true, + prediction=y_pred, + task_type=REGRESSION, + metric=autosklearn.metrics.mean_squared_error, + )
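
Note on the score-to-loss conversion introduced by this patch: calculate_loss wraps calculate_score and maps every metric onto a minimization problem via optimum - sign * score, so the ensemble builder and the evaluators can compare models uniformly (smaller is always better, and failed or dummy runs fall back to np.inf or metric._worst_possible_result). The standalone sketch below mirrors that conversion outside auto-sklearn; SimpleScorer and to_loss are illustrative stand-ins for autosklearn.metrics.Scorer and calculate_loss, not the library's API.

    from dataclasses import dataclass
    from typing import Callable

    import numpy as np
    import sklearn.metrics


    @dataclass
    class SimpleScorer:
        # Illustrative stand-in for autosklearn.metrics.Scorer.
        name: str
        score_func: Callable[[np.ndarray, np.ndarray], float]
        optimum: float  # best reachable value of the raw metric
        sign: int       # +1 if greater is better, -1 if smaller is better


    def to_loss(scorer: SimpleScorer, solution: np.ndarray,
                prediction: np.ndarray) -> float:
        # Map any metric onto a minimization problem:
        # 0 is optimal, larger values are worse.
        score = scorer.score_func(solution, prediction)
        return scorer.optimum - scorer.sign * score


    accuracy = SimpleScorer('accuracy', sklearn.metrics.accuracy_score,
                            optimum=1.0, sign=1)
    mean_squared_error = SimpleScorer('mean_squared_error',
                                      sklearn.metrics.mean_squared_error,
                                      optimum=0.0, sign=-1)

    y_true = np.array([0, 1, 0, 1, 1, 0, 0, 0, 0, 0])
    y_pred = np.array([0, 1, 0, 1, 1, 1, 0, 0, 0, 0])
    print(to_loss(accuracy, y_true, y_pred))  # 1.0 - 0.9 = 0.1

    y_true_reg = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6])
    y_pred_reg = np.array([0.11, 0.22, 0.33, 0.44, 0.55, 0.66])
    print(to_loss(mean_squared_error, y_true_reg, y_pred_reg))  # equals the raw MSE

Running the sketch prints 0.1 for accuracy and the raw mean squared error for the regression case, matching the expectations encoded in test_calculate_loss above; it also explains why the ensemble builder now initialises ens_loss to np.inf and keeps only predictions whose loss is strictly smaller than the dummy model's.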