From cd220fcf1f4fd334d8e5b21e8787fae8be3675a5 Mon Sep 17 00:00:00 2001
From: levsnv <36520083+levsnv@users.noreply.github.com>
Date: Tue, 9 Mar 2021 05:36:17 -0800
Subject: [PATCH] Test FIL probabilities with absolute error thresholds in
 python (#3582)

Probabilities are bounded to the range [0.0, 1.0]. Also, we generally care more about large probabilities, which are `O(1/n_classes)`.
The largest relative probability errors are usually caused by a small ground truth probability (e.g. 1e-3), as opposed to a large absolute error.
Hence, relative probability error is not the best metric. Absolute probability error is more relevant.
Moreover, absolute probability error is more stable, as relative errors have a long tail. When training or even inferring on many rows, the chance of encountering a ground truth probability as small as 1e-3 or 1e-4 grows. In some cases, there is no reasonable and reliable threshold. Lastly, if the number of predicted probabilities (clipped values) per input row grows, so does the long tail of relative probability errors, because the tail is less undersampled. This makes the comparison unfair between binary classification and regression, and between multiclass classification and binary classification.

The changes below are based on collecting absolute errors under `--run_unit`, `--run_quality` and `--run_stress`. These thresholds are violated at most a couple of times per million samples, and in most cases never.

Authors:
  - @levsnv

Approvers:
  - John Zedlewski (@JohnZed)
  - Andy Adinets (@canonizer)

URL: https://github.com/rapidsai/cuml/pull/3582
---
 python/cuml/test/test_fil.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/python/cuml/test/test_fil.py b/python/cuml/test/test_fil.py
index cf0d024933..cd4f03a1de 100644
--- a/python/cuml/test/test_fil.py
+++ b/python/cuml/test/test_fil.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -138,7 +138,7 @@ def test_fil_classification(n_rows, n_columns, num_rounds,
         assert array_equal(fil_preds, xgb_preds_int)
         xgb_proba = np.stack([1-xgb_preds, xgb_preds], axis=1)
         fil_proba = np.asarray(fm.predict_proba(X_validation))
-        assert np.allclose(fil_proba, xgb_proba, 1e-3)
+        assert np.allclose(fil_proba, xgb_proba, atol=3e-7)
 
 
 @pytest.mark.parametrize('n_rows', [unit_param(1000), quality_param(10000),
@@ -262,7 +262,7 @@ def test_fil_skl_classification(n_rows, n_columns, n_estimators, max_depth,
         assert array_equal(fil_preds, skl_preds_int)
         fil_proba = np.asarray(fm.predict_proba(X_validation))
         fil_proba = np.reshape(fil_proba, np.shape(skl_proba))
-        assert np.allclose(fil_proba, skl_proba, 1e-3)
+        assert np.allclose(fil_proba, skl_proba, atol=3e-7)
 
 
 @pytest.mark.parametrize('n_rows', [1000])
@@ -486,8 +486,8 @@ def test_lightgbm(tmp_path, num_classes):
                                   algo='TREE_REORG',
                                   output_class=True,
                                   model_type="lightgbm")
-        fil_proba = fm.predict_proba(X)
-        assert np.allclose(gbm_proba, fil_proba[:, 1], 1e-2)
+        fil_proba = fm.predict_proba(X)[:, 1]
+        assert np.allclose(gbm_proba, fil_proba, atol=3e-7)
         gbm_preds = (gbm_proba > 0.5)
         fil_preds = fm.predict(X)
         assert array_equal(gbm_preds, fil_preds)