From d78a8056511788f0d04554e376a4c075eb7135c6 Mon Sep 17 00:00:00 2001
From: Shaswat Anand <33908100+shaswat-indian@users.noreply.github.com>
Date: Wed, 29 Jun 2022 09:02:59 -0700
Subject: [PATCH] Regression errors failing with mixed data type combinations
 (#4770)

Resolves https://github.com/rapidsai/cuml/issues/4442

This PR fixes the issue with using mixed data types in regression errors like `mean_squared_error`, `mean_absolute_error` and `mean_squared_log_error`.

Authors:
  - Shaswat Anand (https://github.com/shaswat-indian)

Approvers:
  - William Hicks (https://github.com/wphicks)

URL: https://github.com/rapidsai/cuml/pull/4770
---
 python/cuml/metrics/regression.pyx | 12 ++++++------
 python/cuml/tests/test_metrics.py  | 17 ++++++++++-------
 2 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/python/cuml/metrics/regression.pyx b/python/cuml/metrics/regression.pyx
index 218c0268a0..7c16c97269 100644
--- a/python/cuml/metrics/regression.pyx
+++ b/python/cuml/metrics/regression.pyx
@@ -104,19 +104,19 @@ def _prepare_input_reg(y_true, y_pred, sample_weight, multioutput):
     Helper function to avoid code duplication for regression metrics.
     Converts inputs to CumlArray and check multioutput parameter validity.
     """
+    allowed_d_types = [np.float32, np.float64, np.int32, np.int64]
     y_true = y_true.squeeze() if len(y_true.shape) > 1 else y_true
     y_true, n_rows, n_cols, ytype = \
-        input_to_cuml_array(y_true, check_dtype=[np.float32, np.float64,
-                                                 np.int32, np.int64])
+        input_to_cuml_array(y_true, check_dtype=allowed_d_types)
 
     y_pred = y_pred.squeeze() if len(y_pred.shape) > 1 else y_pred
     y_pred, _, _, _ = \
-        input_to_cuml_array(y_pred, check_dtype=ytype, check_rows=n_rows,
-                            check_cols=n_cols)
+        input_to_cuml_array(y_pred, check_dtype=allowed_d_types,
+                            check_rows=n_rows, check_cols=n_cols)
 
     if sample_weight is not None:
         sample_weight, _, _, _ = \
-            input_to_cuml_array(sample_weight, check_dtype=ytype,
+            input_to_cuml_array(sample_weight, check_dtype=allowed_d_types,
                                 check_rows=n_rows, check_cols=n_cols)
 
     raw_multioutput = False
@@ -134,7 +134,7 @@ def _prepare_input_reg(y_true, y_pred, sample_weight, multioutput):
             multioutput = None
     elif multioutput is not None:
         multioutput, _, _, _ = \
-            input_to_cuml_array(multioutput, check_dtype=ytype)
+            input_to_cuml_array(multioutput, check_dtype=allowed_d_types)
         if n_cols == 1:
             raise ValueError("Custom weights are useful only in "
                              "multi-output cases.")
diff --git a/python/cuml/tests/test_metrics.py b/python/cuml/tests/test_metrics.py
index 3576cd0784..7f03838342 100644
--- a/python/cuml/tests/test_metrics.py
+++ b/python/cuml/tests/test_metrics.py
@@ -481,15 +481,18 @@ def test_regression_metrics():
 
 
 @pytest.mark.parametrize('n_samples', [50, stress_param(500000)])
-@pytest.mark.parametrize('dtype', [np.int32, np.int64, np.float32, np.float64])
+@pytest.mark.parametrize('y_dtype',
+                         [np.int32, np.int64, np.float32, np.float64])
+@pytest.mark.parametrize('pred_dtype',
+                         [np.int32, np.int64, np.float32, np.float64])
 @pytest.mark.parametrize('function', ['mse', 'mae', 'msle'])
-def test_regression_metrics_random(n_samples, dtype, function):
-    if dtype == np.float32 and n_samples == 500000:
-        # stress test for float32 fails because of floating point precision
-        pytest.xfail()
+def test_regression_metrics_random_with_mixed_dtypes(n_samples, y_dtype,
+                                                     pred_dtype, function):
+    y_true, _, _, _ = generate_random_labels(
+        lambda rng: rng.randint(0, 1000, n_samples).astype(y_dtype))
 
-    y_true, y_pred, _, _ = generate_random_labels(
-        lambda rng: rng.randint(0, 1000, n_samples).astype(dtype))
+    y_pred, _, _, _ = generate_random_labels(
+        lambda rng: rng.randint(0, 1000, n_samples).astype(pred_dtype))
 
     cuml_reg, sklearn_reg = {
         'mse': (mean_squared_error, sklearn_mse),