From 6bde235197482865a8bde3cabe78c353af6fb618 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Thu, 7 Nov 2024 07:43:45 -0800 Subject: [PATCH 1/7] Add catboost to the third-party integration tests --- .../dependencies.yaml | 17 +++ .../tests/test_catboost.py | 129 ++++++++++++++++++ 2 files changed, 146 insertions(+) create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_catboost.py diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml b/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml index 84b731e6c51..72a4e9ed175 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml @@ -76,6 +76,13 @@ files: - py_version - test_base - test_xgboost + test_catboost: + output: none + includes: + - cuda_version + - py_version + - test_base + - test_catboost test_cuml: output: none includes: @@ -244,6 +251,16 @@ dependencies: - pip - pip: - xgboost>=2.0.1 + test_catboost: + common: + - output_types: conda + packages: + - numpy + - scipy + - scikit-learn + - pip + - pip: + - catboost test_cuml: common: - output_types: conda diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_catboost.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_catboost.py new file mode 100644 index 00000000000..922f3f385df --- /dev/null +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_catboost.py @@ -0,0 +1,129 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import numpy as np +import pandas as pd +import pytest +from catboost import CatBoostClassifier, CatBoostRegressor, Pool +from sklearn.datasets import make_classification, make_regression + +rng = np.random.default_rng(seed=42) + + +def assert_catboost_equal(expect, got, rtol=1e-7, atol=0.0): + if isinstance(expect, (tuple, list)): + assert len(expect) == len(got) + for e, g in zip(expect, got): + assert_catboost_equal(e, g, rtol, atol) + elif isinstance(expect, np.ndarray): + np.testing.assert_allclose(expect, got, rtol=rtol, atol=atol) + elif isinstance(expect, pd.DataFrame): + pd.testing.assert_frame_equal(expect, got) + elif isinstance(expect, pd.Series): + pd.testing.assert_series_equal(expect, got) + else: + assert expect == got + + +pytestmark = pytest.mark.assert_eq(fn=assert_catboost_equal) + + +@pytest.fixture +def regression_data(): + X, y = make_regression(n_samples=100, n_features=10, random_state=42) + return pd.DataFrame(X), pd.Series(y) + + +@pytest.fixture +def classification_data(): + X, y = make_classification( + n_samples=100, n_features=10, n_classes=2, random_state=42 + ) + return pd.DataFrame(X), pd.Series(y) + + +def test_catboost_regressor_with_dataframe(regression_data): + X, y = regression_data + model = CatBoostRegressor(iterations=10, verbose=0) + model.fit(X, y) + predictions = model.predict(X) + return predictions + + +def test_catboost_regressor_with_numpy(regression_data): + X, y = regression_data + model = CatBoostRegressor(iterations=10, verbose=0) + model.fit(X.values, y.values) + predictions = model.predict(X.values) + return predictions + + +def test_catboost_classifier_with_dataframe(classification_data): + X, y = classification_data + model = CatBoostClassifier(iterations=10, verbose=0) + model.fit(X, y) + predictions = model.predict(X) + return predictions + + +def test_catboost_classifier_with_numpy(classification_data): + X, y = classification_data + model = CatBoostClassifier(iterations=10, verbose=0) + model.fit(X.values, y.values) + predictions = model.predict(X.values) + return predictions + + +def test_catboost_with_pool_and_dataframe(regression_data): + X, y = regression_data + train_pool = Pool(X, y) + model = CatBoostRegressor(iterations=10, verbose=0) + model.fit(train_pool) + predictions = model.predict(X) + return predictions + + +def test_catboost_with_pool_and_numpy(regression_data): + X, y = regression_data + train_pool = Pool(X.values, y.values) + model = CatBoostRegressor(iterations=10, verbose=0) + model.fit(train_pool) + predictions = model.predict(X.values) + return predictions + + +def test_catboost_with_categorical_features(): + data = { + "numerical_feature": rng.standard_normal(100), + "categorical_feature": rng.choice(["A", "B", "C"], size=100), + "target": rng.integers(0, 2, size=100), + } + df = pd.DataFrame(data) + X = df[["numerical_feature", "categorical_feature"]] + y = df["target"] + cat_features = ["categorical_feature"] + model = CatBoostClassifier( + iterations=10, verbose=0, cat_features=cat_features + ) + model.fit(X, y) + predictions = model.predict(X) + return predictions + + +@pytest.mark.parametrize( + "X, y", + [ + ( + pd.DataFrame(rng.standard_normal(100, 5)), + pd.Series(rng.standard_normal(100)), + ), + (rng.standard_normal(100, 5), rng.standard_normal(100)), + ], +) +def test_catboost_train_test_split(X, y): + from sklearn.model_selection import train_test_split + + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) + model = CatBoostRegressor(iterations=10, verbose=0) + model.fit(X_train, y_train) + predictions = model.predict(X_test) + return len(X_train), len(X_test), len(y_train), len(y_test), predictions From b3467359c15f91614086e6a49a202996e43a10fb Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Thu, 7 Nov 2024 13:28:51 -0500 Subject: [PATCH 2/7] Update pr.yaml --- .github/workflows/pr.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index bc237cc73b0..62f3ddff017 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -37,6 +37,7 @@ jobs: - unit-tests-cudf-pandas - pandas-tests - pandas-tests-diff + - third-party-integration-tests-cudf-pandas secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.12 if: always() From 7ee0165d20c2aa6da0838e6943220a753c7914d3 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Thu, 7 Nov 2024 13:32:15 -0500 Subject: [PATCH 3/7] Update pr.yaml --- .github/workflows/pr.yaml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 62f3ddff017..b5778317d20 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -303,3 +303,14 @@ jobs: node_type: cpu4 build_type: pull-request run_script: "ci/cudf_pandas_scripts/pandas-tests/diff.sh" + third-party-integration-tests-cudf-pandas: + needs: wheel-build-cudf + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 + with: + build_type: pull-request + node_type: "gpu-v100-latest-1" + arch: "amd64" + container_image: "rapidsai/ci-conda:latest" + run_script: | + ci/cudf_pandas_scripts/third-party-integration/test.sh python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml From ab03a4dc07e693dec2ad53a99ab3e3cbefb86331 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Thu, 7 Nov 2024 17:41:42 -0800 Subject: [PATCH 4/7] pass a tuple --- .../third_party_integration_tests/tests/test_catboost.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_catboost.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_catboost.py index 922f3f385df..e7e990992f4 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_catboost.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_catboost.py @@ -113,7 +113,7 @@ def test_catboost_with_categorical_features(): "X, y", [ ( - pd.DataFrame(rng.standard_normal(100, 5)), + pd.DataFrame(rng.standard_normal((100, 5))), pd.Series(rng.standard_normal(100)), ), (rng.standard_normal(100, 5), rng.standard_normal(100)), From 1d0fb74252ff0b8bc5735488ae74057c6ec6c1c0 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Fri, 8 Nov 2024 07:06:30 -0800 Subject: [PATCH 5/7] pass a tuple again --- .../third_party_integration_tests/tests/test_catboost.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_catboost.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_catboost.py index e7e990992f4..04cc69231fe 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_catboost.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_catboost.py @@ -116,7 +116,7 @@ def test_catboost_with_categorical_features(): pd.DataFrame(rng.standard_normal((100, 5))), pd.Series(rng.standard_normal(100)), ), - (rng.standard_normal(100, 5), rng.standard_normal(100)), + (rng.standard_normal((100, 5)), rng.standard_normal(100)), ], ) def test_catboost_train_test_split(X, y): From 614e406d480baf676b92d3833f0b1d712eda89bd Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Tue, 12 Nov 2024 17:50:54 -0800 Subject: [PATCH 6/7] use conda --- .../third_party_integration_tests/dependencies.yaml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml b/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml index 72a4e9ed175..2c7330d5ee6 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml @@ -258,9 +258,7 @@ dependencies: - numpy - scipy - scikit-learn - - pip - - pip: - - catboost + - catboost test_cuml: common: - output_types: conda From f86494d1a4f18f464aa2d14e8727aa9de5fb25d8 Mon Sep 17 00:00:00 2001 From: Matthew Murray Date: Tue, 12 Nov 2024 20:54:55 -0800 Subject: [PATCH 7/7] remove CI changes --- .github/workflows/pr.yaml | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index b5778317d20..bc237cc73b0 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -37,7 +37,6 @@ jobs: - unit-tests-cudf-pandas - pandas-tests - pandas-tests-diff - - third-party-integration-tests-cudf-pandas secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.12 if: always() @@ -303,14 +302,3 @@ jobs: node_type: cpu4 build_type: pull-request run_script: "ci/cudf_pandas_scripts/pandas-tests/diff.sh" - third-party-integration-tests-cudf-pandas: - needs: wheel-build-cudf - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 - with: - build_type: pull-request - node_type: "gpu-v100-latest-1" - arch: "amd64" - container_image: "rapidsai/ci-conda:latest" - run_script: | - ci/cudf_pandas_scripts/third-party-integration/test.sh python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml