Merge branch 'branch-25.02' into libucx-in-devcontainers

rapidsai · Dec 11, 2024 · commit 656a6e4
2 parents: 5ffc8c4 + 858bd9f

Showing 8 changed files with 216 additions and 89 deletions.
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -9,7 +9,7 @@ channels:
 dependencies:
 - c-compiler
 - cmake>=3.26.4,!=3.30.0
-- cuda-python>=11.7.1,<12.0a0,<=11.8.3
+- cuda-python>=11.8.5,<12.0a0
 - cuda-version=11.8
 - cudatoolkit
 - cudf==25.2.*,>=0.0.0a0

diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -12,7 +12,7 @@ dependencies:
 - cuda-cudart-dev
 - cuda-nvcc
 - cuda-profiler-api
-- cuda-python>=12.0,<13.0a0,<=12.6.0
+- cuda-python>=12.6.2,<13.0a0
 - cuda-version=12.5
 - cudf==25.2.*,>=0.0.0a0
 - cupy>=12.0.0

diff --git a/conda/recipes/cuml/meta.yaml b/conda/recipes/cuml/meta.yaml
@@ -58,10 +58,10 @@ requirements:
     - cuda-version ={{ cuda_version }}
     {% if cuda_major == "11" %}
     - cudatoolkit
-    - cuda-python >=11.7.1,<12.0a0,<=11.8.3
+    - cuda-python >=11.8.5,<12.0a0
     {% else %}
     - cuda-cudart-dev
-    - cuda-python >=12.0,<13.0a0,<=12.6.0
+    - cuda-python >=12.6.2,<13.0a0
     {% endif %}
     - cudf ={{ minor_version }}
     - cython >=3.0.0
@@ -77,10 +77,10 @@ requirements:
     - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
     {% if cuda_major == "11" %}
     - cudatoolkit
-    - cuda-python >=11.7.1,<12.0a0,<=11.8.3
+    - cuda-python >=11.8.5,<12.0a0
     {% else %}
     - cuda-cudart
-    - cuda-python >=12.0,<13.0a0,<=12.6.0
+    - cuda-python >=12.6.2,<13.0a0
     {% endif %}
     - cudf ={{ minor_version }}
     - cupy >=12.0.0

diff --git a/cpp/src/glm/qn/mg/standardization.cuh b/cpp/src/glm/qn/mg/standardization.cuh
@@ -161,6 +161,16 @@ void mean(const raft::handle_t& handle,
   auto stream  = handle.get_stream();
   auto& comm   = handle.get_comms();
 
+  if (X.nnz == 0) {
+    SimpleVec<T> meanVec(mean_vector, D);
+    meanVec.fill(0., stream);
+
+    // call allreduces on zeroes to sync with other GPUs to avoid hanging
+    comm.allreduce(mean_vector, mean_vector, D, raft::comms::op_t::SUM, stream);
+    comm.sync_stream(stream);
+    return;
+  }
+
   int chunk_size = 500000;  // split matrix by rows for better numeric precision
   rmm::device_uvector<I> buff_row_ids(chunk_size + 1, stream);
 
@@ -200,27 +210,24 @@ void mean_stddev(const raft::handle_t& handle,
   auto stream = handle.get_stream();
   int D       = X.n;
 
-  if (X.nnz == 0) {
-    SimpleVec<T> meanVec(mean_vector, D);
-    meanVec.fill(0., stream);
-
-    SimpleVec<T> stddevVec(stddev_vector, D);
-    stddevVec.fill(0., stream);
-    return;
-  }
-
   mean(handle, X, n_samples, mean_vector);
 
   // calculate stdev.S
-  rmm::device_uvector<T> X_values_squared(X.nnz, stream);
-  raft::copy(X_values_squared.data(), X.values, X.nnz, stream);
-  auto square_op = [] __device__(const T a) { return a * a; };
-  raft::linalg::unaryOp(X_values_squared.data(), X_values_squared.data(), X.nnz, square_op, stream);
-
-  auto X_squared =
-    SimpleSparseMat<T, I>(X_values_squared.data(), X.cols, X.row_ids, X.nnz, X.m, X.n);
 
-  mean(handle, X_squared, n_samples, stddev_vector);
+  if (X.nnz == 0) {
+    mean(handle, X, n_samples, stddev_vector);
+  } else {
+    rmm::device_uvector<T> X_values_squared(X.nnz, stream);
+    raft::copy(X_values_squared.data(), X.values, X.nnz, stream);
+    auto square_op = [] __device__(const T a) { return a * a; };
+    raft::linalg::unaryOp(
+      X_values_squared.data(), X_values_squared.data(), X.nnz, square_op, stream);
+
+    auto X_squared =
+      SimpleSparseMat<T, I>(X_values_squared.data(), X.cols, X.row_ids, X.nnz, X.m, X.n);
+
+    mean(handle, X_squared, n_samples, stddev_vector);
+  }
 
   T weight               = n_samples / T(n_samples - 1);
   auto submean_no_neg_op = [weight] __device__(const T a, const T b) -> T {

diff --git a/dependencies.yaml b/dependencies.yaml
@@ -198,11 +198,11 @@ dependencies:
           - matrix:
               cuda: "12.*"
             packages:
-              - cuda-python>=12.0,<13.0a0,<=12.6.0
+              - cuda-python>=12.6.2,<13.0a0
           - matrix:
               cuda: "11.*"
             packages:
-              - cuda-python>=11.7.1,<12.0a0,<=11.8.3
+              - cuda-python>=11.8.5,<12.0a0
           - matrix:
             packages:
               - cuda-python

diff --git a/python/cuml/cuml/dask/neighbors/kneighbors_classifier.py b/python/cuml/cuml/dask/neighbors/kneighbors_classifier.py
@@ -114,8 +114,6 @@ def fit(self, X, y):
                 # Dask-expr does not support numerical column names
                 # See: https://github.com/dask/dask-expr/issues/1015
                 _y = y
-                if hasattr(y, "to_legacy_dataframe"):
-                    _y = y.to_legacy_dataframe()
                 n_targets = len(_y.columns)
                 for i in range(n_targets):
                     uniq_labels.append(_y.iloc[:, i].unique())

diff --git a/python/cuml/cuml/tests/dask/test_dask_arr_utils.py b/python/cuml/cuml/tests/dask/test_dask_arr_utils.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -50,6 +50,10 @@ def test_to_sparse_dask_array(input_type, nrows, ncols, client):
 
     a = cupyx.scipy.sparse.random(nrows, ncols, format="csr", dtype=cp.float32)
     if input_type == "dask_dataframe":
+        pytest.xfail(
+            reason="Dask nightlies break task fusing for this, "
+            "issue https://github.com/rapidsai/cuml/issues/6169"
+        )
         df = cudf.DataFrame(a.todense())
         inp = dask_cudf.from_cudf(df, npartitions=2)
     elif input_type == "dask_array":