From 372a9a54df42fc65b2dd3e9fc6c33e3a61ef8856 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR
Date: Wed, 25 May 2022 12:41:16 -0500
Subject: [PATCH 1/4] Pin `dask` & `distributed` for release (#680)

Pinnings are in line with https://github.com/rapidsai/cudf/pull/10965

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)
  - Ray Douglass (https://github.com/raydouglass)

URL: https://github.com/rapidsai/raft/pull/680
---
 ci/gpu/build.sh                          | 4 ++--
 conda/environments/raft_dev_cuda11.0.yml | 4 ++--
 conda/environments/raft_dev_cuda11.2.yml | 4 ++--
 conda/environments/raft_dev_cuda11.4.yml | 4 ++--
 conda/environments/raft_dev_cuda11.5.yml | 4 ++--
 conda/recipes/pyraft/meta.yaml           | 4 ++--
 6 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
index cd4030e0b2..ba6bed2fc2 100644
--- a/ci/gpu/build.sh
+++ b/ci/gpu/build.sh
@@ -73,8 +73,8 @@ fi
 # Install the master version of dask, distributed, and dask-ml
 gpuci_logger "Install the master version of dask and distributed"
 set -x
-pip install "git+https://github.com/dask/distributed.git@main" --upgrade --no-deps
-pip install "git+https://github.com/dask/dask.git@main" --upgrade --no-deps
+pip install "git+https://github.com/dask/distributed.git@2022.05.1" --upgrade --no-deps
+pip install "git+https://github.com/dask/dask.git@2022.05.1" --upgrade --no-deps
 set +x
 
 # Install pre-built conda packages from previous CI step
diff --git a/conda/environments/raft_dev_cuda11.0.yml b/conda/environments/raft_dev_cuda11.0.yml
index 30df28aa52..e34a9e0423 100644
--- a/conda/environments/raft_dev_cuda11.0.yml
+++ b/conda/environments/raft_dev_cuda11.0.yml
@@ -26,8 +26,8 @@ dependencies:
 - pip:
     - sphinx_markdown_tables
     - breathe
-    - git+https://github.com/dask/dask.git@main
-    - git+https://github.com/dask/distributed.git@main
+    - git+https://github.com/dask/dask.git@2022.05.1
+    - git+https://github.com/dask/distributed.git@2022.05.1
 
 # rapids-build-env, notebook-env and doc-env are defined in
 # https://docs.rapids.ai/maintainers/depmgmt/
diff --git a/conda/environments/raft_dev_cuda11.2.yml b/conda/environments/raft_dev_cuda11.2.yml
index 949be16dca..7d25499710 100644
--- a/conda/environments/raft_dev_cuda11.2.yml
+++ b/conda/environments/raft_dev_cuda11.2.yml
@@ -26,8 +26,8 @@ dependencies:
 - pip:
     - sphinx_markdown_tables
     - breathe
-    - git+https://github.com/dask/dask.git@main
-    - git+https://github.com/dask/distributed.git@main
+    - git+https://github.com/dask/dask.git@2022.05.1
+    - git+https://github.com/dask/distributed.git@2022.05.1
 
 # rapids-build-env, notebook-env and doc-env are defined in
 # https://docs.rapids.ai/maintainers/depmgmt/
diff --git a/conda/environments/raft_dev_cuda11.4.yml b/conda/environments/raft_dev_cuda11.4.yml
index 195e6fad2b..12e8cb0ab7 100644
--- a/conda/environments/raft_dev_cuda11.4.yml
+++ b/conda/environments/raft_dev_cuda11.4.yml
@@ -26,8 +26,8 @@ dependencies:
 - pip:
     - sphinx_markdown_tables
     - breathe
-    - git+https://github.com/dask/dask.git@main
-    - git+https://github.com/dask/distributed.git@main
+    - git+https://github.com/dask/dask.git@2022.05.1
+    - git+https://github.com/dask/distributed.git@2022.05.1
 
 # rapids-build-env, notebook-env and doc-env are defined in
 # https://docs.rapids.ai/maintainers/depmgmt/
diff --git a/conda/environments/raft_dev_cuda11.5.yml b/conda/environments/raft_dev_cuda11.5.yml
index 2fb2abbc85..bc03362e5d 100644
--- a/conda/environments/raft_dev_cuda11.5.yml
+++ b/conda/environments/raft_dev_cuda11.5.yml
@@ -27,8 +27,8 @@ dependencies:
 - pip:
     - sphinx_markdown_tables
     - breathe
-    - git+https://github.com/dask/dask.git@main
-    - git+https://github.com/dask/distributed.git@main
+    - git+https://github.com/dask/dask.git@2022.05.1
+    - git+https://github.com/dask/distributed.git@2022.05.1
 
 # rapids-build-env, notebook-env and doc-env are defined in
 # https://docs.rapids.ai/maintainers/depmgmt/
diff --git a/conda/recipes/pyraft/meta.yaml b/conda/recipes/pyraft/meta.yaml
index a640c4beb7..27e1153270 100644
--- a/conda/recipes/pyraft/meta.yaml
+++ b/conda/recipes/pyraft/meta.yaml
@@ -48,8 +48,8 @@ requirements:
     - ucx >={{ ucx_version }}
    - ucx-py {{ ucx_py_version }}
     - ucx-proc=*=gpu
-    - dask>=2022.03.0
-    - distributed>=2022.03.0
+    - dask==2022.05.1
+    - distributed==2022.05.1
     - cuda-python >=11.5,<12.0
     - joblib >=0.11
     - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }}
From 29b3a18947e83ce242556c39b9155d9761df390a Mon Sep 17 00:00:00 2001
From: Mahesh Doijade <36705640+mdoijade@users.noreply.github.com>
Date: Thu, 26 May 2022 01:50:08 +0530
Subject: [PATCH 2/4] fix race in fusedL2knn smem read/write by adding a
 syncwarp (#679)

-- fix a race in the fusedL2knn smem read/write by adding a syncwarp, as
   reported by the racecheck tool of compute-sanitizer.
-- this addresses issue https://github.com/rapidsai/raft/issues/676

Authors:
  - Mahesh Doijade (https://github.com/mdoijade)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/679
---
 cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh b/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh
index 7424a5ff81..f8532e52a0 100644
--- a/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh
+++ b/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh
@@ -446,6 +446,7 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN(const DataT* x
         }
       }
     }
+    __syncwarp();
     const int finalNumVals = raft::shfl(numValsWarpTopK[i], 31);
     loadWarpQShmem(heapArr[i], &shDumpKV[0], rowId, numOfNN);
     updateSortedWarpQ(
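For illustration, a minimal self-contained sketch of the hazard class this one-line fix addresses; the kernel and names below are invented for the example and are not the fusedL2kNN code. Lanes of a warp publish values to shared memory and then read slots written by other lanes; with independent thread scheduling on Volta and newer GPUs the warp is not guaranteed to run in lockstep, so the cross-lane read can race the write unless a `__syncwarp()` re-converges the warp first:

```cuda
#include <cstdio>

// Each lane writes its own shared-memory slot, then reads a *different*
// lane's slot. Without the __syncwarp(), compute-sanitizer's racecheck
// tool reports a shared-memory race on smem.
__global__ void warpShareKernel(const int* in, int* out)
{
  __shared__ int smem[32];
  const int lane = threadIdx.x & 31;

  smem[lane] = in[lane];              // write own slot
  __syncwarp();                       // re-converge warp; make writes visible

  out[lane] = smem[(lane + 1) & 31];  // cross-lane read
}

int main()
{
  int h_in[32], h_out[32];
  for (int i = 0; i < 32; ++i) h_in[i] = i;

  int *d_in, *d_out;
  cudaMalloc(&d_in, sizeof(h_in));
  cudaMalloc(&d_out, sizeof(h_out));
  cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);

  warpShareKernel<<<1, 32>>>(d_in, d_out);

  cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
  printf("out[0] = %d (expect 1)\n", h_out[0]);

  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}
```

Deleting the `__syncwarp()` and running the binary under `compute-sanitizer --tool racecheck` reproduces the kind of hazard report referenced in #676; with the barrier in place the run is clean.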
From 88334f05c2c5db0338be272d3746765e19d62322 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet"
Date: Wed, 25 May 2022 17:30:40 -0400
Subject: [PATCH 3/4] Adding handle and stream to pylibraft (#683)

Authors:
  - Corey J. Nolet (https://github.com/cjnolet)

Approvers:
  - Brad Rees (https://github.com/BradReesWork)

URL: https://github.com/rapidsai/raft/pull/683
---
 python/pylibraft/pylibraft/common/cuda.pyx   | 84 ++++++++++++++++++
 python/pylibraft/pylibraft/common/handle.pxd |  2 +-
 python/pylibraft/pylibraft/common/handle.pyx | 89 ++++++++++++++++++++
 3 files changed, 174 insertions(+), 1 deletion(-)
 create mode 100644 python/pylibraft/pylibraft/common/cuda.pyx
 create mode 100644 python/pylibraft/pylibraft/common/handle.pyx

diff --git a/python/pylibraft/pylibraft/common/cuda.pyx b/python/pylibraft/pylibraft/common/cuda.pyx
new file mode 100644
index 0000000000..eb48f64cf1
--- /dev/null
+++ b/python/pylibraft/pylibraft/common/cuda.pyx
@@ -0,0 +1,84 @@
+#
+# Copyright (c) 2022, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# cython: profile=False
+# distutils: language = c++
+# cython: embedsignature = True
+# cython: language_level = 3
+
+from cuda.ccudart cimport(
+    cudaStream_t,
+    cudaError_t,
+    cudaSuccess,
+    cudaStreamCreate,
+    cudaStreamDestroy,
+    cudaStreamSynchronize,
+    cudaGetLastError,
+    cudaGetErrorString,
+    cudaGetErrorName
+)
+
+
+class CudaRuntimeError(RuntimeError):
+    def __init__(self, extraMsg=None):
+        cdef cudaError_t e = cudaGetLastError()
+        cdef bytes errMsg = cudaGetErrorString(e)
+        cdef bytes errName = cudaGetErrorName(e)
+        msg = "Error! %s reason='%s'" % (errName.decode(), errMsg.decode())
+        if extraMsg is not None:
+            msg += " extraMsg='%s'" % extraMsg
+        super(CudaRuntimeError, self).__init__(msg)
+
+
+cdef class Stream:
+    """
+    Stream represents a thin wrapper around cudaStream_t and its operations.
+
+    Examples
+    --------
+
+    .. code-block:: python
+
+        from raft.common.cuda import Stream
+        stream = Stream()
+        stream.sync()
+        del stream  # optional!
+    """
+    def __cinit__(self):
+        cdef cudaStream_t stream
+        cdef cudaError_t e = cudaStreamCreate(&stream)
+        if e != cudaSuccess:
+            raise CudaRuntimeError("Stream create")
+        self.s = stream
+
+    def __dealloc__(self):
+        self.sync()
+        cdef cudaError_t e = cudaStreamDestroy(self.s)
+        if e != cudaSuccess:
+            raise CudaRuntimeError("Stream destroy")
+
+    def sync(self):
+        """
+        Synchronize on the cudastream owned by this object. Note that this
+        could raise an exception due to issues with previous asynchronous
+        launches.
+        """
+        cdef cudaError_t e = cudaStreamSynchronize(self.s)
+        if e != cudaSuccess:
+            raise CudaRuntimeError("Stream sync")
+
+    cdef cudaStream_t getStream(self):
+        return self.s
diff --git a/python/pylibraft/pylibraft/common/handle.pxd b/python/pylibraft/pylibraft/common/handle.pxd
index bc248a335b..6504a122f7 100644
--- a/python/pylibraft/pylibraft/common/handle.pxd
+++ b/python/pylibraft/pylibraft/common/handle.pxd
@@ -25,7 +25,7 @@ from rmm._lib.cuda_stream_pool cimport cuda_stream_pool
 from libcpp.memory cimport shared_ptr
 from libcpp.memory cimport unique_ptr
 
-cdef extern from "raft/handle.hpp" namespace "raft" nogil:
+cdef extern from "raft/core/handle.hpp" namespace "raft" nogil:
     cdef cppclass handle_t:
         handle_t() except +
         handle_t(cuda_stream_view stream_view) except +
diff --git a/python/pylibraft/pylibraft/common/handle.pyx b/python/pylibraft/pylibraft/common/handle.pyx
new file mode 100644
index 0000000000..83a4676076
--- /dev/null
+++ b/python/pylibraft/pylibraft/common/handle.pyx
@@ -0,0 +1,89 @@
+#
+# Copyright (c) 2022, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# cython: profile=False
+# distutils: language = c++
+# cython: embedsignature = True
+# cython: language_level = 3
+
+# import raft
+from rmm._lib.cuda_stream_view cimport cuda_stream_per_thread
+from rmm._lib.cuda_stream_view cimport cuda_stream_view
+
+from .cuda cimport Stream
+from .cuda import CudaRuntimeError
+
+
+cdef class Handle:
+    """
+    Handle is a lightweight Python wrapper around the corresponding C++
+    class of handle_t exposed by RAFT's C++ interface. Refer to the header
+    file raft/core/handle.hpp for interface-level details of this struct.
+
+    Examples
+    --------
+
+    .. code-block:: python
+
+        from raft.common import Stream, Handle
+        stream = Stream()
+        handle = Handle(stream)
+
+        # call algos here
+
+        # final sync of all work launched in the stream of this handle
+        # this is the same as `raft.cuda.Stream.sync()` call, but safer in
+        # case the default stream inside the `handle_t` is being used
+        handle.sync()
+        del handle  # optional!
+    """
+
+    def __cinit__(self, stream: Stream = None, n_streams=0):
+        self.n_streams = n_streams
+        if n_streams > 0:
+            self.stream_pool.reset(new cuda_stream_pool(n_streams))
+
+        cdef cuda_stream_view c_stream
+        if stream is None:
+            # this constructor will construct a "main" handle on
+            # per-thread default stream, which is non-blocking
+            self.c_obj.reset(new handle_t(cuda_stream_per_thread,
+                                          self.stream_pool))
+        else:
+            # this constructor constructs a handle on user stream
+            c_stream = cuda_stream_view(stream.getStream())
+            self.c_obj.reset(new handle_t(c_stream,
+                                          self.stream_pool))
+
+    def sync(self):
+        """
+        Issues a sync on the stream set for this handle.
+        """
+        self.c_obj.get()[0].sync_stream()
+
+    def getHandle(self):
+        return self.c_obj.get()
+
+    def __getstate__(self):
+        return self.n_streams
+
+    def __setstate__(self, state):
+        self.n_streams = state
+        if self.n_streams > 0:
+            self.stream_pool.reset(new cuda_stream_pool(self.n_streams))
+
+        self.c_obj.reset(new handle_t(cuda_stream_per_thread,
+                                      self.stream_pool))
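For readers coming from the C++ side, a rough sketch of what the new wrappers drive underneath, assuming the RAFT and RMM headers are on the include path. It mirrors the `handle_t(cuda_stream_view)` constructor declared in handle.pxd and the `sync_stream()` call behind `Handle.sync()`, but it is a sketch, not the generated wrapper code:

```cuda
#include <raft/core/handle.hpp>      // handle_t, per the pxd change above
#include <rmm/cuda_stream_view.hpp>  // rmm::cuda_stream_view

int main()
{
  // Stream.__cinit__: create a raw CUDA stream
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // Handle(stream): wrap the user stream in a cuda_stream_view and
  // construct the handle on it
  raft::handle_t handle{rmm::cuda_stream_view{stream}};

  // ... launch RAFT algorithms that take `handle` here ...

  handle.sync_stream();       // Handle.sync()
  cudaStreamDestroy(stream);  // Stream.__dealloc__
  return 0;
}
```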
From 120d4780a1f00106e86e86aed4949a42c8119d39 Mon Sep 17 00:00:00 2001
From: Micka <9810050+lowener@users.noreply.github.com>
Date: Thu, 26 May 2022 03:08:25 +0200
Subject: [PATCH 4/4] Revert print vector changes because of std::vector<bool>
 (#681)

The specialization of `std::vector` for `T=bool` unfortunately causes a
compilation issue in cuml, because the `data()` member function is not
implemented and the elements may not be stored contiguously. (Link to the
CI failure:
https://gpuci.gpuopenanalytics.com/job/rapidsai/job/gpuci/job/cuml/job/prb/job/cuml-cpu-cuda-build-arm64/CUDA=11.5/1364/console)

cc @achirkin

Authors:
  - Micka (https://github.com/lowener)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/681
---
 cpp/include/raft/core/cudart_utils.hpp | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/cpp/include/raft/core/cudart_utils.hpp b/cpp/include/raft/core/cudart_utils.hpp
index a1e7e6bc32..ce0d2a7025 100644
--- a/cpp/include/raft/core/cudart_utils.hpp
+++ b/cpp/include/raft/core/cudart_utils.hpp
@@ -36,7 +36,6 @@
 #include
 #include
 #include
-#include <vector>
 
 ///@todo: enable once logging has been enabled in raft
 //#include "logger.hpp"
@@ -303,10 +302,10 @@ void print_device_vector(const char* variable_name,
                          size_t componentsCount,
                          OutStream& out)
 {
-  std::vector<T> host_mem(componentsCount);
-  CUDA_CHECK(
-    cudaMemcpy(host_mem.data(), devMem, componentsCount * sizeof(T), cudaMemcpyDeviceToHost));
-  print_host_vector(variable_name, host_mem.data(), componentsCount, out);
+  T* host_mem = new T[componentsCount];
+  CUDA_CHECK(cudaMemcpy(host_mem, devMem, componentsCount * sizeof(T), cudaMemcpyDeviceToHost));
+  print_host_vector(variable_name, host_mem, componentsCount, out);
+  delete[] host_mem;
 }
 
 /**
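A minimal sketch of the failure mode behind this revert; the helper names below are invented for illustration. `std::vector<bool>` is a bit-packed specialization that provides no `data()` member and need not store its elements contiguously, so the generic `host_mem.data()` pattern fails to compile as soon as `print_device_vector` is instantiated with `T = bool`, while a plain heap array, as in the diff above, is contiguous for every `T`:

```cuda
#include <cstddef>
#include <cstdio>
#include <vector>

// Shape of the old implementation: fine for most T, but instantiating it
// with T = bool fails to compile because std::vector<bool> has no data().
template <typename T>
const T* first_element(const std::vector<T>& v)
{
  return v.data();  // ill-formed for std::vector<bool>
}

// Shape of the reverted implementation: a raw heap array is contiguous for
// every T, including bool, at the cost of a manual delete[].
template <typename T>
void copy_and_print(const T* src, std::size_t n)
{
  T* host_mem = new T[n];
  for (std::size_t i = 0; i < n; ++i) host_mem[i] = src[i];  // stand-in for cudaMemcpy
  for (std::size_t i = 0; i < n; ++i) printf("%d ", static_cast<int>(host_mem[i]));
  printf("\n");
  delete[] host_mem;
}

int main()
{
  bool flags[4] = {true, false, true, true};
  copy_and_print(flags, 4);  // works with the raw-array version

  // std::vector<bool> bits(4);
  // first_element(bits);  // uncomment to see the compilation error
  return 0;
}
```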