Merge remote-tracking branch 'upstream/branch-22.08' into thrust-incl…

…udes
rapidsai · May 26, 2022 · 9155ae2 · 9155ae2
2 parents ff9b886 + f8c0676
commit 9155ae2
Show file tree

Hide file tree

Showing 11 changed files with 191 additions and 18 deletions.
diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
@@ -73,8 +73,8 @@ fi
 # Install the master version of dask, distributed, and dask-ml
 gpuci_logger "Install the master version of dask and distributed"
 set -x
-pip install "git+https://github.com/dask/distributed.git@main" --upgrade --no-deps
-pip install "git+https://github.com/dask/dask.git@main" --upgrade --no-deps
+pip install "git+https://github.com/dask/distributed.git@2022.05.1" --upgrade --no-deps
+pip install "git+https://github.com/dask/dask.git@2022.05.1" --upgrade --no-deps
 set +x
 
 # Install pre-built conda packages from previous CI step

diff --git a/conda/environments/raft_dev_cuda11.0.yml b/conda/environments/raft_dev_cuda11.0.yml
@@ -26,8 +26,8 @@ dependencies:
 - pip:
     - sphinx_markdown_tables
     - breathe
-    - git+https://github.com/dask/dask.git@main
-    - git+https://github.com/dask/distributed.git@main
+    - git+https://github.com/dask/dask.git@2022.05.1
+    - git+https://github.com/dask/distributed.git@2022.05.1
 
 # rapids-build-env, notebook-env and doc-env are defined in
 # https://docs.rapids.ai/maintainers/depmgmt/

diff --git a/conda/environments/raft_dev_cuda11.2.yml b/conda/environments/raft_dev_cuda11.2.yml
@@ -26,8 +26,8 @@ dependencies:
 - pip:
     - sphinx_markdown_tables
     - breathe
-    - git+https://github.com/dask/dask.git@main
-    - git+https://github.com/dask/distributed.git@main
+    - git+https://github.com/dask/dask.git@2022.05.1
+    - git+https://github.com/dask/distributed.git@2022.05.1
 
 # rapids-build-env, notebook-env and doc-env are defined in
 # https://docs.rapids.ai/maintainers/depmgmt/

diff --git a/conda/environments/raft_dev_cuda11.4.yml b/conda/environments/raft_dev_cuda11.4.yml
@@ -26,8 +26,8 @@ dependencies:
 - pip:
     - sphinx_markdown_tables
     - breathe
-    - git+https://github.com/dask/dask.git@main
-    - git+https://github.com/dask/distributed.git@main
+    - git+https://github.com/dask/dask.git@2022.05.1
+    - git+https://github.com/dask/distributed.git@2022.05.1
 
 # rapids-build-env, notebook-env and doc-env are defined in
 # https://docs.rapids.ai/maintainers/depmgmt/

diff --git a/conda/environments/raft_dev_cuda11.5.yml b/conda/environments/raft_dev_cuda11.5.yml
@@ -27,8 +27,8 @@ dependencies:
 - pip:
     - sphinx_markdown_tables
     - breathe
-    - git+https://github.com/dask/dask.git@main
-    - git+https://github.com/dask/distributed.git@main
+    - git+https://github.com/dask/dask.git@2022.05.1
+    - git+https://github.com/dask/distributed.git@2022.05.1
 
 # rapids-build-env, notebook-env and doc-env are defined in
 # https://docs.rapids.ai/maintainers/depmgmt/

diff --git a/conda/recipes/pyraft/meta.yaml b/conda/recipes/pyraft/meta.yaml
@@ -48,8 +48,8 @@ requirements:
     - ucx >={{ ucx_version }}
     - ucx-py {{ ucx_py_version }}
     - ucx-proc=*=gpu
-    - dask>=2022.03.0
-    - distributed>=2022.03.0
+    - dask==2022.05.1
+    - distributed==2022.05.1
     - cuda-python >=11.5,<12.0
     - joblib >=0.11
     - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }}

diff --git a/cpp/include/raft/core/cudart_utils.hpp b/cpp/include/raft/core/cudart_utils.hpp
@@ -36,7 +36,6 @@
 #include <iomanip>
 #include <iostream>
 #include <mutex>
-#include <vector>
 
 ///@todo: enable once logging has been enabled in raft
 //#include "logger.hpp"
@@ -303,10 +302,10 @@ void print_device_vector(const char* variable_name,
                          size_t componentsCount,
                          OutStream& out)
 {
-  std::vector<T> host_mem(componentsCount);
-  CUDA_CHECK(
-    cudaMemcpy(host_mem.data(), devMem, componentsCount * sizeof(T), cudaMemcpyDeviceToHost));
-  print_host_vector(variable_name, host_mem.data(), componentsCount, out);
+  T* host_mem = new T[componentsCount];
+  CUDA_CHECK(cudaMemcpy(host_mem, devMem, componentsCount * sizeof(T), cudaMemcpyDeviceToHost));
+  print_host_vector(variable_name, host_mem, componentsCount, out);
+  delete[] host_mem;
 }
 
 /**

diff --git a/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh b/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh
@@ -446,6 +446,7 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN(const DataT* x
                   }
                 }
               }
+              __syncwarp();
               const int finalNumVals = raft::shfl(numValsWarpTopK[i], 31);
               loadWarpQShmem<Policy, Pair>(heapArr[i], &shDumpKV[0], rowId, numOfNN);
               updateSortedWarpQ<Pair, myWarpSelect::kNumWarpQRegisters>(

diff --git a/python/pylibraft/pylibraft/common/cuda.pyx b/python/pylibraft/pylibraft/common/cuda.pyx
@@ -0,0 +1,84 @@
+#
+# Copyright (c) 2022, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# cython: profile=False
+# distutils: language = c++
+# cython: embedsignature = True
+# cython: language_level = 3
+
+from cuda.ccudart cimport(
+    cudaStream_t,
+    cudaError_t,
+    cudaSuccess,
+    cudaStreamCreate,
+    cudaStreamDestroy,
+    cudaStreamSynchronize,
+    cudaGetLastError,
+    cudaGetErrorString,
+    cudaGetErrorName
+)
+
+
+class CudaRuntimeError(RuntimeError):
+    def __init__(self, extraMsg=None):
+        cdef cudaError_t e = cudaGetLastError()
+        cdef bytes errMsg = cudaGetErrorString(e)
+        cdef bytes errName = cudaGetErrorName(e)
+        msg = "Error! %s reason='%s'" % (errName.decode(), errMsg.decode())
+        if extraMsg is not None:
+            msg += " extraMsg='%s'" % extraMsg
+        super(CudaRuntimeError, self).__init__(msg)
+
+
+cdef class Stream:
+    """
+    Stream represents a thin-wrapper around cudaStream_t and its operations.
+
+    Examples
+    --------
+
+    .. code-block:: python
+
+        from raft.common.cuda import Stream
+        stream = Stream()
+        stream.sync()
+        del stream  # optional!
+    """
+    def __cinit__(self):
+        cdef cudaStream_t stream
+        cdef cudaError_t e = cudaStreamCreate(&stream)
+        if e != cudaSuccess:
+            raise CudaRuntimeError("Stream create")
+        self.s = stream
+
+    def __dealloc__(self):
+        self.sync()
+        cdef cudaError_t e = cudaStreamDestroy(self.s)
+        if e != cudaSuccess:
+            raise CudaRuntimeError("Stream destroy")
+
+    def sync(self):
+        """
+        Synchronize on the cudastream owned by this object. Note that this
+        could raise exception due to issues with previous asynchronous
+        launches
+        """
+        cdef cudaError_t e = cudaStreamSynchronize(self.s)
+        if e != cudaSuccess:
+            raise CudaRuntimeError("Stream sync")
+
+    cdef cudaStream_t getStream(self):
+        return self.s
diff --git a/python/pylibraft/pylibraft/common/handle.pxd b/python/pylibraft/pylibraft/common/handle.pxd
@@ -25,7 +25,7 @@ from rmm._lib.cuda_stream_pool cimport cuda_stream_pool
 from libcpp.memory cimport shared_ptr
 from libcpp.memory cimport unique_ptr
 
-cdef extern from "raft/handle.hpp" namespace "raft" nogil:
+cdef extern from "raft/core/handle.hpp" namespace "raft" nogil:
     cdef cppclass handle_t:
         handle_t() except +
         handle_t(cuda_stream_view stream_view) except +

diff --git a/python/pylibraft/pylibraft/common/handle.pyx b/python/pylibraft/pylibraft/common/handle.pyx
@@ -0,0 +1,89 @@
+#
+# Copyright (c) 2022, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# cython: profile=False
+# distutils: language = c++
+# cython: embedsignature = True
+# cython: language_level = 3
+
+# import raft
+from rmm._lib.cuda_stream_view cimport cuda_stream_per_thread
+from rmm._lib.cuda_stream_view cimport cuda_stream_view
+
+from .cuda cimport Stream
+from .cuda import CudaRuntimeError
+
+
+cdef class Handle:
+    """
+    Handle is a lightweight python wrapper around the corresponding C++ class
+    of handle_t exposed by RAFT's C++ interface. Refer to the header file
+    raft/handle.hpp for interface level details of this struct
+
+    Examples
+    --------
+
+    .. code-block:: python
+
+        from raft.common import Stream, Handle
+        stream = Stream()
+        handle = Handle(stream)
+
+        # call algos here
+
+        # final sync of all work launched in the stream of this handle
+        # this is same as `raft.cuda.Stream.sync()` call, but safer in case
+        # the default stream inside the `handle_t` is being used
+        handle.sync()
+        del handle  # optional!
+    """
+
+    def __cinit__(self, stream: Stream = None, n_streams=0):
+        self.n_streams = n_streams
+        if n_streams > 0:
+            self.stream_pool.reset(new cuda_stream_pool(n_streams))
+
+        cdef cuda_stream_view c_stream
+        if stream is None:
+            # this constructor will construct a "main" handle on
+            # per-thread default stream, which is non-blocking
+            self.c_obj.reset(new handle_t(cuda_stream_per_thread,
+                                          self.stream_pool))
+        else:
+            # this constructor constructs a handle on user stream
+            c_stream = cuda_stream_view(stream.getStream())
+            self.c_obj.reset(new handle_t(c_stream,
+                                          self.stream_pool))
+
+    def sync(self):
+        """
+        Issues a sync on the stream set for this handle.
+        """
+        self.c_obj.get()[0].sync_stream()
+
+    def getHandle(self):
+        return <size_t> self.c_obj.get()
+
+    def __getstate__(self):
+        return self.n_streams
+
+    def __setstate__(self, state):
+        self.n_streams = state
+        if self.n_streams > 0:
+            self.stream_pool.reset(new cuda_stream_pool(self.n_streams))
+
+        self.c_obj.reset(new handle_t(cuda_stream_per_thread,
+                                      self.stream_pool))