[HOTFIX] Remove -g from cython compile commands #321

Merged: 41 commits, Sep 16, 2021

Commits
7fc5a22
DOC v21.08 Updates
raydouglass May 19, 2021
bd40f7a
Merge remote-tracking branch 'upstream/branch-21.06' into branch-21.08
ajschmidt8 May 24, 2021
8183ac9
Merge pull request #236 from ajschmidt8/branch-21.08-merge-21.06
ajschmidt8 May 24, 2021
e9606cb
Merge pull request #238 from rapidsai/branch-21.06
GPUtester May 27, 2021
587cff1
Merge pull request #239 from rapidsai/branch-21.06
GPUtester May 27, 2021
6a0605b
Merge pull request #242 from rapidsai/branch-21.06
GPUtester May 29, 2021
5ac095f
Merge pull request #244 from rapidsai/branch-21.06
GPUtester Jun 2, 2021
7f7a443
Merge pull request #245 from rapidsai/branch-21.06
GPUtester Jun 2, 2021
f9be523
Merge pull request #247 from rapidsai/branch-21.06
GPUtester Jun 2, 2021
130d661
Merge pull request #249 from rapidsai/branch-21.06
GPUtester Jun 2, 2021
3e30601
Merge pull request #256 from rapidsai/branch-21.06
GPUtester Jun 7, 2021
fedab76
Update UCX-Py version to 0.21 (#255)
pentschev Jun 7, 2021
f1ea3e0
removing divye from codeowners (#257)
divyegala Jun 7, 2021
21cd7b0
Fix mst knn test build failure due to RMM device_buffer change (#253)
mdoijade Jun 8, 2021
f9b3c49
Update get_rmm.cmake to better support CalVer (#258)
harrism Jun 8, 2021
f65ed02
Pass stream to device_scalar::value() calls. (#259)
harrism Jun 8, 2021
1dc7423
Merge pull request #262 from rapidsai/branch-21.06
GPUtester Jun 8, 2021
1fb6e7c
Merge pull request #267 from rapidsai/branch-21.06
GPUtester Jun 9, 2021
1c1b4a0
Move FAISS ANN wrappers to RAFT (#265)
cjnolet Jun 9, 2021
6c02b59
Revert "pin dask versions in CI (#260)" (#264)
ajschmidt8 Jun 10, 2021
73417b2
Move ANN to RAFT (additional updates) (#270)
cjnolet Jun 10, 2021
926a9c6
Add Grid stride pairwise dist and fused L2 NN kernels (#250)
mdoijade Jun 11, 2021
c5a87f5
Merge remote-tracking branch 'upstream/branch-21.06' into branch-21.08
ajschmidt8 Jun 11, 2021
b963e68
Merge pull request #274 from ajschmidt8/branch-21.08-merge-21.06
ajschmidt8 Jun 11, 2021
bfaa6a0
expose epsilon parameter to allow precision to be specified (#275)
ChuckHastings Jun 15, 2021
806b7fa
Use nested include in destination of install headers to avoid docker permission issues (#263)
dantegd Jun 18, 2021
b266d54
Add lds and sts inline ptx instructions to force vector instruction generation (#273)
mdoijade Jun 21, 2021
2ba5d76
Use the 21.08 branch of rapids-cmake as rmm requires it (#278)
robertmaynard Jun 22, 2021
caa44e6
Sparse semirings cleanup + hash table & batching strategies (#269)
divyegala Jun 23, 2021
03e666e
Add `cuco::cuco` to list of linked libraries (#279)
trxcllnt Jun 24, 2021
4f959fc
Const raft handle in sparse bfknn (#280)
cjnolet Jun 25, 2021
eabaafe
Use `NVIDIA/cuCollections` repo again (#284)
trxcllnt Jul 1, 2021
82061e0
Add chebyshev, canberra, minkowski and hellinger distance metrics (#276)
mdoijade Jul 6, 2021
22a16dd
Always add faiss library alias if it's missing (#287)
trxcllnt Jul 9, 2021
f94780c
Remaining sparse semiring distances (#261)
cjnolet Jul 12, 2021
35411a0
Overlap epilog compute with ldg of next grid stride in pairwise distance & fusedL2NN kernels (#292)
mdoijade Jul 15, 2021
14ff641
Fix support for different input and output types in linalg::reduce (#296)
Nyrio Jul 21, 2021
c527774
Pin max `dask` & `distributed` versions (#301)
galipremsagar Jul 27, 2021
a3af389
Pinning cuco to a specific commit hash for release (#304)
rlratzel Jul 28, 2021
d66067f
update changelog
raydouglass Aug 4, 2021
955fa5e
Remove -g from cython compile commands (#317)
trxcllnt Aug 25, 2021
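Note: the setup.py change that gives this hotfix its title is not among the file diffs rendered below. As a hedged illustration only, a common way to keep debug symbols out of Cython extension builds is to filter -g from the compiler flags that distutils inherits from the Python build configuration. The sketch below is hypothetical and is not the actual patch from #317:

# Hypothetical sketch -- not the actual setup.py diff from #317/#321.
from distutils import sysconfig

def strip_debug_flags() -> None:
    """Remove -g from the cached distutils compiler flags."""
    config_vars = sysconfig.get_config_vars()  # module-level mutable cache
    for key in ("CFLAGS", "OPT"):
        flags = config_vars.get(key)
        if flags:
            config_vars[key] = " ".join(f for f in flags.split() if f != "-g")

strip_debug_flags()  # call before setup()/cythonize() compiles the extensions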
6 changes: 3 additions & 3 deletions .github/CODEOWNERS
@@ -1,11 +1,11 @@
#cpp code owners
cpp/ @divyegala @rapidsai/cuml-cpp-codeowners @rapidsai/cugraph-cpp-codeowners
cpp/ @rapidsai/cuml-cpp-codeowners @rapidsai/cugraph-cpp-codeowners

#python code owners
python/ @divyegala @rapidsai/cuml-python-codeowners @rapidsai/cugraph-python-codeowners
python/ @rapidsai/cuml-python-codeowners @rapidsai/cugraph-python-codeowners

#cmake code owners
**/CMakeLists.txt @divyegala @rapidsai/cuml-cmake-codeowners @rapidsai/cugraph-cmake-codeowners
**/CMakeLists.txt @rapidsai/cuml-cmake-codeowners @rapidsai/cugraph-cmake-codeowners
**/cmake/ @rapidsai/cuml-cmake-codeowners @rapidsai/cugraph-cmake-codeowners
python/setup.py @rapidsai/cuml-cmake-codeowners @rapidsai/cugraph-cmake-codeowners
build.sh @rapidsai/cuml-cmake-codeowners @rapidsai/cugraph-cmake-codeowners
41 changes: 41 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,44 @@
# raft 21.08.00 (4 Aug 2021)

## 🚨 Breaking Changes

- expose epsilon parameter to allow precision to be specified ([#275](https://github.com/rapidsai/raft/pull/275)) [@ChuckHastings](https://github.com/ChuckHastings)

## 🐛 Bug Fixes

- Fix support for different input and output types in linalg::reduce ([#296](https://github.com/rapidsai/raft/pull/296)) [@Nyrio](https://github.com/Nyrio)
- Const raft handle in sparse bfknn ([#280](https://github.com/rapidsai/raft/pull/280)) [@cjnolet](https://github.com/cjnolet)
- Add `cuco::cuco` to list of linked libraries ([#279](https://github.com/rapidsai/raft/pull/279)) [@trxcllnt](https://github.com/trxcllnt)
- Use nested include in destination of install headers to avoid docker permission issues ([#263](https://github.com/rapidsai/raft/pull/263)) [@dantegd](https://github.com/dantegd)
- Update UCX-Py version to 0.21 ([#255](https://github.com/rapidsai/raft/pull/255)) [@pentschev](https://github.com/pentschev)
- Fix mst knn test build failure due to RMM device_buffer change ([#253](https://github.com/rapidsai/raft/pull/253)) [@mdoijade](https://github.com/mdoijade)

## 🚀 New Features

- Add chebyshev, canberra, minkowski and hellinger distance metrics ([#276](https://github.com/rapidsai/raft/pull/276)) [@mdoijade](https://github.com/mdoijade)
- Move FAISS ANN wrappers to RAFT ([#265](https://github.com/rapidsai/raft/pull/265)) [@cjnolet](https://github.com/cjnolet)
- Remaining sparse semiring distances ([#261](https://github.com/rapidsai/raft/pull/261)) [@cjnolet](https://github.com/cjnolet)
- removing divye from codeowners ([#257](https://github.com/rapidsai/raft/pull/257)) [@divyegala](https://github.com/divyegala)

## 🛠️ Improvements

- Pinning cuco to a specific commit hash for release ([#304](https://github.com/rapidsai/raft/pull/304)) [@rlratzel](https://github.com/rlratzel)
- Pin max `dask` & `distributed` versions ([#301](https://github.com/rapidsai/raft/pull/301)) [@galipremsagar](https://github.com/galipremsagar)
- Overlap epilog compute with ldg of next grid stride in pairwise distance & fusedL2NN kernels ([#292](https://github.com/rapidsai/raft/pull/292)) [@mdoijade](https://github.com/mdoijade)
- Always add faiss library alias if it's missing ([#287](https://github.com/rapidsai/raft/pull/287)) [@trxcllnt](https://github.com/trxcllnt)
- Use `NVIDIA/cuCollections` repo again ([#284](https://github.com/rapidsai/raft/pull/284)) [@trxcllnt](https://github.com/trxcllnt)
- Use the 21.08 branch of rapids-cmake as rmm requires it ([#278](https://github.com/rapidsai/raft/pull/278)) [@robertmaynard](https://github.com/robertmaynard)
- expose epsilon parameter to allow precision to be specified ([#275](https://github.com/rapidsai/raft/pull/275)) [@ChuckHastings](https://github.com/ChuckHastings)
- Fix `21.08` forward-merge conflicts ([#274](https://github.com/rapidsai/raft/pull/274)) [@ajschmidt8](https://github.com/ajschmidt8)
- Add lds and sts inline ptx instructions to force vector instruction generation ([#273](https://github.com/rapidsai/raft/pull/273)) [@mdoijade](https://github.com/mdoijade)
- Move ANN to RAFT (additional updates) ([#270](https://github.com/rapidsai/raft/pull/270)) [@cjnolet](https://github.com/cjnolet)
- Sparse semirings cleanup + hash table & batching strategies ([#269](https://github.com/rapidsai/raft/pull/269)) [@divyegala](https://github.com/divyegala)
- Revert "pin dask versions in CI ([#260)" (#264](https://github.com/rapidsai/raft/pull/260)" (#264)) [@ajschmidt8](https://github.com/ajschmidt8)
- Pass stream to device_scalar::value() calls. ([#259](https://github.com/rapidsai/raft/pull/259)) [@harrism](https://github.com/harrism)
- Update get_rmm.cmake to better support CalVer ([#258](https://github.com/rapidsai/raft/pull/258)) [@harrism](https://github.com/harrism)
- Add Grid stride pairwise dist and fused L2 NN kernels ([#250](https://github.com/rapidsai/raft/pull/250)) [@mdoijade](https://github.com/mdoijade)
- Fix merge conflicts ([#236](https://github.com/rapidsai/raft/pull/236)) [@ajschmidt8](https://github.com/ajschmidt8)

# raft 21.06.00 (9 Jun 2021)

## 🐛 Bug Fixes
6 changes: 3 additions & 3 deletions ci/gpu/build.sh
@@ -51,16 +51,16 @@ gpuci_conda_retry install -c conda-forge -c rapidsai -c rapidsai-nightly -c nvid
"rmm=${MINOR_VERSION}" \
"dask-cudf=${MINOR_VERSION}" \
"dask-cuda=${MINOR_VERSION}" \
"ucx-py=0.20.*" \
"ucx-py=0.21.*" \
"rapids-build-env=${MINOR_VERSION}.*" \
"rapids-notebook-env=${MINOR_VERSION}.*" \
"rapids-doc-env=${MINOR_VERSION}.*"

# Install the master version of dask, distributed, and dask-ml
gpuci_logger "Install the master version of dask and distributed"
set -x
pip install "git+https://github.com/dask/distributed.git@2021.05.1" --upgrade --no-deps
pip install "git+https://github.com/dask/dask.git@2021.05.1" --upgrade --no-deps
pip install "git+https://github.com/dask/distributed.git@2021.07.1" --upgrade --no-deps
pip install "git+https://github.com/dask/dask.git@2021.07.1" --upgrade --no-deps
set +x


6 changes: 3 additions & 3 deletions ci/local/old-gpubuild.sh
@@ -56,7 +56,7 @@ gpuci_conda_retry install -c conda-forge -c rapidsai -c rapidsai-nightly -c nvid
"distributed>=2.12.0" \
"dask-cudf=${MINOR_VERSION}" \
"dask-cuda=${MINOR_VERSION}" \
"ucx-py=0.20.*"
"ucx-py=0.21.*"

if [ "$RUN_CUML_LIBCUML_TESTS" = "ON" ] || [ "$RUN_CUML_PRIMS_TESTS" = "ON" ] || [ "$RUN_CUML_PYTHON_TESTS" = "ON" ]; then
gpuci_conda_retry install -c conda-forge -c rapidsai -c rapidsai-nightly -c nvidia \
@@ -81,8 +81,8 @@ fi

# Install the master version of dask, distributed, and dask-ml
set -x
pip install "git+https://github.com/dask/distributed.git@2021.05.1" --upgrade --no-deps
pip install "git+https://github.com/dask/dask.git@2021.05.1" --upgrade --no-deps
pip install "git+https://github.com/dask/distributed.git@2021.07.1" --upgrade --no-deps
pip install "git+https://github.com/dask/dask.git@2021.07.1" --upgrade --no-deps
set +x


17 changes: 13 additions & 4 deletions cpp/CMakeLists.txt
@@ -19,7 +19,7 @@ include(FetchContent)
FetchContent_Declare(
rapids-cmake
GIT_REPOSITORY https://github.com/rapidsai/rapids-cmake.git
GIT_TAG origin/branch-21.06
GIT_TAG origin/branch-21.08
)
FetchContent_MakeAvailable(rapids-cmake)
include(rapids-cmake)
@@ -30,7 +30,7 @@ include(rapids-find)

rapids_cuda_init_architectures(RAFT)

project(RAFT VERSION 21.06.00 LANGUAGES CXX CUDA)
project(RAFT VERSION 21.08.00 LANGUAGES CXX CUDA)

##############################################################################
# - build type ---------------------------------------------------------------
@@ -104,6 +104,7 @@ endif(NOT DISABLE_OPENMP OR NOT ${DISABLE_OPENMP})
# add third party dependencies using CPM
rapids_cpm_init()

include(cmake/thirdparty/get_thrust.cmake)
include(cmake/thirdparty/get_rmm.cmake)
include(cmake/thirdparty/get_cuco.cmake)

@@ -117,6 +118,8 @@ endif()
##############################################################################
# - install targets-----------------------------------------------------------

include(CPack)

add_library(raft INTERFACE)
add_library(raft::raft ALIAS raft)
target_include_directories(raft INTERFACE "$<BUILD_INTERFACE:${RAFT_SOURCE_DIR}/include>"
@@ -130,6 +133,7 @@ INTERFACE
CUDA::cudart
CUDA::cusparse
rmm::rmm
cuco::cuco
)

target_compile_features(raft INTERFACE cxx_std_17 $<BUILD_INTERFACE:cuda_std_17>)
@@ -140,10 +144,15 @@ install(TARGETS raft
)

include(GNUInstallDirs)
install(DIRECTORY include/
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
install(DIRECTORY include/raft/
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/raft
)

# Temporary install of raft.hpp while the file is removed
install(FILES include/raft.hpp
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/raft
)

##############################################################################
# - install export -----------------------------------------------------------
set(doc_string
14 changes: 8 additions & 6 deletions cpp/cmake/thirdparty/get_cuco.cmake
@@ -16,20 +16,22 @@

function(find_and_configure_cuco VERSION)

if(TARGET cuco::cuco)
return()
endif()

rapids_cpm_find(cuco ${VERSION}
GLOBAL_TARGETS cuco cuco::cuco
GLOBAL_TARGETS cuco::cuco
BUILD_EXPORT_SET raft-exports
INSTALL_EXPORT_SET raft-exports
CPM_ARGS
GIT_REPOSITORY https://github.com/NVIDIA/cuCollections.git
GIT_TAG 0b672bbde7c85a79df4d7ca5f82e15e5b4a57700
GIT_TAG b1fea0cbe4c384160740af00f7c8760846539abb
OPTIONS "BUILD_TESTS OFF"
"BUILD_BENCHMARKS OFF"
"BUILD_EXAMPLES OFF"
)

if(NOT TARGET cuco::cuco)
add_library(cuco::cuco ALIAS cuco)
endif()

endfunction()

find_and_configure_cuco(0.0.1)
5 changes: 4 additions & 1 deletion cpp/cmake/thirdparty/get_faiss.cmake
@@ -40,7 +40,10 @@ function(find_and_configure_faiss)

if(FAISS_ADDED)
set(FAISS_GPU_HEADERS ${FAISS_SOURCE_DIR} PARENT_SCOPE)
add_library(FAISS::FAISS ALIAS faiss)
endif()

if(TARGET faiss AND NOT TARGET FAISS::FAISS)
add_library(FAISS::FAISS ALIAS faiss)
endif()

endfunction()
10 changes: 8 additions & 2 deletions cpp/cmake/thirdparty/get_rmm.cmake
@@ -20,13 +20,19 @@ function(find_and_configure_rmm VERSION)
return()
endif()

if(${VERSION} MATCHES [=[([0-9]+)\.([0-9]+)\.([0-9]+)]=])
set(MAJOR_AND_MINOR "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}")
else()
set(MAJOR_AND_MINOR "${VERSION}")
endif()

rapids_cpm_find(rmm ${VERSION}
GLOBAL_TARGETS rmm::rmm
BUILD_EXPORT_SET raft-exports
INSTALL_EXPORT_SET raft-exports
CPM_ARGS
GIT_REPOSITORY https://github.com/rapidsai/rmm.git
GIT_TAG branch-${VERSION}
GIT_TAG branch-${MAJOR_AND_MINOR}
GIT_SHALLOW TRUE
OPTIONS "BUILD_TESTS OFF"
"BUILD_BENCHMARKS OFF"
@@ -36,6 +42,6 @@ function(find_and_configure_rmm VERSION)

endfunction()

set(RAFT_MIN_VERSION_rmm "${RAFT_VERSION_MAJOR}.${RAFT_VERSION_MINOR}")
set(RAFT_MIN_VERSION_rmm "${RAFT_VERSION_MAJOR}.${RAFT_VERSION_MINOR}.00")

find_and_configure_rmm(${RAFT_MIN_VERSION_rmm})
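For illustration, the CalVer handling added above maps a full RAPIDS version such as 21.08.00 to the release branch name branch-21.08, while a bare 21.08 passes through unchanged. A minimal standalone sketch of that mapping, assuming it is run with `cmake -P` (not part of the PR):

# Illustration of the CalVer regex above; not part of the PR diff.
set(VERSION "21.08.00")
if(${VERSION} MATCHES [=[([0-9]+)\.([0-9]+)\.([0-9]+)]=])
  set(MAJOR_AND_MINOR "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}")
else()
  set(MAJOR_AND_MINOR "${VERSION}")
endif()
message(STATUS "GIT_TAG resolves to: branch-${MAJOR_AND_MINOR}")  # branch-21.08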
30 changes: 30 additions & 0 deletions cpp/cmake/thirdparty/get_thrust.cmake
@@ -0,0 +1,30 @@
# =============================================================================
# Copyright (c) 2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing permissions and limitations under
# the License.
# =============================================================================

# Use CPM to find or clone thrust
function(find_and_configure_thrust VERSION)

rapids_cpm_find(
Thrust ${VERSION}
BUILD_EXPORT_SET raft-exports
INSTALL_EXPORT_SET raft-exports
CPM_ARGS
GIT_REPOSITORY https://github.com/NVIDIA/thrust.git
GIT_TAG ${VERSION}
GIT_SHALLOW TRUE
OPTIONS "THRUST_INSTALL OFF")

endfunction()

find_and_configure_thrust(1.12.0)
101 changes: 68 additions & 33 deletions cpp/include/raft/common/device_loads_stores.cuh
@@ -24,60 +24,95 @@ namespace raft {
* @defgroup SmemStores Shared memory store operations
* @{
* @brief Stores to shared memory (both vectorized and non-vectorized forms)
* @param[out] addr shared memory address
 * requires the given shmem pointer to be aligned to the vector
 * length; e.g., for float4 lds/sts the shmem pointer must be
 * aligned to 16 bytes, otherwise the access may silently fail or
 * raise a runtime error.
* @param[out] addr shared memory address (should be aligned to vector size)
* @param[in] x data to be stored at this address
*/
DI void sts(float* addr, const float& x) { *addr = x; }
DI void sts(float* addr, const float (&x)[1]) { *addr = x[0]; }
DI void sts(float* addr, const float& x) {
auto s1 = __cvta_generic_to_shared(reinterpret_cast<float*>(addr));
asm volatile("st.shared.f32 [%0], {%1};" : : "l"(s1), "f"(x));
}
DI void sts(float* addr, const float (&x)[1]) {
auto s1 = __cvta_generic_to_shared(reinterpret_cast<float*>(addr));
asm volatile("st.shared.f32 [%0], {%1};" : : "l"(s1), "f"(x[0]));
}
DI void sts(float* addr, const float (&x)[2]) {
float2 v2 = make_float2(x[0], x[1]);
auto* s2 = reinterpret_cast<float2*>(addr);
*s2 = v2;
auto s2 = __cvta_generic_to_shared(reinterpret_cast<float2*>(addr));
asm volatile("st.shared.v2.f32 [%0], {%1, %2};"
:
: "l"(s2), "f"(x[0]), "f"(x[1]));
}
DI void sts(float* addr, const float (&x)[4]) {
float4 v4 = make_float4(x[0], x[1], x[2], x[3]);
auto* s4 = reinterpret_cast<float4*>(addr);
*s4 = v4;
auto s4 = __cvta_generic_to_shared(reinterpret_cast<float4*>(addr));
asm volatile("st.shared.v4.f32 [%0], {%1, %2, %3, %4};"
:
: "l"(s4), "f"(x[0]), "f"(x[1]), "f"(x[2]), "f"(x[3]));
}

DI void sts(double* addr, const double& x) {
auto s1 = __cvta_generic_to_shared(reinterpret_cast<double*>(addr));
asm volatile("st.shared.f64 [%0], {%1};" : : "l"(s1), "d"(x));
}
DI void sts(double* addr, const double (&x)[1]) {
auto s1 = __cvta_generic_to_shared(reinterpret_cast<double*>(addr));
asm volatile("st.shared.f64 [%0], {%1};" : : "l"(s1), "d"(x[0]));
}
DI void sts(double* addr, const double& x) { *addr = x; }
DI void sts(double* addr, const double (&x)[1]) { *addr = x[0]; }
DI void sts(double* addr, const double (&x)[2]) {
double2 v2 = make_double2(x[0], x[1]);
auto* s2 = reinterpret_cast<double2*>(addr);
*s2 = v2;
auto s2 = __cvta_generic_to_shared(reinterpret_cast<double2*>(addr));
asm volatile("st.shared.v2.f64 [%0], {%1, %2};"
:
: "l"(s2), "d"(x[0]), "d"(x[1]));
}
/** @} */

/**
* @defgroup SmemLoads Shared memory load operations
* @{
* @brief Loads from shared memory (both vectorized and non-vectorized forms)
 * requires the given shmem pointer to be aligned to the vector
 * length; e.g., for float4 lds/sts the shmem pointer must be
 * aligned to 16 bytes, otherwise the access may silently fail or
 * raise a runtime error.
* @param[out] x the data to be loaded
* @param[in] addr shared memory address from where to load
* (should be aligned to vector size)
*/
DI void lds(float& x, float* addr) { x = *addr; }
DI void lds(float (&x)[1], float* addr) { x[0] = *addr; }
DI void lds(float& x, float* addr) {
auto s1 = __cvta_generic_to_shared(reinterpret_cast<float*>(addr));
asm volatile("ld.shared.f32 {%0}, [%1];" : "=f"(x) : "l"(s1));
}
DI void lds(float (&x)[1], float* addr) {
auto s1 = __cvta_generic_to_shared(reinterpret_cast<float*>(addr));
asm volatile("ld.shared.f32 {%0}, [%1];" : "=f"(x[0]) : "l"(s1));
}
DI void lds(float (&x)[2], float* addr) {
auto* s2 = reinterpret_cast<float2*>(addr);
auto v2 = *s2;
x[0] = v2.x;
x[1] = v2.y;
auto s2 = __cvta_generic_to_shared(reinterpret_cast<float2*>(addr));
asm volatile("ld.shared.v2.f32 {%0, %1}, [%2];"
: "=f"(x[0]), "=f"(x[1])
: "l"(s2));
}
DI void lds(float (&x)[4], float* addr) {
auto* s4 = reinterpret_cast<float4*>(addr);
auto v4 = *s4;
x[0] = v4.x;
x[1] = v4.y;
x[2] = v4.z;
x[3] = v4.w;
}
DI void lds(double& x, double* addr) { x = *addr; }
DI void lds(double (&x)[1], double* addr) { x[0] = *addr; }
auto s4 = __cvta_generic_to_shared(reinterpret_cast<float4*>(addr));
asm volatile("ld.shared.v4.f32 {%0, %1, %2, %3}, [%4];"
: "=f"(x[0]), "=f"(x[1]), "=f"(x[2]), "=f"(x[3])
: "l"(s4));
}
DI void lds(double& x, double* addr) {
auto s1 = __cvta_generic_to_shared(reinterpret_cast<double*>(addr));
asm volatile("ld.shared.f64 {%0}, [%1];" : "=d"(x) : "l"(s1));
}
DI void lds(double (&x)[1], double* addr) {
auto s1 = __cvta_generic_to_shared(reinterpret_cast<double*>(addr));
asm volatile("ld.shared.f64 {%0}, [%1];" : "=d"(x[0]) : "l"(s1));
}
DI void lds(double (&x)[2], double* addr) {
auto* s2 = reinterpret_cast<double2*>(addr);
auto v2 = *s2;
x[0] = v2.x;
x[1] = v2.y;
auto s2 = __cvta_generic_to_shared(reinterpret_cast<double2*>(addr));
asm volatile("ld.shared.v2.f64 {%0, %1}, [%2];"
: "=d"(x[0]), "=d"(x[1])
: "l"(s2));
}
/** @} */
