diff --git a/cpp/bench/spatial/fused_l2_nn.cu b/cpp/bench/spatial/fused_l2_nn.cu index aa36483145..2463089675 100644 --- a/cpp/bench/spatial/fused_l2_nn.cu +++ b/cpp/bench/spatial/fused_l2_nn.cu @@ -53,7 +53,7 @@ struct fused_l2_nn : public fixture { uniform(handle, r, y.data(), p.n * p.k, T(-1.0), T(1.0)); raft::linalg::rowNorm(xn.data(), x.data(), p.k, p.m, raft::linalg::L2Norm, true, stream); raft::linalg::rowNorm(yn.data(), y.data(), p.k, p.n, raft::linalg::L2Norm, true, stream); - raft::distance::initialize, int>( + raft::distance::initialize, int>( handle, out.data(), p.m, std::numeric_limits::max(), op); } @@ -61,20 +61,20 @@ struct fused_l2_nn : public fixture { { loop_on_state(state, [this]() { // it is enough to only benchmark the L2-squared metric - raft::distance::fusedL2NN, int>(out.data(), - x.data(), - y.data(), - xn.data(), - yn.data(), - params.m, - params.n, - params.k, - (void*)workspace.data(), - op, - pairRedOp, - false, - false, - stream); + raft::distance::fusedL2NN, int>(out.data(), + x.data(), + y.data(), + xn.data(), + yn.data(), + params.m, + params.n, + params.k, + (void*)workspace.data(), + op, + pairRedOp, + false, + false, + stream); }); // Num distance calculations @@ -92,7 +92,7 @@ struct fused_l2_nn : public fixture { state.counters["FLOP/s"] = benchmark::Counter( num_flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::OneK::kIs1000); - state.counters["BW Wr"] = benchmark::Counter(write_elts * sizeof(cub::KeyValuePair), + state.counters["BW Wr"] = benchmark::Counter(write_elts * sizeof(raft::KeyValuePair), benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::OneK::kIs1000); state.counters["BW Rd"] = benchmark::Counter(read_elts * sizeof(float), @@ -105,7 +105,7 @@ struct fused_l2_nn : public fixture { private: fused_l2_nn_inputs params; rmm::device_uvector x, y, xn, yn; - rmm::device_uvector> out; + rmm::device_uvector> out; rmm::device_uvector workspace; raft::distance::KVPMinReduce pairRedOp; raft::distance::MinAndDistanceReduceOp op; diff --git a/cpp/include/raft/cluster/detail/kmeans.cuh b/cpp/include/raft/cluster/detail/kmeans.cuh index 26005f58a0..94fee3edbf 100644 --- a/cpp/include/raft/cluster/detail/kmeans.cuh +++ b/cpp/include/raft/cluster/detail/kmeans.cuh @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -278,7 +279,7 @@ void kmeans_fit_main(const raft::handle_t& handle, // - key is the index of nearest cluster // - value is the distance to the nearest cluster auto minClusterAndDistance = - raft::make_device_vector, IndexT>(handle, n_samples); + raft::make_device_vector, IndexT>(handle, n_samples); // temporary buffer to store L2 norm of centroids or distance matrix, // destructor releases the resource @@ -292,7 +293,7 @@ void kmeans_fit_main(const raft::handle_t& handle, // resource auto wtInCluster = raft::make_device_vector(handle, n_clusters); - rmm::device_scalar> clusterCostD(stream); + rmm::device_scalar> clusterCostD(stream); // L2 norm of X: ||x||^2 auto L2NormX = raft::make_device_vector(handle, n_samples); @@ -337,12 +338,12 @@ void kmeans_fit_main(const raft::handle_t& handle, workspace); // Using TransformInputIteratorT to dereference an array of - // cub::KeyValuePair and converting them to just return the Key to be used + // raft::KeyValuePair and converting them to just return the Key to be used // in reduce_rows_by_key prims detail::KeyValueIndexOp conversion_op; cub::TransformInputIterator, - cub::KeyValuePair*> + raft::KeyValuePair*> itr(minClusterAndDistance.data_handle(), conversion_op); workspace.resize(n_samples, stream); @@ -400,14 +401,14 @@ void kmeans_fit_main(const raft::handle_t& handle, itr_wt, wtInCluster.size(), newCentroids.data_handle(), - [=] __device__(cub::KeyValuePair map) { // predicate + [=] __device__(raft::KeyValuePair map) { // predicate // copy when the # of samples in the cluster is 0 if (map.value == 0) return true; else return false; }, - [=] __device__(cub::KeyValuePair map) { // map + [=] __device__(raft::KeyValuePair map) { // map return map.key; }, stream); @@ -439,9 +440,9 @@ void kmeans_fit_main(const raft::handle_t& handle, minClusterAndDistance.view(), workspace, raft::make_device_scalar_view(clusterCostD.data()), - [] __device__(const cub::KeyValuePair& a, - const cub::KeyValuePair& b) { - cub::KeyValuePair res; + [] __device__(const raft::KeyValuePair& a, + const raft::KeyValuePair& b) { + raft::KeyValuePair res; res.key = 0; res.value = a.value + b.value; return res; @@ -489,8 +490,8 @@ void kmeans_fit_main(const raft::handle_t& handle, minClusterAndDistance.data_handle() + minClusterAndDistance.size(), weight.data_handle(), minClusterAndDistance.data_handle(), - [=] __device__(const cub::KeyValuePair kvp, DataT wt) { - cub::KeyValuePair res; + [=] __device__(const raft::KeyValuePair kvp, DataT wt) { + raft::KeyValuePair res; res.value = kvp.value * wt; res.key = kvp.key; return res; @@ -501,9 +502,9 @@ void kmeans_fit_main(const raft::handle_t& handle, minClusterAndDistance.view(), workspace, raft::make_device_scalar_view(clusterCostD.data()), - [] __device__(const cub::KeyValuePair& a, - const cub::KeyValuePair& b) { - cub::KeyValuePair res; + [] __device__(const raft::KeyValuePair& a, + const raft::KeyValuePair& b) { + raft::KeyValuePair res; res.key = 0; res.value = a.value + b.value; return res; @@ -970,7 +971,7 @@ void kmeans_predict(handle_t const& handle, if (normalize_weight) checkWeight(handle, weight.view(), workspace); auto minClusterAndDistance = - raft::make_device_vector, IndexT>(handle, n_samples); + raft::make_device_vector, IndexT>(handle, n_samples); rmm::device_uvector L2NormBuf_OR_DistBuf(0, stream); // L2 norm of X: ||x||^2 @@ -1001,15 +1002,15 @@ void kmeans_predict(handle_t const& handle, workspace); // calculate cluster cost phi_x(C) - rmm::device_scalar> clusterCostD(stream); + rmm::device_scalar> clusterCostD(stream); // TODO: add different templates for InType of binaryOp to avoid thrust transform thrust::transform(handle.get_thrust_policy(), minClusterAndDistance.data_handle(), minClusterAndDistance.data_handle() + minClusterAndDistance.size(), weight.data_handle(), minClusterAndDistance.data_handle(), - [=] __device__(const cub::KeyValuePair kvp, DataT wt) { - cub::KeyValuePair res; + [=] __device__(const raft::KeyValuePair kvp, DataT wt) { + raft::KeyValuePair res; res.value = kvp.value * wt; res.key = kvp.key; return res; @@ -1019,9 +1020,9 @@ void kmeans_predict(handle_t const& handle, minClusterAndDistance.view(), workspace, raft::make_device_scalar_view(clusterCostD.data()), - [] __device__(const cub::KeyValuePair& a, - const cub::KeyValuePair& b) { - cub::KeyValuePair res; + [] __device__(const raft::KeyValuePair& a, + const raft::KeyValuePair& b) { + raft::KeyValuePair res; res.key = 0; res.value = a.value + b.value; return res; @@ -1033,7 +1034,7 @@ void kmeans_predict(handle_t const& handle, minClusterAndDistance.data_handle(), minClusterAndDistance.data_handle() + minClusterAndDistance.size(), labels.data_handle(), - [=] __device__(cub::KeyValuePair pair) { return pair.key; }); + [=] __device__(raft::KeyValuePair pair) { return pair.key; }); } template diff --git a/cpp/include/raft/cluster/detail/kmeans_common.cuh b/cpp/include/raft/cluster/detail/kmeans_common.cuh index e9929a089d..d4dd565ea0 100644 --- a/cpp/include/raft/cluster/detail/kmeans_common.cuh +++ b/cpp/include/raft/cluster/detail/kmeans_common.cuh @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -66,7 +67,7 @@ struct SamplingOp { } __host__ __device__ __forceinline__ bool operator()( - const cub::KeyValuePair& a) const + const raft::KeyValuePair& a) const { DataT prob_threshold = (DataT)rnd[a.key]; @@ -79,7 +80,7 @@ struct SamplingOp { template struct KeyValueIndexOp { __host__ __device__ __forceinline__ IndexT - operator()(const cub::KeyValuePair& a) const + operator()(const raft::KeyValuePair& a) const { return a.key; } @@ -224,7 +225,7 @@ void sampleCentroids(const raft::handle_t& handle, auto nSelected = raft::make_device_scalar(handle, 0); cub::ArgIndexInputIterator ip_itr(minClusterDistance.data_handle()); auto sampledMinClusterDistance = - raft::make_device_vector, IndexT>(handle, n_local_samples); + raft::make_device_vector, IndexT>(handle, n_local_samples); size_t temp_storage_bytes = 0; RAFT_CUDA_TRY(cub::DeviceSelect::If(nullptr, temp_storage_bytes, @@ -254,7 +255,7 @@ void sampleCentroids(const raft::handle_t& handle, thrust::for_each_n(handle.get_thrust_policy(), sampledMinClusterDistance.data_handle(), nPtsSampledInRank, - [=] __device__(cub::KeyValuePair val) { + [=] __device__(raft::KeyValuePair val) { rawPtr_isSampleCentroid[val.key] = 1; }); @@ -266,7 +267,7 @@ void sampleCentroids(const raft::handle_t& handle, sampledMinClusterDistance.data_handle(), nPtsSampledInRank, inRankCp.data(), - [=] __device__(cub::KeyValuePair val) { // MapTransformOp + [=] __device__(raft::KeyValuePair val) { // MapTransformOp return val.key; }, stream); @@ -355,7 +356,7 @@ void minClusterAndDistanceCompute( const KMeansParams& params, const raft::device_matrix_view X, const raft::device_matrix_view centroids, - const raft::device_vector_view, IndexT> minClusterAndDistance, + const raft::device_vector_view, IndexT> minClusterAndDistance, const raft::device_vector_view L2NormX, rmm::device_uvector& L2NormBuf_OR_DistBuf, rmm::device_uvector& workspace) @@ -390,7 +391,7 @@ void minClusterAndDistanceCompute( auto pairwiseDistance = raft::make_device_matrix_view( L2NormBuf_OR_DistBuf.data(), dataBatchSize, centroidsBatchSize); - cub::KeyValuePair initial_value(0, std::numeric_limits::max()); + raft::KeyValuePair initial_value(0, std::numeric_limits::max()); thrust::fill(handle.get_thrust_policy(), minClusterAndDistance.data_handle(), @@ -409,7 +410,7 @@ void minClusterAndDistanceCompute( // minClusterAndDistanceView [ns x n_clusters] auto minClusterAndDistanceView = - raft::make_device_vector_view, IndexT>( + raft::make_device_vector_view, IndexT>( minClusterAndDistance.data_handle() + dIdx, ns); auto L2NormXView = @@ -420,7 +421,7 @@ void minClusterAndDistanceCompute( workspace.resize((sizeof(int)) * ns, stream); // todo(lsugy): remove cIdx - raft::distance::fusedL2NNMinReduce, IndexT>( + raft::distance::fusedL2NNMinReduce, IndexT>( minClusterAndDistanceView.data_handle(), datasetView.data_handle(), centroids.data_handle(), @@ -466,15 +467,15 @@ void minClusterAndDistanceCompute( stream, true, [=] __device__(const DataT val, const IndexT i) { - cub::KeyValuePair pair; + raft::KeyValuePair pair; pair.key = cIdx + i; pair.value = val; return pair; }, - [=] __device__(cub::KeyValuePair a, cub::KeyValuePair b) { + [=] __device__(raft::KeyValuePair a, raft::KeyValuePair b) { return (b.value < a.value) ? b : a; }, - [=] __device__(cub::KeyValuePair pair) { return pair; }); + [=] __device__(raft::KeyValuePair pair) { return pair; }); } } } @@ -623,7 +624,7 @@ void countSamplesInCluster(const raft::handle_t& handle, // - key is the index of nearest cluster // - value is the distance to the nearest cluster auto minClusterAndDistance = - raft::make_device_vector, IndexT>(handle, n_samples); + raft::make_device_vector, IndexT>(handle, n_samples); // temporary buffer to store distance matrix, destructor releases the resource rmm::device_uvector L2NormBuf_OR_DistBuf(0, stream); @@ -642,13 +643,13 @@ void countSamplesInCluster(const raft::handle_t& handle, L2NormBuf_OR_DistBuf, workspace); - // Using TransformInputIteratorT to dereference an array of cub::KeyValuePair + // Using TransformInputIteratorT to dereference an array of raft::KeyValuePair // and converting them to just return the Key to be used in reduce_rows_by_key // prims detail::KeyValueIndexOp conversion_op; cub::TransformInputIterator, - cub::KeyValuePair*> + raft::KeyValuePair*> itr(minClusterAndDistance.data_handle(), conversion_op); // count # of samples in each cluster diff --git a/cpp/include/raft/cluster/kmeans.cuh b/cpp/include/raft/cluster/kmeans.cuh index 539fc33c40..0ce35da4a5 100644 --- a/cpp/include/raft/cluster/kmeans.cuh +++ b/cpp/include/raft/cluster/kmeans.cuh @@ -18,6 +18,7 @@ #include #include #include +#include #include namespace raft::cluster { @@ -353,7 +354,7 @@ void minClusterAndDistanceCompute( const KMeansParams& params, const raft::device_matrix_view X, const raft::device_matrix_view centroids, - const raft::device_vector_view, IndexT>& minClusterAndDistance, + const raft::device_vector_view, IndexT>& minClusterAndDistance, const raft::device_vector_view& L2NormX, rmm::device_uvector& L2NormBuf_OR_DistBuf, rmm::device_uvector& workspace) diff --git a/cpp/include/raft/core/detail/macros.hpp b/cpp/include/raft/core/detail/macros.hpp index 66b67579fc..bfb47437ad 100644 --- a/cpp/include/raft/core/detail/macros.hpp +++ b/cpp/include/raft/core/detail/macros.hpp @@ -37,7 +37,7 @@ #define _RAFT_HOST_DEVICE _RAFT_HOST _RAFT_DEVICE #ifndef RAFT_INLINE_FUNCTION -#define RAFT_INLINE_FUNCTION _RAFT_FORCEINLINE _RAFT_HOST_DEVICE +#define RAFT_INLINE_FUNCTION _RAFT_HOST_DEVICE _RAFT_FORCEINLINE #endif /** diff --git a/cpp/include/raft/core/kvp.hpp b/cpp/include/raft/core/kvp.hpp new file mode 100644 index 0000000000..f6ea841dc4 --- /dev/null +++ b/cpp/include/raft/core/kvp.hpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#ifdef _RAFT_HAS_CUDA +#include +#endif +namespace raft { +/** + * \brief A key identifier paired with a corresponding value + * + */ +template +struct KeyValuePair { + typedef _Key Key; ///< Key data type + typedef _Value Value; ///< Value data type + + Key key; ///< Item key + Value value; ///< Item value + + /// Constructor + RAFT_INLINE_FUNCTION KeyValuePair() {} + +#ifdef _RAFT_HAS_CUDA + /// Conversion Constructor to allow integration w/ cub + RAFT_INLINE_FUNCTION KeyValuePair(cub::KeyValuePair<_Key, _Value> kvp) + : key(kvp.key), value(kvp.value) + { + } + + RAFT_INLINE_FUNCTION operator cub::KeyValuePair<_Key, _Value>() + { + return cub::KeyValuePair(key, value); + } +#endif + + /// Constructor + RAFT_INLINE_FUNCTION KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {} + + /// Inequality operator + RAFT_INLINE_FUNCTION bool operator!=(const KeyValuePair& b) + { + return (value != b.value) || (key != b.key); + } +}; +} // end namespace raft diff --git a/cpp/include/raft/distance/detail/fused_l2_nn.cuh b/cpp/include/raft/distance/detail/fused_l2_nn.cuh index 8aae7d40f4..1385d0aa09 100644 --- a/cpp/include/raft/distance/detail/fused_l2_nn.cuh +++ b/cpp/include/raft/distance/detail/fused_l2_nn.cuh @@ -16,8 +16,8 @@ #pragma once -#include #include +#include #include #include #include @@ -25,6 +25,7 @@ namespace raft { namespace distance { + namespace detail { #if (ENABLE_MEMCPY_ASYNC == 1) @@ -34,15 +35,14 @@ using namespace nvcuda::experimental; template struct KVPMinReduceImpl { - typedef cub::KeyValuePair KVP; - + typedef raft::KeyValuePair KVP; DI KVP operator()(LabelT rit, const KVP& a, const KVP& b) { return b.value < a.value ? b : a; } }; // KVPMinReduce template struct MinAndDistanceReduceOpImpl { - typedef typename cub::KeyValuePair KVP; + typedef typename raft::KeyValuePair KVP; DI void operator()(LabelT rid, KVP* out, const KVP& other) { if (other.value < out->value) { @@ -66,7 +66,7 @@ struct MinAndDistanceReduceOpImpl { template struct MinReduceOpImpl { - typedef typename cub::KeyValuePair KVP; + typedef typename raft::KeyValuePair KVP; DI void operator()(LabelT rid, DataT* out, const KVP& other) { if (other.value < *out) { *out = other.value; } @@ -146,7 +146,7 @@ __global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel(OutT* min, { extern __shared__ char smem[]; - typedef cub::KeyValuePair KVPair; + typedef KeyValuePair KVPair; KVPair val[P::AccRowsPerTh]; #pragma unroll for (int i = 0; i < P::AccRowsPerTh; ++i) { @@ -285,7 +285,7 @@ void fusedL2NNImpl(OutT* min, dim3 blk(P::Nthreads); auto nblks = raft::ceildiv(m, P::Nthreads); constexpr auto maxVal = std::numeric_limits::max(); - typedef cub::KeyValuePair KVPair; + typedef KeyValuePair KVPair; // Accumulation operation lambda auto core_lambda = [] __device__(DataT & acc, DataT & x, DataT & y) { acc += x * y; }; diff --git a/cpp/include/raft/distance/fused_l2_nn.cuh b/cpp/include/raft/distance/fused_l2_nn.cuh index 2915bce360..fb4fb8d34c 100644 --- a/cpp/include/raft/distance/fused_l2_nn.cuh +++ b/cpp/include/raft/distance/fused_l2_nn.cuh @@ -168,7 +168,8 @@ void fusedL2NN(OutT* min, * * @tparam DataT data type * @tparam OutT output type to either store 1-NN indices and their minimum - * distances (e.g. cub::KeyValuePair) or store only the min distances. + * distances (e.g. raft::KeyValuePair) or store only the min + * distances. * @tparam IdxT indexing arithmetic type * @param[out] min will contain the reduced output (Length = `m`) * (on device) diff --git a/cpp/include/raft/distance/specializations/distance.cuh b/cpp/include/raft/distance/specializations/distance.cuh index 3b7d08f2aa..73d075f260 100644 --- a/cpp/include/raft/distance/specializations/distance.cuh +++ b/cpp/include/raft/distance/specializations/distance.cuh @@ -31,4 +31,4 @@ #include #include #include -//#include +#include diff --git a/cpp/include/raft/distance/specializations/fused_l2_nn_min.cuh b/cpp/include/raft/distance/specializations/fused_l2_nn_min.cuh index deddf65b37..88e1216635 100644 --- a/cpp/include/raft/distance/specializations/fused_l2_nn_min.cuh +++ b/cpp/include/raft/distance/specializations/fused_l2_nn_min.cuh @@ -16,13 +16,14 @@ #pragma once +#include #include namespace raft { namespace distance { -extern template void fusedL2NNMinReduce, int>( - cub::KeyValuePair* min, +extern template void fusedL2NNMinReduce, int>( + raft::KeyValuePair* min, const float* x, const float* y, const float* xn, @@ -34,8 +35,8 @@ extern template void fusedL2NNMinReduce, in bool sqrt, bool initOutBuffer, cudaStream_t stream); -extern template void fusedL2NNMinReduce, int64_t>( - cub::KeyValuePair* min, +extern template void fusedL2NNMinReduce, int64_t>( + raft::KeyValuePair* min, const float* x, const float* y, const float* xn, @@ -47,8 +48,8 @@ extern template void fusedL2NNMinReduce bool sqrt, bool initOutBuffer, cudaStream_t stream); -extern template void fusedL2NNMinReduce, int>( - cub::KeyValuePair* min, +extern template void fusedL2NNMinReduce, int>( + raft::KeyValuePair* min, const double* x, const double* y, const double* xn, @@ -60,8 +61,8 @@ extern template void fusedL2NNMinReduce, bool sqrt, bool initOutBuffer, cudaStream_t stream); -extern template void fusedL2NNMinReduce, int64_t>( - cub::KeyValuePair* min, +extern template void fusedL2NNMinReduce, int64_t>( + raft::KeyValuePair* min, const double* x, const double* y, const double* xn, diff --git a/cpp/include/raft/sparse/spatial/detail/connect_components.cuh b/cpp/include/raft/sparse/spatial/detail/connect_components.cuh index f515ab5739..1c14669e28 100644 --- a/cpp/include/raft/sparse/spatial/detail/connect_components.cuh +++ b/cpp/include/raft/sparse/spatial/detail/connect_components.cuh @@ -31,6 +31,7 @@ #include #include +#include #include #include #include @@ -45,43 +46,6 @@ #include namespace raft::sparse::spatial::detail { -/** - * \brief A key identifier paired with a corresponding value - * - * NOTE: This is being included close to where it's being used - * because it's meant to be temporary. There is a conflict - * between the cub and thrust_cub namespaces with older CUDA - * versions so we're using our own as a workaround. - */ -template -struct KeyValuePair { - typedef _Key Key; ///< Key data type - typedef _Value Value; ///< Value data type - - Key key; ///< Item key - Value value; ///< Item value - - /// Constructor - __host__ __device__ __forceinline__ KeyValuePair() {} - - /// Copy Constructor - __host__ __device__ __forceinline__ KeyValuePair(cub::KeyValuePair<_Key, _Value> kvp) - : key(kvp.key), value(kvp.value) - { - } - - /// Constructor - __host__ __device__ __forceinline__ KeyValuePair(Key const& key, Value const& value) - : key(key), value(value) - { - } - - /// Inequality operator - __host__ __device__ __forceinline__ bool operator!=(const KeyValuePair& b) - { - return (value != b.value) || (key != b.key); - } -}; /** * Functor with reduction ops for performing fused 1-nn @@ -97,7 +61,7 @@ struct FixConnectivitiesRedOp { FixConnectivitiesRedOp(value_idx* colors_, value_idx m_) : colors(colors_), m(m_){}; - typedef typename cub::KeyValuePair KVP; + typedef typename raft::KeyValuePair KVP; DI void operator()(value_idx rit, KVP* out, const KVP& other) { if (rit < m && other.value < out->value && colors[rit] != colors[other.key]) { @@ -148,7 +112,7 @@ struct TupleComp { template struct CubKVPMinReduce { - typedef cub::KeyValuePair KVP; + typedef raft::KeyValuePair KVP; DI KVP @@ -197,7 +161,7 @@ struct LookupColorOp { DI value_idx - operator()(const cub::KeyValuePair& kvp) + operator()(const raft::KeyValuePair& kvp) { return colors[kvp.key]; } @@ -218,7 +182,7 @@ struct LookupColorOp { * @param[in] stream cuda stream for which to order cuda operations */ template -void perform_1nn(cub::KeyValuePair* kvp, +void perform_1nn(raft::KeyValuePair* kvp, value_idx* nn_colors, value_idx* colors, const value_t* X, @@ -232,7 +196,7 @@ void perform_1nn(cub::KeyValuePair* kvp, raft::linalg::rowNorm(x_norm.data(), X, n_cols, n_rows, raft::linalg::L2Norm, true, stream); - raft::distance::fusedL2NN, value_idx>( + raft::distance::fusedL2NN, value_idx>( kvp, X, X, @@ -267,7 +231,7 @@ void perform_1nn(cub::KeyValuePair* kvp, template void sort_by_color(value_idx* colors, value_idx* nn_colors, - cub::KeyValuePair* kvp, + raft::KeyValuePair* kvp, value_idx* src_indices, size_t n_rows, cudaStream_t stream) @@ -289,7 +253,7 @@ __global__ void min_components_by_color_kernel(value_idx* out_rows, value_t* out_vals, const value_idx* out_index, const value_idx* indices, - const cub::KeyValuePair* kvp, + const raft::KeyValuePair* kvp, size_t nnz) { size_t tid = blockDim.x * blockIdx.x + threadIdx.x; @@ -323,7 +287,7 @@ template void min_components_by_color(raft::sparse::COO& coo, const value_idx* out_index, const value_idx* indices, - const cub::KeyValuePair* kvp, + const raft::KeyValuePair* kvp, size_t nnz, cudaStream_t stream) { @@ -384,7 +348,7 @@ void connect_components( * is guaranteed to be != color of its nearest neighbor. */ rmm::device_uvector nn_colors(n_rows, stream); - rmm::device_uvector> temp_inds_dists(n_rows, stream); + rmm::device_uvector> temp_inds_dists(n_rows, stream); rmm::device_uvector src_indices(n_rows, stream); perform_1nn(temp_inds_dists.data(), diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh index 6d3289e14c..bf0df065b2 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh @@ -84,9 +84,9 @@ inline void predict_float_core(const handle_t& handle, auto workspace = raft::make_device_mdarray( handle, mr, make_extents((sizeof(int)) * n_rows)); - auto minClusterAndDistance = raft::make_device_mdarray, IdxT>( + auto minClusterAndDistance = raft::make_device_mdarray, IdxT>( handle, mr, make_extents(n_rows)); - cub::KeyValuePair initial_value(0, std::numeric_limits::max()); + raft::KeyValuePair initial_value(0, std::numeric_limits::max()); thrust::fill(handle.get_thrust_policy(), minClusterAndDistance.data_handle(), minClusterAndDistance.data_handle() + minClusterAndDistance.size(), @@ -97,7 +97,7 @@ inline void predict_float_core(const handle_t& handle, raft::linalg::rowNorm( centroidsNorm.data_handle(), centers, dim, n_clusters, raft::linalg::L2Norm, true, stream); - raft::distance::fusedL2NNMinReduce, IdxT>( + raft::distance::fusedL2NNMinReduce, IdxT>( minClusterAndDistance.data_handle(), dataset, centers, @@ -117,7 +117,7 @@ inline void predict_float_core(const handle_t& handle, minClusterAndDistance.data_handle(), minClusterAndDistance.data_handle() + n_rows, labels, - [=] __device__(cub::KeyValuePair kvp) { + [=] __device__(raft::KeyValuePair kvp) { return static_cast(kvp.key); }); break; diff --git a/cpp/src/distance/specializations/fused_l2_nn_double_int.cu b/cpp/src/distance/specializations/fused_l2_nn_double_int.cu index b032261169..4448ee0cc2 100644 --- a/cpp/src/distance/specializations/fused_l2_nn_double_int.cu +++ b/cpp/src/distance/specializations/fused_l2_nn_double_int.cu @@ -14,13 +14,14 @@ * limitations under the License. */ +#include #include namespace raft { namespace distance { -template void fusedL2NNMinReduce, int>( - cub::KeyValuePair* min, +template void fusedL2NNMinReduce, int>( + raft::KeyValuePair* min, const double* x, const double* y, const double* xn, diff --git a/cpp/src/distance/specializations/fused_l2_nn_double_int64.cu b/cpp/src/distance/specializations/fused_l2_nn_double_int64.cu index a208b013d5..54478a1656 100644 --- a/cpp/src/distance/specializations/fused_l2_nn_double_int64.cu +++ b/cpp/src/distance/specializations/fused_l2_nn_double_int64.cu @@ -14,13 +14,14 @@ * limitations under the License. */ +#include #include namespace raft { namespace distance { -template void fusedL2NNMinReduce, int64_t>( - cub::KeyValuePair* min, +template void fusedL2NNMinReduce, int64_t>( + raft::KeyValuePair* min, const double* x, const double* y, const double* xn, diff --git a/cpp/src/distance/specializations/fused_l2_nn_float_int.cu b/cpp/src/distance/specializations/fused_l2_nn_float_int.cu index f58349a826..e25c9fad91 100644 --- a/cpp/src/distance/specializations/fused_l2_nn_float_int.cu +++ b/cpp/src/distance/specializations/fused_l2_nn_float_int.cu @@ -14,13 +14,14 @@ * limitations under the License. */ +#include #include namespace raft { namespace distance { -template void fusedL2NNMinReduce, int>( - cub::KeyValuePair* min, +template void fusedL2NNMinReduce, int>( + raft::KeyValuePair* min, const float* x, const float* y, const float* xn, diff --git a/cpp/src/distance/specializations/fused_l2_nn_float_int64.cu b/cpp/src/distance/specializations/fused_l2_nn_float_int64.cu index e43c3aa4e9..b7abd91304 100644 --- a/cpp/src/distance/specializations/fused_l2_nn_float_int64.cu +++ b/cpp/src/distance/specializations/fused_l2_nn_float_int64.cu @@ -14,13 +14,14 @@ * limitations under the License. */ +#include #include namespace raft { namespace distance { -template void fusedL2NNMinReduce, int64_t>( - cub::KeyValuePair* min, +template void fusedL2NNMinReduce, int64_t>( + raft::KeyValuePair* min, const float* x, const float* y, const float* xn, diff --git a/cpp/test/distance/fused_l2_nn.cu b/cpp/test/distance/fused_l2_nn.cu index 2838a2209e..fdb6bf68fe 100644 --- a/cpp/test/distance/fused_l2_nn.cu +++ b/cpp/test/distance/fused_l2_nn.cu @@ -16,6 +16,7 @@ #include "../test_utils.h" #include +#include #include #include #include @@ -23,19 +24,20 @@ #include #include -// TODO: Once fusedL2NN is specialized in the raft_distance shared library, add -// the following: -// -// #if defined RAFT_NN_COMPILED -// #include -// #endif +#if defined RAFT_NN_COMPILED +#include +#endif + +#if defined RAFT_DISTANCE_COMPILED +#include +#endif namespace raft { namespace distance { template struct CubKVPMinReduce { - typedef cub::KeyValuePair KVP; + typedef raft::KeyValuePair KVP; DI KVP operator()(LabelT rit, const KVP& a, const KVP& b) { return b.value < a.value ? b : a; } @@ -44,7 +46,7 @@ struct CubKVPMinReduce { }; // KVPMinReduce template -__global__ void naiveKernel(cub::KeyValuePair* min, +__global__ void naiveKernel(raft::KeyValuePair* min, DataT* x, DataT* y, int m, @@ -64,10 +66,10 @@ __global__ void naiveKernel(cub::KeyValuePair* min, } if (Sqrt) { acc = raft::mySqrt(acc); } ReduceOpT redOp; - typedef cub::WarpReduce> WarpReduce; + typedef cub::WarpReduce> WarpReduce; __shared__ typename WarpReduce::TempStorage temp[NWARPS]; int warpId = threadIdx.x / raft::WarpSize; - cub::KeyValuePair tmp; + raft::KeyValuePair tmp; tmp.key = nidx; tmp.value = midx >= m || nidx >= n ? maxVal : acc; tmp = WarpReduce(temp[warpId]).Reduce(tmp, CubKVPMinReduce()); @@ -82,7 +84,7 @@ __global__ void naiveKernel(cub::KeyValuePair* min, } template -void naive(cub::KeyValuePair* min, +void naive(raft::KeyValuePair* min, DataT* x, DataT* y, int m, @@ -96,7 +98,7 @@ void naive(cub::KeyValuePair* min, RAFT_CUDA_TRY(cudaMemsetAsync(workspace, 0, sizeof(int) * m, stream)); auto blks = raft::ceildiv(m, 256); MinAndDistanceReduceOp op; - detail::initKernel, int> + detail::initKernel, int> <<>>(min, m, std::numeric_limits::max(), op); RAFT_CUDA_TRY(cudaGetLastError()); naiveKernel, 16> @@ -165,8 +167,8 @@ class FusedL2NNTest : public ::testing::TestWithParam> { rmm::device_uvector y; rmm::device_uvector xn; rmm::device_uvector yn; - rmm::device_uvector> min; - rmm::device_uvector> min_ref; + rmm::device_uvector> min; + rmm::device_uvector> min_ref; rmm::device_uvector workspace; raft::handle_t handle; cudaStream_t stream; @@ -179,33 +181,34 @@ class FusedL2NNTest : public ::testing::TestWithParam> { naive(min_ref.data(), x.data(), y.data(), m, n, k, (int*)workspace.data(), stream); } - void runTest(cub::KeyValuePair* out) + void runTest(raft::KeyValuePair* out) { int m = params.m; int n = params.n; int k = params.k; MinAndDistanceReduceOp redOp; - fusedL2NN, int>(out, - x.data(), - y.data(), - xn.data(), - yn.data(), - m, - n, - k, - (void*)workspace.data(), - redOp, - raft::distance::KVPMinReduce(), - Sqrt, - true, - stream); + fusedL2NN, int>( + out, + x.data(), + y.data(), + xn.data(), + yn.data(), + m, + n, + k, + (void*)workspace.data(), + redOp, + raft::distance::KVPMinReduce(), + Sqrt, + true, + stream); RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } }; template struct CompareApproxAbsKVP { - typedef typename cub::KeyValuePair KVP; + typedef typename raft::KeyValuePair KVP; CompareApproxAbsKVP(T eps_) : eps(eps_) {} bool operator()(const KVP& a, const KVP& b) const { @@ -221,7 +224,7 @@ struct CompareApproxAbsKVP { template struct CompareExactKVP { - typedef typename cub::KeyValuePair KVP; + typedef typename raft::KeyValuePair KVP; bool operator()(const KVP& a, const KVP& b) const { if (a.value != b.value) return false; @@ -230,13 +233,13 @@ struct CompareExactKVP { }; template -::testing::AssertionResult devArrMatch(const cub::KeyValuePair* expected, - const cub::KeyValuePair* actual, +::testing::AssertionResult devArrMatch(const raft::KeyValuePair* expected, + const raft::KeyValuePair* actual, size_t size, L eq_compare, cudaStream_t stream = 0) { - typedef typename cub::KeyValuePair KVP; + typedef typename raft::KeyValuePair KVP; std::shared_ptr exp_h(new KVP[size]); std::shared_ptr act_h(new KVP[size]); raft::update_host(exp_h.get(), expected, size, stream); @@ -384,7 +387,7 @@ class FusedL2NNDetTest : public FusedL2NNTest { raft::handle_t handle; cudaStream_t stream; - rmm::device_uvector> min1; + rmm::device_uvector> min1; static const int NumRepeats = 100;