From 87d620ff15de25086d83be8dbfa19756e906cb61 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 20 Oct 2022 19:06:27 -0400 Subject: [PATCH 01/14] Correcting spatial::knn docs to raft::neighbors --- docs/source/cpp_api.rst | 6 +-- .../cpp_api/{clustering.rst => cluster.rst} | 6 +-- docs/source/cpp_api/distance.rst | 10 +++++ docs/source/cpp_api/neighbors.rst | 43 +++++++++++++++++++ .../cpp_api/{optimization.rst => solver.rst} | 0 docs/source/cpp_api/spatial.rst | 31 ------------- 6 files changed, 59 insertions(+), 37 deletions(-) rename docs/source/cpp_api/{clustering.rst => cluster.rst} (77%) create mode 100644 docs/source/cpp_api/distance.rst create mode 100644 docs/source/cpp_api/neighbors.rst rename docs/source/cpp_api/{optimization.rst => solver.rst} (100%) delete mode 100644 docs/source/cpp_api/spatial.rst diff --git a/docs/source/cpp_api.rst b/docs/source/cpp_api.rst index db139031a2..05d3686dc3 100644 --- a/docs/source/cpp_api.rst +++ b/docs/source/cpp_api.rst @@ -9,11 +9,11 @@ RAFT C++ API Reference :maxdepth: 4 cpp_api/core.rst - cpp_api/clustering.rst + cpp_api/cluster.rst cpp_api/linalg.rst cpp_api/matrix.rst - cpp_api/optimization.rst + cpp_api/solver.rst cpp_api/random.rst - cpp_api/spatial.rst + cpp_api/distance.rst cpp_api/sparse.rst cpp_api/stats.rst \ No newline at end of file diff --git a/docs/source/cpp_api/clustering.rst b/docs/source/cpp_api/cluster.rst similarity index 77% rename from docs/source/cpp_api/clustering.rst rename to docs/source/cpp_api/cluster.rst index 90ca786cc1..781180a72a 100644 --- a/docs/source/cpp_api/clustering.rst +++ b/docs/source/cpp_api/cluster.rst @@ -1,7 +1,7 @@ -Clustering -========== +Cluster +======= -This page provides C++ class references for the publicly-exposed elements of the clustering package. +This page provides C++ class references for the publicly-exposed elements of the cluster package. .. 
doxygennamespace:: raft::cluster :project: RAFT diff --git a/docs/source/cpp_api/distance.rst b/docs/source/cpp_api/distance.rst new file mode 100644 index 0000000000..c2bce860d5 --- /dev/null +++ b/docs/source/cpp_api/distance.rst @@ -0,0 +1,10 @@ +Distance +======== + +This page provides C++ class references for the publicly-exposed elements of the distance package. + +Distance +######## + +.. doxygennamespace:: raft::distance + :project: RAFT diff --git a/docs/source/cpp_api/neighbors.rst b/docs/source/cpp_api/neighbors.rst new file mode 100644 index 0000000000..962bbd1efe --- /dev/null +++ b/docs/source/cpp_api/neighbors.rst @@ -0,0 +1,43 @@ +Neighbors +========= + +This page provides C++ class references for the publicly-exposed elements of the neighbors package. + + +Brute-force +----------- + +.. doxygennamespace:: raft::neighbors::brute_force + :project: RAFT + + +IVF-Flat +-------- + +.. doxygennamespace:: raft::neighbors::ivf_flat + :project: RAFT + :members: + + +IVF-PQ +-------- + +.. doxygennamespace:: raft::neighbors::ivf_pq + :project: RAFT + :members: + + +Epsilon Neighborhood +-------------------- + +.. doxygennamespace:: raft::neighbors::epsilon_neighborhood + :project: RAFT + :members: + + +Random Ball Cover +----------------- + +.. doxygennamespace:: raft::neighbors::ball_cover + :project: RAFT + :members: diff --git a/docs/source/cpp_api/optimization.rst b/docs/source/cpp_api/solver.rst similarity index 100% rename from docs/source/cpp_api/optimization.rst rename to docs/source/cpp_api/solver.rst diff --git a/docs/source/cpp_api/spatial.rst b/docs/source/cpp_api/spatial.rst deleted file mode 100644 index 9bda00dab7..0000000000 --- a/docs/source/cpp_api/spatial.rst +++ /dev/null @@ -1,31 +0,0 @@ -Spatial -======= - -This page provides C++ class references for the publicly-exposed elements of the spatial package. - -Distance -######## - -.. doxygennamespace:: raft::distance - :project: RAFT - - -Nearest Neighbors -################# - -.. 
doxygenfunction:: raft::spatial::knn::brute_force_knn - :project: RAFT - -.. doxygenfunction:: raft::spatial::knn::select_k - :project: RAFT - -.. doxygenfunction:: raft::spatial::knn::knn_merge_parts - :project: RAFT - - -IVF-Flat --------- - -.. doxygennamespace:: raft::spatial::knn::ivf_flat - :project: RAFT - :members: From febd1d40eb2bc511731b9335912e8bbf9cbeb3f8 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 20 Oct 2022 19:08:48 -0400 Subject: [PATCH 02/14] Adding neighbors to index --- docs/source/cpp_api.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/cpp_api.rst b/docs/source/cpp_api.rst index 05d3686dc3..d10d9773a5 100644 --- a/docs/source/cpp_api.rst +++ b/docs/source/cpp_api.rst @@ -10,10 +10,11 @@ RAFT C++ API Reference cpp_api/core.rst cpp_api/cluster.rst + cpp_api/distance.rst cpp_api/linalg.rst cpp_api/matrix.rst + cpp_api/neighbors.rst cpp_api/solver.rst cpp_api/random.rst - cpp_api/distance.rst cpp_api/sparse.rst cpp_api/stats.rst \ No newline at end of file From 63e2e8bdaa1bc71f41bc59c46f86bf61e9511b6a Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 20 Oct 2022 20:01:34 -0400 Subject: [PATCH 03/14] Updating docs --- cpp/include/raft/cluster/kmeans.cuh | 470 ++++++++++++++++++ cpp/include/raft/cluster/kmeans_types.hpp | 13 +- cpp/include/raft/cluster/single_linkage.cuh | 81 +++ .../raft/cluster/single_linkage_types.hpp | 8 + docs/source/cpp_api/cluster.rst | 17 +- docs/source/cpp_api/core.rst | 63 ++- docs/source/cpp_api/solver.rst | 5 +- 7 files changed, 647 insertions(+), 10 deletions(-) diff --git a/cpp/include/raft/cluster/kmeans.cuh b/cpp/include/raft/cluster/kmeans.cuh index 0ce35da4a5..d737b1b736 100644 --- a/cpp/include/raft/cluster/kmeans.cuh +++ b/cpp/include/raft/cluster/kmeans.cuh @@ -21,6 +21,476 @@ #include #include +namespace raft::cluster::kmeans { + +/** + * @brief Find clusters with k-means algorithm. + * Initial centroids are chosen with k-means++ algorithm. 
Empty + * clusters are reinitialized by choosing new centroids with + * k-means++ algorithm. + * @tparam DataT the type of data used for weights, distances. + * @tparam IndexT the type of data used for indexing. + * @param[in] handle The raft handle. + * @param[in] params Parameters for KMeans model. + * @param[in] X Training instances to cluster. The data must + * be in row-major format. + * [dim = n_samples x n_features] + * @param[in] sample_weight Optional weights for each observation in X. + * [len = n_samples] + * @param[inout] centroids [in] When init is InitMethod::Array, use + * centroids as the initial cluster centers. + * [out] The generated centroids from the + * kmeans algorithm are stored at the address + * pointed by 'centroids'. + * [dim = n_clusters x n_features] + * @param[out] inertia Sum of squared distances of samples to their + * closest cluster center. + * @param[out] n_iter Number of iterations run. + */ +template +void fit(handle_t const& handle, + const KMeansParams& params, + raft::device_matrix_view X, + std::optional> sample_weight, + raft::device_matrix_view centroids, + raft::host_scalar_view inertia, + raft::host_scalar_view n_iter) +{ + detail::kmeans_fit(handle, params, X, sample_weight, centroids, inertia, n_iter); +} + +template +void fit(handle_t const& handle, + const KMeansParams& params, + const DataT* X, + const DataT* sample_weight, + DataT* centroids, + IndexT n_samples, + IndexT n_features, + DataT& inertia, + IndexT& n_iter) +{ + detail::kmeans_fit( + handle, params, X, sample_weight, centroids, n_samples, n_features, inertia, n_iter); +} + +/** + * @brief Predict the closest cluster each sample in X belongs to. + * @tparam DataT the type of data used for weights, distances. + * @tparam IndexT the type of data used for indexing. + * @param[in] handle The raft handle. + * @param[in] params Parameters for KMeans model. + * @param[in] X New data to predict. 
+ * [dim = n_samples x n_features] + * @param[in] sample_weight Optional weights for each observation in X. + * [len = n_samples] + * @param[in] centroids Cluster centroids. The data must be in + * row-major format. + * [dim = n_clusters x n_features] + * @param[in] normalize_weight True if the weights should be normalized + * @param[out] labels Index of the cluster each sample in X + * belongs to. + * [len = n_samples] + * @param[out] inertia Sum of squared distances of samples to + * their closest cluster center. + */ +template +void predict(handle_t const& handle, + const KMeansParams& params, + raft::device_matrix_view X, + std::optional> sample_weight, + raft::device_matrix_view centroids, + raft::device_vector_view labels, + bool normalize_weight, + raft::host_scalar_view inertia) +{ + detail::kmeans_predict( + handle, params, X, sample_weight, centroids, labels, normalize_weight, inertia); +} + +template +void predict(handle_t const& handle, + const KMeansParams& params, + const DataT* X, + const DataT* sample_weight, + const DataT* centroids, + IndexT n_samples, + IndexT n_features, + IndexT* labels, + bool normalize_weight, + DataT& inertia) +{ + detail::kmeans_predict(handle, + params, + X, + sample_weight, + centroids, + n_samples, + n_features, + labels, + normalize_weight, + inertia); +} + +/** + * @brief Compute k-means clustering and predicts cluster index for each sample + * in the input. + * + * @tparam DataT the type of data used for weights, distances. + * @tparam IndexT the type of data used for indexing. + * @param[in] handle The raft handle. + * @param[in] params Parameters for KMeans model. + * @param[in] X Training instances to cluster. The data must be + * in row-major format. + * [dim = n_samples x n_features] + * @param[in] sample_weight Optional weights for each observation in X. 
+ * [len = n_samples] + * @param[inout] centroids Optional + * [in] When init is InitMethod::Array, use + * centroids as the initial cluster centers + * [out] The generated centroids from the + * kmeans algorithm are stored at the address + * pointed by 'centroids'. + * [dim = n_clusters x n_features] + * @param[out] labels Index of the cluster each sample in X belongs + * to. + * [len = n_samples] + * @param[out] inertia Sum of squared distances of samples to their + * closest cluster center. + * @param[out] n_iter Number of iterations run. + */ +template +void fit_predict(handle_t const& handle, + const KMeansParams& params, + raft::device_matrix_view X, + std::optional> sample_weight, + std::optional> centroids, + raft::device_vector_view labels, + raft::host_scalar_view inertia, + raft::host_scalar_view n_iter) +{ + detail::kmeans_fit_predict( + handle, params, X, sample_weight, centroids, labels, inertia, n_iter); +} + +template +void fit_predict(handle_t const& handle, + const KMeansParams& params, + const DataT* X, + const DataT* sample_weight, + DataT* centroids, + IndexT n_samples, + IndexT n_features, + IndexT* labels, + DataT& inertia, + IndexT& n_iter) +{ + detail::kmeans_fit_predict( + handle, params, X, sample_weight, centroids, n_samples, n_features, labels, inertia, n_iter); +} + +/** + * @brief Transform X to a cluster-distance space. + * + * @tparam DataT the type of data used for weights, distances. + * @tparam IndexT the type of data used for indexing. + * @param[in] handle The raft handle. + * @param[in] params Parameters for KMeans model. + * @param[in] X Training instances to cluster. The data must + * be in row-major format + * [dim = n_samples x n_features] + * @param[in] centroids Cluster centroids. The data must be in row-major format. + * [dim = n_clusters x n_features] + * @param[out] X_new X transformed in the new space. 
+ * [dim = n_samples x n_features] + */ +template +void transform(const raft::handle_t& handle, + const KMeansParams& params, + raft::device_matrix_view X, + raft::device_matrix_view centroids, + raft::device_matrix_view X_new) +{ + detail::kmeans_transform(handle, params, X, centroids, X_new); +} + +template +void transform(const raft::handle_t& handle, + const KMeansParams& params, + const DataT* X, + const DataT* centroids, + IndexT n_samples, + IndexT n_features, + DataT* X_new) +{ + detail::kmeans_transform( + handle, params, X, centroids, n_samples, n_features, X_new); +} + +template +using SamplingOp = detail::SamplingOp; + +template +using KeyValueIndexOp = detail::KeyValueIndexOp; + +/** + * @brief Select centroids according to a sampling operation + * + * @tparam DataT the type of data used for weights, distances. + * @tparam IndexT the type of data used for indexing. + * + * @param[in] handle The raft handle + * @param[in] X The data in row-major format + * [dim = n_samples x n_features] + * @param[in] minClusterDistance Distance for every sample to its nearest centroid + * [dim = n_samples] + * @param[in] isSampleCentroid Flag the sample chosen as initial centroid + * [dim = n_samples] + * @param[in] select_op The sampling operation used to select the centroids + * @param[out] inRankCp The sampled centroids + * [dim = n_selected_centroids x n_features] + * @param[in] workspace Temporary workspace buffer which can get resized + * + */ +template +void sample_centroids(const raft::handle_t& handle, + const raft::device_matrix_view& X, + const raft::device_vector_view& minClusterDistance, + const raft::device_vector_view& isSampleCentroid, + SamplingOp& select_op, + rmm::device_uvector& inRankCp, + rmm::device_uvector& workspace) +{ + detail::sampleCentroids( + handle, X, minClusterDistance, isSampleCentroid, select_op, inRankCp, workspace); +} + +/** + * @brief Compute cluster cost + * + * @tparam DataT the type of data used for weights, distances.
+ * @tparam ReductionOpT the type of data used for the reduction operation. + * + * @param[in] handle The raft handle + * @param[in] minClusterDistance Distance for every sample to its nearest centroid + * [dim = n_samples] + * @param[in] workspace Temporary workspace buffer which can get resized + * @param[out] clusterCost Resulting cluster cost + * @param[in] reduction_op The reduction operation used for the cost + * + */ +template +void cluster_cost(const raft::handle_t& handle, + const raft::device_vector_view& minClusterDistance, + rmm::device_uvector& workspace, + const raft::device_scalar_view& clusterCost, + ReductionOpT reduction_op) +{ + detail::computeClusterCost( + handle, minClusterDistance, workspace, clusterCost, reduction_op); +} + +/** + * @brief Compute distance for every sample to its nearest centroid + * + * @tparam DataT the type of data used for weights, distances. + * @tparam IndexT the type of data used for indexing. + * + * @param[in] handle The raft handle + * @param[in] params The parameters for KMeans + * @param[in] X The data in row-major format + * [dim = n_samples x n_features] + * @param[in] centroids Centroids data + * [dim = n_cluster x n_features] + * @param[out] minClusterDistance Distance for every sample to its nearest centroid + * [dim = n_samples] + * @param[in] L2NormX L2 norm of X : ||x||^2 + * [dim = n_samples] + * @param[out] L2NormBuf_OR_DistBuf Resizable buffer to store L2 norm of centroids or distance + * matrix + * @param[in] workspace Temporary workspace buffer which can get resized + * + */ +template +void min_cluster_distance_compute(const raft::handle_t& handle, + const KMeansParams& params, + const raft::device_matrix_view& X, + const raft::device_matrix_view& centroids, + const raft::device_vector_view& minClusterDistance, + const raft::device_vector_view& L2NormX, + rmm::device_uvector& L2NormBuf_OR_DistBuf, + rmm::device_uvector& workspace) +{ + detail::minClusterDistanceCompute( + handle, params, X,
centroids, minClusterDistance, L2NormX, L2NormBuf_OR_DistBuf, workspace); +} + +/** + * @brief Calculates a pair for every sample in input 'X' where key is an + * index of one of the 'centroids' (index of the nearest centroid) and 'value' + * is the distance between the sample and the 'centroid[key]' + * + * @tparam DataT the type of data used for weights, distances. + * @tparam IndexT the type of data used for indexing. + * + * @param[in] handle The raft handle + * @param[in] params The parameters for KMeans + * @param[in] X The data in row-major format + * [dim = n_samples x n_features] + * @param[in] centroids Centroids data + * [dim = n_cluster x n_features] + * @param[out] minClusterAndDistance Distance vector that contains for every sample, the nearest + * centroid and its distance + * [dim = n_samples] + * @param[in] L2NormX L2 norm of X : ||x||^2 + * [dim = n_samples] + * @param[out] L2NormBuf_OR_DistBuf Resizable buffer to store L2 norm of centroids or distance + * matrix + * @param[in] workspace Temporary workspace buffer which can get resized + * + */ +template +void min_cluster_and_distance( + const raft::handle_t& handle, + const KMeansParams& params, + const raft::device_matrix_view X, + const raft::device_matrix_view centroids, + const raft::device_vector_view, IndexT>& minClusterAndDistance, + const raft::device_vector_view& L2NormX, + rmm::device_uvector& L2NormBuf_OR_DistBuf, + rmm::device_uvector& workspace) +{ + detail::minClusterAndDistanceCompute( + handle, params, X, centroids, minClusterAndDistance, L2NormX, L2NormBuf_OR_DistBuf, workspace); +} + +/** + * @brief Shuffle and randomly select 'n_samples_to_gather' from input 'in' and stores + * in 'out' does not modify the input + * + * @tparam DataT the type of data used for weights, distances. + * @tparam IndexT the type of data used for indexing.
+ * + * @param[in] handle The raft handle + * @param[in] in The data to shuffle and gather + * [dim = n_samples x n_features] + * @param[out] out The sampled data + * [dim = n_samples_to_gather x n_features] + * @param[in] n_samples_to_gather Number of samples to gather + * @param[in] seed Seed for the shuffle + * @param[in] workspace Temporary workspace buffer which can get resized + * + */ +template +void shuffle_and_gather(const raft::handle_t& handle, + const raft::device_matrix_view& in, + const raft::device_matrix_view& out, + uint32_t n_samples_to_gather, + uint64_t seed, + rmm::device_uvector* workspace = nullptr) +{ + detail::shuffleAndGather(handle, in, out, n_samples_to_gather, seed, workspace); +} + +/** + * @brief Count the number of samples in each cluster + * + * @tparam DataT the type of data used for weights, distances. + * @tparam IndexT the type of data used for indexing. + * + * @param[in] handle The raft handle + * @param[in] params The parameters for KMeans + * @param[in] X The data in row-major format + * [dim = n_samples x n_features] + * @param[in] L2NormX L2 norm of X : ||x||^2 + * [dim = n_samples] + * @param[in] centroids Centroids data + * [dim = n_cluster x n_features] + * @param[in] workspace Temporary workspace buffer which can get resized + * @param[out] sampleCountInCluster The count for each centroid + * [dim = n_cluster] + * + */ +template +void count_samples_in_cluster(const raft::handle_t& handle, + const KMeansParams& params, + const raft::device_matrix_view& X, + const raft::device_vector_view& L2NormX, + const raft::device_matrix_view& centroids, + rmm::device_uvector& workspace, + const raft::device_vector_view& sampleCountInCluster) +{ + detail::countSamplesInCluster( + handle, params, X, L2NormX, centroids, workspace, sampleCountInCluster); +} + +/** + * @brief Selects 'n_clusters' samples from the input X using kmeans++ algorithm.
+ + * @note This is the algorithm described in + * "k-means++: the advantages of careful seeding". 2007, Arthur, D. and Vassilvitskii, S. + * ACM-SIAM symposium on Discrete algorithms. + * + * @tparam DataT the type of data used for weights, distances. + * @tparam IndexT the type of data used for indexing. + * + * @param[in] handle The raft handle + * @param[in] params The parameters for KMeans + * @param[in] X The data in row-major format + * [dim = n_samples x n_features] + * @param[out] centroidsRawData Centroids data + * [dim = n_cluster x n_features] + * @param[in] workspace Temporary workspace buffer which can get resized + */ +template +void init_plus_plus(const raft::handle_t& handle, + const KMeansParams& params, + const raft::device_matrix_view& X, + const raft::device_matrix_view& centroidsRawData, + rmm::device_uvector& workspace) +{ + detail::kmeansPlusPlus(handle, params, X, centroidsRawData, workspace); +} + +/** + * @brief Main function used to fit KMeans (after cluster initialization) + * + * @tparam DataT the type of data used for weights, distances. + * @tparam IndexT the type of data used for indexing. + * + * @param[in] handle The raft handle. + * @param[in] params Parameters for KMeans model. + * @param[in] X Training instances to cluster. The data must + * be in row-major format. + * [dim = n_samples x n_features] + * @param[in] weight Weights for each observation in X. + * [len = n_samples] + * @param[inout] centroidsRawData [in] Initial cluster centers. + * [out] The generated centroids from the + * kmeans algorithm are stored at the address + * pointed by 'centroidsRawData'. + * [dim = n_clusters x n_features] + * @param[out] inertia Sum of squared distances of samples to their + * closest cluster center. + * @param[out] n_iter Number of iterations run.
+ * @param[in] workspace Temporary workspace buffer which can get resized + */ +template +void fit_main(const raft::handle_t& handle, + const KMeansParams& params, + const raft::device_matrix_view& X, + const raft::device_vector_view& weight, + const raft::device_matrix_view& centroidsRawData, + const raft::host_scalar_view& inertia, + const raft::host_scalar_view& n_iter, + rmm::device_uvector& workspace) +{ + detail::kmeans_fit_main( + handle, params, X, weight, centroidsRawData, inertia, n_iter, workspace); +} + +} // end namespace raft::cluster::kmeans + namespace raft::cluster { /** * @brief Find clusters with k-means algorithm. diff --git a/cpp/include/raft/cluster/kmeans_types.hpp b/cpp/include/raft/cluster/kmeans_types.hpp index 87fc7c1880..bb8e1a2b73 100644 --- a/cpp/include/raft/cluster/kmeans_types.hpp +++ b/cpp/include/raft/cluster/kmeans_types.hpp @@ -18,8 +18,9 @@ #include #include -namespace raft { -namespace cluster { +namespace raft::cluster { + +namespace kmeans { struct KMeansParams { enum InitMethod { KMeansPlusPlus, Random, Array }; @@ -69,5 +70,9 @@ struct KMeansParams { bool inertia_check = false; }; -} // namespace cluster -} // namespace raft + +} // namespace kmeans + +using kmeans::KMeansParams; + +} // namespace raft::cluster diff --git a/cpp/include/raft/cluster/single_linkage.cuh b/cpp/include/raft/cluster/single_linkage.cuh index 8e33b8389d..ca2234f01f 100644 --- a/cpp/include/raft/cluster/single_linkage.cuh +++ b/cpp/include/raft/cluster/single_linkage.cuh @@ -21,6 +21,87 @@ namespace raft::cluster { +namespace hierarchy { +constexpr int DEFAULT_CONST_C = 15; + +/** + * Single-linkage clustering, capable of constructing a KNN graph to + * scale the algorithm beyond the n^2 memory consumption of implementations + * that use the fully-connected graph of pairwise distances by connecting + * a knn graph when k is not large enough to connect it. 
+ + * @tparam value_idx + * @tparam value_t + * @tparam dist_type method to use for constructing connectivities graph + * @param[in] handle raft handle + * @param[in] X dense input matrix in row-major layout + * @param[in] m number of rows in X + * @param[in] n number of columns in X + * @param[in] metric distance metric to use when constructing connectivities graph + * @param[out] out struct containing output dendrogram and cluster assignments + * @param[in] c a constant used when constructing connectivities from knn graph. Allows the indirect + control + * of k. The algorithm will set `k = log(n) + c` + * @param[in] n_clusters number of clusters to assign data samples + */ +template +void single_linkage(const raft::handle_t& handle, + const value_t* X, + size_t m, + size_t n, + raft::distance::DistanceType metric, + linkage_output* out, + int c, + size_t n_clusters) +{ + detail::single_linkage( + handle, X, m, n, metric, out, c, n_clusters); +} + +/** + * Single-linkage clustering, capable of constructing a KNN graph to + * scale the algorithm beyond the n^2 memory consumption of implementations + * that use the fully-connected graph of pairwise distances by connecting + * a knn graph when k is not large enough to connect it. + + * @tparam value_idx + * @tparam value_t + * @tparam dist_type method to use for constructing connectivities graph + * @param[in] handle raft handle + * @param[in] X dense input matrix in row-major layout + * @param[out] dendrogram output dendrogram (size [n_rows - 1] * 2) + * @param[out] labels output labels vector (size n_rows) + * @param[in] metric distance metric to use when constructing connectivities graph + * @param[in] n_clusters number of clusters to assign data samples + * @param[in] c a constant used when constructing connectivities from knn graph. Allows the indirect + control of k.
The algorithm will set `k = log(n) + c` + */ +template +void single_linkage(const raft::handle_t& handle, + raft::device_matrix_view X, + raft::device_matrix_view dendrogram, + raft::device_vector_view labels, + raft::distance::DistanceType metric, + size_t n_clusters, + std::optional c = std::make_optional(DEFAULT_CONST_C)) +{ + linkage_output out_arrs; + out_arrs.children = dendrogram.data_handle(); + out_arrs.labels = labels.data_handle(); + + single_linkage(handle, + X.data_handle(), + static_cast(X.extent(0)), + static_cast(X.extent(1)), + metric, + &out_arrs, + c.has_value() ? c.value() : DEFAULT_CONST_C, + n_clusters); +} +} // namespace hierarchy + constexpr int DEFAULT_CONST_C = 15; /** diff --git a/cpp/include/raft/cluster/single_linkage_types.hpp b/cpp/include/raft/cluster/single_linkage_types.hpp index 79f2ede482..28b245a2cf 100644 --- a/cpp/include/raft/cluster/single_linkage_types.hpp +++ b/cpp/include/raft/cluster/single_linkage_types.hpp @@ -20,6 +20,7 @@ namespace raft::cluster { +namespace hierarchy { enum LinkageDistance { PAIRWISE = 0, KNN_GRAPH = 1 }; /** @@ -58,4 +59,11 @@ class linkage_output_int : public linkage_output { class linkage_output_int64 : public linkage_output { }; +} // end namespace hierarchy + +using hierarchy::linkage_output; +using hierarchy::linkage_output_int; +using hierarchy::linkage_output_int64; +using hierarchy::LinkageDistance; + }; // namespace raft::cluster \ No newline at end of file diff --git a/docs/source/cpp_api/cluster.rst b/docs/source/cpp_api/cluster.rst index 781180a72a..41816482cc 100644 --- a/docs/source/cpp_api/cluster.rst +++ b/docs/source/cpp_api/cluster.rst @@ -3,10 +3,25 @@ Cluster This page provides C++ class references for the publicly-exposed elements of the cluster package. -.. doxygennamespace:: raft::cluster +K-Means +------- + +.. doxygennamespace:: raft::cluster::kmeans + :project: RAFT + :members: + + +Hierarchical Clustering +----------------------- + +.. 
doxygennamespace:: raft::cluster::hierarchy :project: RAFT :members: + +Spectral Clustering +------------------- + .. doxygennamespace:: raft::spectral :project: RAFT :members: \ No newline at end of file diff --git a/docs/source/cpp_api/core.rst b/docs/source/cpp_api/core.rst index ef6270556e..d4891bf0b3 100644 --- a/docs/source/cpp_api/core.rst +++ b/docs/source/cpp_api/core.rst @@ -4,7 +4,6 @@ Core This page provides C++ class references for the publicly-exposed elements of the core package. - handle_t ######## @@ -20,6 +19,13 @@ interruptible :project: RAFT :members: +NVTX +#### + +.. doxygennamespace:: raft::common::nvtx + :project: RAFT + :members: + mdarray ####### @@ -28,11 +34,64 @@ mdarray :project: RAFT :members: +.. doxygenfunction:: raft::make_device_matrix + :project: RAFT + +.. doxygenfunction:: raft::make_device_vector + :project: RAFT + +.. doxygenfunction:: raft::make_device_scalar + :project: RAFT + +.. doxygenfunction:: raft::make_host_matrix + :project: RAFT + +.. doxygenfunction:: raft::make_host_vector + :project: RAFT + +.. doxygenfunction:: raft::make_host_scalar + :project: RAFT + + +mdspan +####### + +.. doxygenfunction:: raft::make_device_mdspan + :project: RAFT + +.. doxygenfunction:: raft::make_device_matrix_view + :project: RAFT + +.. doxygenfunction:: raft::make_device_vector_view + :project: RAFT + +.. doxygenfunction:: raft::make_device_scalar_view + :project: RAFT + +.. doxygenfunction:: raft::make_host_matrix_view + :project: RAFT + +.. doxygenfunction:: raft::make_host_vector_view + :project: RAFT + +.. doxygenfunction:: raft::make_host_scalar_view + :project: RAFT span #### -.. doxygenclass:: raft::span +.. doxygenclass:: raft::device_span + :project: RAFT + :members: + +.. doxygenclass:: raft::host_span + :project: RAFT + :members: + +Key-Value Pair +############## + +..
doxygenclass:: raft::KeyValuePair :project: RAFT :members: diff --git a/docs/source/cpp_api/solver.rst b/docs/source/cpp_api/solver.rst index 75cec2494e..a8b93ca046 100644 --- a/docs/source/cpp_api/solver.rst +++ b/docs/source/cpp_api/solver.rst @@ -7,13 +7,12 @@ This page provides C++ class references for the publicly-exposed elements of the Linear Assignment Problem ######################### -.. doxygenclass:: raft::lap::LinearAssignmentProblem +.. doxygenclass:: raft::solver::LinearAssignmentProblem :project: RAFT :members: Minimum Spanning Tree ##################### -.. doxygennamespace:: raft::mst +.. doxygenfunction:: raft::sparse::solver::mst :project: RAFT - :members: From 831c3d244dcb7c428a5f2e63ad964e265c87575d Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 20 Oct 2022 20:29:15 -0400 Subject: [PATCH 04/14] Making sure we call new cluster namespaced code from deprecated code to spot any bugs/syntax issues --- cpp/include/raft/cluster/kmeans.cuh | 71 ++++++++++----------- cpp/include/raft/cluster/single_linkage.cuh | 20 ++---- 2 files changed, 39 insertions(+), 52 deletions(-) diff --git a/cpp/include/raft/cluster/kmeans.cuh b/cpp/include/raft/cluster/kmeans.cuh index d737b1b736..6384cfdeaa 100644 --- a/cpp/include/raft/cluster/kmeans.cuh +++ b/cpp/include/raft/cluster/kmeans.cuh @@ -312,14 +312,14 @@ void cluster_cost(const raft::handle_t& handle, * */ template -void min_cluster_distance_compute(const raft::handle_t& handle, - const KMeansParams& params, - const raft::device_matrix_view& X, - const raft::device_matrix_view& centroids, - const raft::device_vector_view& minClusterDistance, - const raft::device_vector_view& L2NormX, - rmm::device_uvector& L2NormBuf_OR_DistBuf, - rmm::device_uvector& workspace) +void min_cluster_distance(const raft::handle_t& handle, + const KMeansParams& params, + const raft::device_matrix_view& X, + const raft::device_matrix_view& centroids, + const raft::device_vector_view& minClusterDistance, + const 
raft::device_vector_view& L2NormX, + rmm::device_uvector& L2NormBuf_OR_DistBuf, + rmm::device_uvector& workspace) { detail::minClusterDistanceCompute( handle, params, X, centroids, minClusterDistance, L2NormX, L2NormBuf_OR_DistBuf, workspace); @@ -525,7 +525,7 @@ void kmeans_fit(handle_t const& handle, raft::host_scalar_view inertia, raft::host_scalar_view n_iter) { - detail::kmeans_fit(handle, params, X, sample_weight, centroids, inertia, n_iter); + kmeans::fit(handle, params, X, sample_weight, centroids, inertia, n_iter); } template @@ -539,7 +539,7 @@ void kmeans_fit(handle_t const& handle, DataT& inertia, IndexT& n_iter) { - detail::kmeans_fit( + kmeans::fit( handle, params, X, sample_weight, centroids, n_samples, n_features, inertia, n_iter); } @@ -573,7 +573,7 @@ void kmeans_predict(handle_t const& handle, bool normalize_weight, raft::host_scalar_view inertia) { - detail::kmeans_predict( + kmeans::predict( handle, params, X, sample_weight, centroids, labels, normalize_weight, inertia); } @@ -589,16 +589,16 @@ void kmeans_predict(handle_t const& handle, bool normalize_weight, DataT& inertia) { - detail::kmeans_predict(handle, - params, - X, - sample_weight, - centroids, - n_samples, - n_features, - labels, - normalize_weight, - inertia); + kmeans::predict(handle, + params, + X, + sample_weight, + centroids, + n_samples, + n_features, + labels, + normalize_weight, + inertia); } /** @@ -638,7 +638,7 @@ void kmeans_fit_predict(handle_t const& handle, raft::host_scalar_view inertia, raft::host_scalar_view n_iter) { - detail::kmeans_fit_predict( + kmeans::fit_predict( handle, params, X, sample_weight, centroids, labels, inertia, n_iter); } @@ -654,7 +654,7 @@ void kmeans_fit_predict(handle_t const& handle, DataT& inertia, IndexT& n_iter) { - detail::kmeans_fit_predict( + kmeans::fit_predict( handle, params, X, sample_weight, centroids, n_samples, n_features, labels, inertia, n_iter); } @@ -680,7 +680,7 @@ void kmeans_transform(const raft::handle_t& handle, 
raft::device_matrix_view centroids, raft::device_matrix_view X_new) { - detail::kmeans_transform(handle, params, X, centroids, X_new); + kmeans::transform(handle, params, X, centroids, X_new); } template @@ -692,15 +692,14 @@ void kmeans_transform(const raft::handle_t& handle, IndexT n_features, DataT* X_new) { - detail::kmeans_transform( - handle, params, X, centroids, n_samples, n_features, X_new); + kmeans::transform(handle, params, X, centroids, n_samples, n_features, X_new); } template -using SamplingOp = detail::SamplingOp; +using SamplingOp = kmeans::SamplingOp; template -using KeyValueIndexOp = detail::KeyValueIndexOp; +using KeyValueIndexOp = kmeans::KeyValueIndexOp; /** * @brief Select centroids according to a sampling operation @@ -730,7 +729,7 @@ void sampleCentroids(const raft::handle_t& handle, rmm::device_uvector& inRankCp, rmm::device_uvector& workspace) { - detail::sampleCentroids( + kmeans::sample_entroids( handle, X, minClusterDistance, isSampleCentroid, select_op, inRankCp, workspace); } @@ -755,7 +754,7 @@ void computeClusterCost(const raft::handle_t& handle, const raft::device_scalar_view& clusterCost, ReductionOpT reduction_op) { - detail::computeClusterCost( + kmeans::cluster_cost( handle, minClusterDistance, workspace, clusterCost, reduction_op); } @@ -790,7 +789,7 @@ void minClusterDistanceCompute(const raft::handle_t& handle, rmm::device_uvector& L2NormBuf_OR_DistBuf, rmm::device_uvector& workspace) { - detail::minClusterDistanceCompute( + kmeans::min_cluster_distance( handle, params, X, centroids, minClusterDistance, L2NormX, L2NormBuf_OR_DistBuf, workspace); } @@ -829,7 +828,7 @@ void minClusterAndDistanceCompute( rmm::device_uvector& L2NormBuf_OR_DistBuf, rmm::device_uvector& workspace) { - detail::minClusterAndDistanceCompute( + kmeans::min_cluster_and_distance( handle, params, X, centroids, minClusterAndDistance, L2NormX, L2NormBuf_OR_DistBuf, workspace); } @@ -858,7 +857,7 @@ void shuffleAndGather(const raft::handle_t& handle, 
uint64_t seed, rmm::device_uvector* workspace = nullptr) { - detail::shuffleAndGather(handle, in, out, n_samples_to_gather, seed, workspace); + kmeans::shuffle_and_gather(handle, in, out, n_samples_to_gather, seed, workspace); } /** @@ -889,7 +888,7 @@ void countSamplesInCluster(const raft::handle_t& handle, rmm::device_uvector& workspace, const raft::device_vector_view& sampleCountInCluster) { - detail::countSamplesInCluster( + kmeans::count_samples_in_cluster( handle, params, X, L2NormX, centroids, workspace, sampleCountInCluster); } @@ -918,7 +917,7 @@ void kmeansPlusPlus(const raft::handle_t& handle, const raft::device_matrix_view& centroidsRawData, rmm::device_uvector& workspace) { - detail::kmeansPlusPlus(handle, params, X, centroidsRawData, workspace); + kmeans::init_plus_plus(handle, params, X, centroidsRawData, workspace); } /* @@ -954,7 +953,7 @@ void kmeans_fit_main(const raft::handle_t& handle, const raft::host_scalar_view& n_iter, rmm::device_uvector& workspace) { - detail::kmeans_fit_main( + kmeans::fit_main( handle, params, X, weight, centroidsRawData, inertia, n_iter, workspace); } } // namespace raft::cluster diff --git a/cpp/include/raft/cluster/single_linkage.cuh b/cpp/include/raft/cluster/single_linkage.cuh index ca2234f01f..36fc812445 100644 --- a/cpp/include/raft/cluster/single_linkage.cuh +++ b/cpp/include/raft/cluster/single_linkage.cuh @@ -102,8 +102,6 @@ void single_linkage(const raft::handle_t& handle, } } // namespace hierarchy -constexpr int DEFAULT_CONST_C = 15; - /** * Single-linkage clustering, capable of constructing a KNN graph to * scale the algorithm beyond the n^2 memory consumption of implementations @@ -136,7 +134,7 @@ void single_linkage(const raft::handle_t& handle, int c, size_t n_clusters) { - detail::single_linkage( + hierarchy::single_linkage( handle, X, m, n, metric, out, c, n_clusters); } @@ -165,20 +163,10 @@ void single_linkage(const raft::handle_t& handle, raft::device_vector_view labels, 
raft::distance::DistanceType metric, size_t n_clusters, - std::optional c = std::make_optional(DEFAULT_CONST_C)) + std::optional c = std::make_optional(hierarchy::DEFAULT_CONST_C)) { - linkage_output out_arrs; - out_arrs.children = dendrogram.data_handle(); - out_arrs.labels = labels.data_handle(); - - single_linkage(handle, - X.data_handle(), - static_cast(X.extent(0)), - static_cast(X.extent(1)), - metric, - &out_arrs, - c.has_value() ? c.value() : DEFAULT_CONST_C, - n_clusters); + hierarchy::single_linkage( + handle, X, dendrogram, labels, metric, n_clusters, c); } }; // namespace raft::cluster From 31047e638b94316af12d8fcd1b7057bf1f9caf1d Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 20 Oct 2022 20:41:11 -0400 Subject: [PATCH 05/14] Deprecation warnings --- cpp/include/raft/cluster/single_linkage.cuh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cpp/include/raft/cluster/single_linkage.cuh b/cpp/include/raft/cluster/single_linkage.cuh index 36fc812445..0a79601647 100644 --- a/cpp/include/raft/cluster/single_linkage.cuh +++ b/cpp/include/raft/cluster/single_linkage.cuh @@ -102,6 +102,12 @@ void single_linkage(const raft::handle_t& handle, } } // namespace hierarchy +/** + * Note: All of the functions below in the raft::cluster namespace are deprecated + * and will be removed in a future release. Please use raft::cluster::hierarchy + * instead. + */ + /** * Single-linkage clustering, capable of constructing a KNN graph to * scale the algorithm beyond the n^2 memory consumption of implementations From fd8899ce7a2f5afa0066cacf83c919db20a9cee0 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Thu, 20 Oct 2022 21:10:38 -0400 Subject: [PATCH 06/14] Fixing typo --- cpp/include/raft/cluster/kmeans.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/raft/cluster/kmeans.cuh b/cpp/include/raft/cluster/kmeans.cuh index 10004b69bf..cfd47d4058 100644 --- a/cpp/include/raft/cluster/kmeans.cuh +++ b/cpp/include/raft/cluster/kmeans.cuh @@ -735,7 +735,7 @@ void sampleCentroids(const raft::handle_t& handle, rmm::device_uvector& inRankCp, rmm::device_uvector& workspace) { - kmeans::sample_entroids( + kmeans::sample_centroids( handle, X, minClusterDistance, isSampleCentroid, select_op, inRankCp, workspace); } From 4968ce88263150065d0d13dead09a0b5b73f85e3 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 20 Oct 2022 21:32:52 -0400 Subject: [PATCH 07/14] Fixing hierarchical compile error --- cpp/include/raft/cluster/single_linkage.cuh | 106 ++++-------------- .../raft/cluster/single_linkage_types.hpp | 16 ++- cpp/test/CMakeLists.txt | 2 +- cpp/test/{sparse => cluster}/linkage.cu | 21 ++-- 4 files changed, 39 insertions(+), 106 deletions(-) rename cpp/test/{sparse => cluster}/linkage.cu (98%) diff --git a/cpp/include/raft/cluster/single_linkage.cuh b/cpp/include/raft/cluster/single_linkage.cuh index 0a79601647..2d74c364b2 100644 --- a/cpp/include/raft/cluster/single_linkage.cuh +++ b/cpp/include/raft/cluster/single_linkage.cuh @@ -21,8 +21,11 @@ namespace raft::cluster { -namespace hierarchy { -constexpr int DEFAULT_CONST_C = 15; +/** + * Note: All of the functions below in the raft::cluster namespace are deprecated + * and will be removed in a future release. Please use raft::cluster::hierarchy + * instead. 
+ */ /** * Single-linkage clustering, capable of constructing a KNN graph to @@ -59,6 +62,11 @@ void single_linkage(const raft::handle_t& handle, detail::single_linkage( handle, X, m, n, metric, out, c, n_clusters); } +}; // namespace raft::cluster + +namespace raft::cluster::hierarchy { + +constexpr int DEFAULT_CONST_C = 15; /** * Single-linkage clustering, capable of constructing a KNN graph to @@ -91,88 +99,14 @@ void single_linkage(const raft::handle_t& handle, out_arrs.children = dendrogram.data_handle(); out_arrs.labels = labels.data_handle(); - single_linkage(handle, - X.data_handle(), - static_cast(X.extent(0)), - static_cast(X.extent(1)), - metric, - &out_arrs, - c.has_value() ? c.value() : DEFAULT_CONST_C, - n_clusters); + raft::cluster::single_linkage( + handle, + X.data_handle(), + static_cast(X.extent(0)), + static_cast(X.extent(1)), + metric, + &out_arrs, + c.has_value() ? c.value() : DEFAULT_CONST_C, + n_clusters); } -} // namespace hierarchy - -/** - * Note: All of the functions below in the raft::cluster namespace are deprecated - * and will be removed in a future release. Please use raft::cluster::hierarchy - * instead. - */ - -/** - * Single-linkage clustering, capable of constructing a KNN graph to - * scale the algorithm beyond the n^2 memory consumption of implementations - * that use the fully-connected graph of pairwise distances by connecting - * a knn graph when k is not large enough to connect it. - - * @tparam value_idx - * @tparam value_t - * @tparam dist_type method to use for constructing connectivities graph - * @param[in] handle raft handle - * @param[in] X dense input matrix in row-major layout - * @param[in] m number of rows in X - * @param[in] n number of columns in X - * @param[in] metric distance metrix to use when constructing connectivities graph - * @param[out] out struct containing output dendrogram and cluster assignments - * @param[in] c a constant used when constructing connectivities from knn graph. 
Allows the indirect - control - * of k. The algorithm will set `k = log(n) + c` - * @param[in] n_clusters number of clusters to assign data samples - */ -template -void single_linkage(const raft::handle_t& handle, - const value_t* X, - size_t m, - size_t n, - raft::distance::DistanceType metric, - linkage_output* out, - int c, - size_t n_clusters) -{ - hierarchy::single_linkage( - handle, X, m, n, metric, out, c, n_clusters); -} - -/** - * Single-linkage clustering, capable of constructing a KNN graph to - * scale the algorithm beyond the n^2 memory consumption of implementations - * that use the fully-connected graph of pairwise distances by connecting - * a knn graph when k is not large enough to connect it. - - * @tparam value_idx - * @tparam value_t - * @tparam dist_type method to use for constructing connectivities graph - * @param[in] handle raft handle - * @param[in] X dense input matrix in row-major layout - * @param[out] dendrogram output dendrogram (size [n_rows - 1] * 2) - * @param[out] labels output labels vector (size n_rows) - * @param[in] metric distance metrix to use when constructing connectivities graph - * @param[in] n_clusters number of clusters to assign data samples - * @param[in] c a constant used when constructing connectivities from knn graph. Allows the indirect - control of k. 
The algorithm will set `k = log(n) + c` - */ -template -void single_linkage(const raft::handle_t& handle, - raft::device_matrix_view X, - raft::device_matrix_view dendrogram, - raft::device_vector_view labels, - raft::distance::DistanceType metric, - size_t n_clusters, - std::optional c = std::make_optional(hierarchy::DEFAULT_CONST_C)) -{ - hierarchy::single_linkage( - handle, X, dendrogram, labels, metric, n_clusters, c); -} - -}; // namespace raft::cluster +}; // namespace raft::cluster::hierarchy diff --git a/cpp/include/raft/cluster/single_linkage_types.hpp b/cpp/include/raft/cluster/single_linkage_types.hpp index 28b245a2cf..55239ff6d6 100644 --- a/cpp/include/raft/cluster/single_linkage_types.hpp +++ b/cpp/include/raft/cluster/single_linkage_types.hpp @@ -18,10 +18,15 @@ #include +namespace raft::cluster::hierarchy { +enum LinkageDistance { PAIRWISE = 0, KNN_GRAPH = 1 }; + +}; // end namespace raft::cluster::hierarchy + +// The code below is legacy namespace raft::cluster { -namespace hierarchy { -enum LinkageDistance { PAIRWISE = 0, KNN_GRAPH = 1 }; +using hierarchy::LinkageDistance; /** * Simple POCO for consolidating linkage results. 
This closely @@ -59,11 +64,4 @@ class linkage_output_int : public linkage_output { class linkage_output_int64 : public linkage_output { }; -} // end namespace hierarchy - -using hierarchy::linkage_output; -using hierarchy::linkage_output_int; -using hierarchy::linkage_output_int64; -using hierarchy::LinkageDistance; - }; // namespace raft::cluster \ No newline at end of file diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 07ec85bf1e..0d5af9be5c 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -82,7 +82,7 @@ if(BUILD_TESTS) PATH test/cluster/kmeans.cu test/cluster_solvers.cu - test/sparse/linkage.cu + test/cluster/linkage.cu OPTIONAL DIST NN ) diff --git a/cpp/test/sparse/linkage.cu b/cpp/test/cluster/linkage.cu similarity index 98% rename from cpp/test/sparse/linkage.cu rename to cpp/test/cluster/linkage.cu index ce5741d06b..5533f552bd 100644 --- a/cpp/test/sparse/linkage.cu +++ b/cpp/test/cluster/linkage.cu @@ -180,20 +180,21 @@ class LinkageTest : public ::testing::TestWithParam> { raft::handle_t handle; - auto data_view = - raft::make_device_matrix_view(data.data(), params.n_row, params.n_col); + auto data_view = raft::make_device_matrix_view( + data.data(), params.n_row, params.n_col); auto dendrogram_view = raft::make_device_matrix_view(out_children.data(), params.n_row, 2); auto labels_view = raft::make_device_vector_view(labels.data(), params.n_row); - raft::cluster::single_linkage( - handle, - data_view, - dendrogram_view, - labels_view, - raft::distance::DistanceType::L2SqrtExpanded, - params.n_clusters, - std::make_optional(params.c)); + raft::cluster::hierarchy:: + single_linkage( + handle, + data_view, + dendrogram_view, + labels_view, + raft::distance::DistanceType::L2SqrtExpanded, + params.n_clusters, + std::make_optional(params.c)); handle.sync_stream(stream); From 6ccf61fbac6ae3d6eb6a098b9eb1e7ea4c77d27e Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Thu, 20 Oct 2022 22:03:46 -0400 Subject: [PATCH 08/14] Removing namespace conflict --- cpp/include/raft/cluster/kmeans.cuh | 4 +- cpp/include/raft/cluster/kmeans_types.hpp | 8 ++-- cpp/test/cluster_solvers_deprecated.cu | 48 ----------------------- 3 files changed, 6 insertions(+), 54 deletions(-) diff --git a/cpp/include/raft/cluster/kmeans.cuh b/cpp/include/raft/cluster/kmeans.cuh index cfd47d4058..2025a15ecf 100644 --- a/cpp/include/raft/cluster/kmeans.cuh +++ b/cpp/include/raft/cluster/kmeans.cuh @@ -489,7 +489,7 @@ void fit_main(const raft::handle_t& handle, handle, params, X, weight, centroidsRawData, inertia, n_iter, workspace); } -} // end namespace raft::cluster::kmeans +}; // end namespace raft::cluster::kmeans namespace raft::cluster { @@ -962,4 +962,4 @@ void kmeans_fit_main(const raft::handle_t& handle, kmeans::fit_main( handle, params, X, weight, centroidsRawData, inertia, n_iter, workspace); } -} // namespace raft::cluster +}; // namespace raft::cluster diff --git a/cpp/include/raft/cluster/kmeans_types.hpp b/cpp/include/raft/cluster/kmeans_types.hpp index bb8e1a2b73..d6eadd1ba6 100644 --- a/cpp/include/raft/cluster/kmeans_types.hpp +++ b/cpp/include/raft/cluster/kmeans_types.hpp @@ -18,9 +18,7 @@ #include #include -namespace raft::cluster { - -namespace kmeans { +namespace raft::cluster::kmeans { struct KMeansParams { enum InitMethod { KMeansPlusPlus, Random, Array }; @@ -71,7 +69,9 @@ struct KMeansParams { bool inertia_check = false; }; -} // namespace kmeans +} // namespace raft::cluster::kmeans + +namespace raft::cluster { using kmeans::KMeansParams; diff --git a/cpp/test/cluster_solvers_deprecated.cu b/cpp/test/cluster_solvers_deprecated.cu index 1e9ec0c15b..167a710b34 100644 --- a/cpp/test/cluster_solvers_deprecated.cu +++ b/cpp/test/cluster_solvers_deprecated.cu @@ -20,7 +20,6 @@ #include #include -#include namespace raft { namespace spectral { @@ -54,52 +53,5 @@ TEST(Raft, ClusterSolvers) EXPECT_ANY_THROW(cluster_solver.solve(h, 
n, d, eigvecs, codes)); } -TEST(Raft, ModularitySolvers) -{ - using namespace matrix; - using index_type = int; - using value_type = double; - - handle_t h; - ASSERT_EQ(0, - h. - - get_device() - - ); - - index_type neigvs{10}; - index_type maxiter{100}; - index_type restart_iter{10}; - value_type tol{1.0e-10}; - bool reorthog{true}; - - // nullptr expected to trigger exceptions: - // - index_type* clusters{nullptr}; - value_type* eigvals{nullptr}; - value_type* eigvecs{nullptr}; - - unsigned long long seed{100110021003}; - - eigen_solver_config_t eig_cfg{ - neigvs, maxiter, restart_iter, tol, reorthog, seed}; - lanczos_solver_t eig_solver{eig_cfg}; - - index_type k{5}; - - cluster_solver_config_deprecated_t clust_cfg{k, maxiter, tol, seed}; - kmeans_solver_deprecated_t cluster_solver{clust_cfg}; - - auto stream = h.get_stream(); - sparse_matrix_t sm{h, nullptr, nullptr, nullptr, 0, 0}; - - EXPECT_ANY_THROW(spectral::modularity_maximization( - h, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs)); - - value_type modularity{0}; - EXPECT_ANY_THROW(spectral::analyzeModularity(h, sm, k, clusters, modularity)); -} - } // namespace spectral } // namespace raft From a36f26e8e378856946578258e7b7a40e56628e97 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Fri, 21 Oct 2022 18:41:31 -0400 Subject: [PATCH 09/14] Many many updates to the docs, including a quick-start --- build.sh | 44 ++++- cpp/doxygen/Doxyfile.in | 4 +- cpp/include/raft/cluster/kmeans.cuh | 161 ++++++++++-------- cpp/include/raft/cluster/kmeans_params.hpp | 11 +- cpp/include/raft/cluster/kmeans_types.hpp | 66 +++++-- .../raft/cluster/single_linkage_types.hpp | 24 ++- cpp/include/raft/neighbors/ball_cover.cuh | 60 +++++++ cpp/include/raft/neighbors/brute_force.cuh | 91 +++++++--- .../raft/neighbors/epsilon_neighborhood.cuh | 17 +- cpp/include/raft/neighbors/ivf_flat.cuh | 102 ++++++----- cpp/include/raft/neighbors/ivf_pq.cuh | 2 +- cpp/include/raft/solver/linear_assignment.cuh | 49 +++++- cpp/include/raft/sparse/solver/mst.cuh | 23 +++ cpp/include/raft/spatial/knn/ivf_flat.cuh | 1 - .../raft/stats/adjusted_rand_index.cuh | 8 +- cpp/include/raft/stats/common.hpp | 59 +------ cpp/include/raft/stats/detail/histogram.cuh | 6 + cpp/include/raft/stats/histogram.cuh | 8 + cpp/include/raft/stats/stats_types.hpp | 62 +++++++ cpp/test/neighbors/ann_ivf_flat.cu | 2 +- cpp/test/neighbors/knn.cu | 3 +- docs/source/cpp_api.rst | 7 +- docs/source/cpp_api/cluster.rst | 3 +- docs/source/cpp_api/core.rst | 75 ++++++-- docs/source/cpp_api/distance.rst | 3 +- docs/source/cpp_api/linalg.rst | 5 +- docs/source/cpp_api/matrix.rst | 3 +- docs/source/cpp_api/solver.rst | 6 +- docs/source/cpp_api/sparse.rst | 15 +- docs/source/index.rst | 37 +++- docs/source/quick_start.md | 129 ++++++++++++++ 31 files changed, 811 insertions(+), 275 deletions(-) create mode 100644 cpp/include/raft/stats/stats_types.hpp create mode 100644 docs/source/quick_start.md diff --git a/build.sh b/build.sh index 9548fbec44..61e6d1a007 100755 --- a/build.sh +++ b/build.sh @@ -227,18 +227,50 @@ fi if hasArg tests || (( ${NUMARGS} == 0 )); then BUILD_TESTS=ON - COMPILE_DIST_LIBRARY=ON - ENABLE_NN_DEPENDENCIES=ON - COMPILE_NN_LIBRARY=ON CMAKE_TARGET="${CMAKE_TARGET};${TEST_TARGETS}" + + # Force 
compile nn library when needed test targets are specified + if [[ $CMAKE_TARGET == *"CLUSTER_TEST"* || \ + $CMAKE_TARGET == *"SPARSE_DIST_TEST"* || \ + $CMAKE_TARGET == *"SPARSE_NEIGHBORS_TEST"* || \ + $CMAKE_TARGET == *"NEIGHBORS_TEST"* || \ + $CMAKE_TARGET == *"STATS_TEST"* ]]; then + echo "-- Enabling nearest neighbors lib for gtests" + ENABLE_NN_DEPENDENCIES=ON + COMPILE_NN_LIBRARY=ON + fi + + # Force compile distance library when needed test targets are specified + if [[ $CMAKE_TARGET == *"CLUSTER_TEST"* || \ + $CMAKE_TARGET == *"DISTANCE_TEST"* || \ + $CMAKE_TARGET == *"SPARSE_DIST_TEST" || \ + $CMAKE_TARGET == *"SPARSE_NEIGHBORS_TEST"* || \ + $CMAKE_TARGET == *"NEIGHBORS_TEST" || \ + $CMAKE_TARGET == *"STATS_TEST"* ]]; then + echo "-- Enabling distance lib for gtests" + COMPILE_DIST_LIBRARY=ON + fi fi if hasArg bench || (( ${NUMARGS} == 0 )); then BUILD_BENCH=ON - COMPILE_DIST_LIBRARY=ON - ENABLE_NN_DEPENDENCIES=ON - COMPILE_NN_LIBRARY=ON CMAKE_TARGET="${CMAKE_TARGET};${BENCH_TARGETS}" + + # Force compile nn library when needed benchmark targets are specified + if [[ $CMAKE_TARGET == *"CLUSTER_BENCH"* || \ + $CMAKE_TARGET == *"NEIGHBORS_BENCH"* ]]; then + echo "-- Enabling nearest neighbors lib for benchmarks" + ENABLE_NN_DEPENDENCIES=ON + COMPILE_NN_LIBRARY=ON + fi + + # Force compile distance library when needed benchmark targets are specified + if [[ $CMAKE_TARGET == *"CLUSTER_BENCH"* || \ + $CMAKE_TARGET == *"NEIGHBORS_BENCH"* ]]; then + echo "-- Enabling distance lib for benchmarks" + COMPILE_DIST_LIBRARY=ON + fi + fi if hasArg --buildfaiss; then diff --git a/cpp/doxygen/Doxyfile.in b/cpp/doxygen/Doxyfile.in index 5517562a9f..07056e503d 100644 --- a/cpp/doxygen/Doxyfile.in +++ b/cpp/doxygen/Doxyfile.in @@ -900,7 +900,9 @@ EXCLUDE = @CMAKE_CURRENT_SOURCE_DIR@/include/raft/sparse/linalg/s @CMAKE_CURRENT_SOURCE_DIR@/include/raft/span.hpp \ @CMAKE_CURRENT_SOURCE_DIR@/include/raft/vectorized.cuh \ @CMAKE_CURRENT_SOURCE_DIR@/include/raft/raft.hpp \ - 
@CMAKE_CURRENT_SOURCE_DIR@/include/raft/core/cudart_utils.hpp + @CMAKE_CURRENT_SOURCE_DIR@/include/raft/core/cudart_utils.hpp \ + @CMAKE_CURRENT_SOURCE_DIR@/include/raft/matrix/math.cuh \ + @CMAKE_CURRENT_SOURCE_DIR@/include/raft/matrix/matrix.cuh # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded diff --git a/cpp/include/raft/cluster/kmeans.cuh b/cpp/include/raft/cluster/kmeans.cuh index 2025a15ecf..a85fd1b38b 100644 --- a/cpp/include/raft/cluster/kmeans.cuh +++ b/cpp/include/raft/cluster/kmeans.cuh @@ -28,6 +28,27 @@ namespace raft::cluster::kmeans { * Initial centroids are chosen with k-means++ algorithm. Empty * clusters are reinitialized by choosing new centroids with * k-means++ algorithm. + * + * @code{.cpp} + * #include + * #include + * #include + * using namespace raft::cluster; + * ... + * raft::handle_t handle; + * raft::cluster::KMeansParams params; + * int n_features = 15, inertia, n_iter; + * auto centroids = raft::make_device_matrix(handle, params.n_clusters, n_features); + * + * kmeans::fit(handle, + * params, + * X, + * std::nullopt, + * centroids, + * raft::make_scalar_view(&inertia), + * raft::make_scalar_view(&n_iter)); + * @endcode + * * @tparam DataT the type of data used for weights, distances. * @tparam IndexT the type of data used for indexing. * @param[in] handle The raft handle. @@ -47,7 +68,7 @@ namespace raft::cluster::kmeans { * closest cluster center. * @param[out] n_iter Number of iterations run. 
*/ -template +template void fit(handle_t const& handle, const KMeansParams& params, raft::device_matrix_view X, @@ -59,23 +80,40 @@ void fit(handle_t const& handle, detail::kmeans_fit(handle, params, X, sample_weight, centroids, inertia, n_iter); } -template -void fit(handle_t const& handle, - const KMeansParams& params, - const DataT* X, - const DataT* sample_weight, - DataT* centroids, - IndexT n_samples, - IndexT n_features, - DataT& inertia, - IndexT& n_iter) -{ - detail::kmeans_fit( - handle, params, X, sample_weight, centroids, n_samples, n_features, inertia, n_iter); -} - /** * @brief Predict the closest cluster each sample in X belongs to. + * + * @code{.cpp} + * #include + * #include + * #include + * using namespace raft::cluster; + * ... + * raft::handle_t handle; + * raft::cluster::KMeansParams params; + * int n_features = 15, inertia, n_iter; + * auto centroids = raft::make_device_matrix(handle, params.n_clusters, n_features); + * + * kmeans::fit(handle, + * params, + * X, + * std::nullopt, + * centroids.view(), + * raft::make_scalar_view(&inertia), + * raft::make_scalar_view(&n_iter)); + * ... + * auto labels = raft::make_device_vector(handle, X.extent(0)); + * + * kmeans::predict(handle, + * params, + * X, + * std::nullopt, + * centroids.view(), + * labels.view(), + * false, + * raft::make_scalar_view(&inertia)); + * @endcode + * * @tparam DataT the type of data used for weights, distances. * @tparam IndexT the type of data used for indexing. * @param[in] handle The raft handle. @@ -94,7 +132,7 @@ void fit(handle_t const& handle, * @param[out] inertia Sum of squared distances of samples to * their closest cluster center.
*/ -template +template void predict(handle_t const& handle, const KMeansParams& params, raft::device_matrix_view X, @@ -108,34 +146,32 @@ void predict(handle_t const& handle, handle, params, X, sample_weight, centroids, labels, normalize_weight, inertia); } -template -void predict(handle_t const& handle, - const KMeansParams& params, - const DataT* X, - const DataT* sample_weight, - const DataT* centroids, - IndexT n_samples, - IndexT n_features, - IndexT* labels, - bool normalize_weight, - DataT& inertia) -{ - detail::kmeans_predict(handle, - params, - X, - sample_weight, - centroids, - n_samples, - n_features, - labels, - normalize_weight, - inertia); -} - /** * @brief Compute k-means clustering and predicts cluster index for each sample * in the input. * + * @code{.cpp} + * #include + * #include + * #include + * using namespace raft::cluster; + * ... + * raft::handle_t handle; + * raft::cluster::KMeansParams params; + * int n_features = 15, inertia, n_iter; + * auto centroids = raft::make_device_matrix(handle, params.n_clusters, n_features); + * auto labels = raft::make_device_vector(handle, X.extent(0)); + * + * kmeans::fit_predict(handle, + * params, + * X, + * std::nullopt, + * centroids.view(), + * labels.view(), + * raft::make_scalar_view(&inertia), + * raft::make_scalar_view(&n_iter)); + * @endcode + * * @tparam DataT the type of data used for weights, distances. * @tparam IndexT the type of data used for indexing. * @param[in] handle The raft handle. @@ -159,7 +195,7 @@ void predict(handle_t const& handle, * closest cluster center. * @param[out] n_iter Number of iterations run. 
*/ -template +template void fit_predict(handle_t const& handle, const KMeansParams& params, raft::device_matrix_view X, @@ -173,22 +209,6 @@ void fit_predict(handle_t const& handle, handle, params, X, sample_weight, centroids, labels, inertia, n_iter); } -template -void fit_predict(handle_t const& handle, - const KMeansParams& params, - const DataT* X, - const DataT* sample_weight, - DataT* centroids, - IndexT n_samples, - IndexT n_features, - IndexT* labels, - DataT& inertia, - IndexT& n_iter) -{ - detail::kmeans_fit_predict( - handle, params, X, sample_weight, centroids, n_samples, n_features, labels, inertia, n_iter); -} - /** * @brief Transform X to a cluster-distance space. * @@ -204,7 +224,7 @@ void fit_predict(handle_t const& handle, * @param[out] X_new X transformed in the new space. * [dim = n_samples x n_features] */ -template +template void transform(const raft::handle_t& handle, const KMeansParams& params, raft::device_matrix_view X, @@ -214,7 +234,7 @@ void transform(const raft::handle_t& handle, detail::kmeans_transform(handle, params, X, centroids, X_new); } -template +template void transform(const raft::handle_t& handle, const KMeansParams& params, const DataT* X, @@ -227,7 +247,7 @@ void transform(const raft::handle_t& handle, handle, params, X, centroids, n_samples, n_features, X_new); } -template +template using SamplingOp = detail::SamplingOp; template @@ -252,7 +272,7 @@ using KeyValueIndexOp = detail::KeyValueIndexOp; * @param[in] workspace Temporary workspace buffer which can get resized * */ -template +template void sample_centroids(const raft::handle_t& handle, raft::device_matrix_view X, raft::device_vector_view minClusterDistance, @@ -279,7 +299,7 @@ void sample_centroids(const raft::handle_t& handle, * @param[in] reduction_op The reduction operation used for the cost * */ -template +template void cluster_cost(const raft::handle_t& handle, raft::device_vector_view minClusterDistance, rmm::device_uvector workspace, @@ -424,11 +444,10 @@ 
void count_samples_in_cluster(const raft::handle_t& handle, handle, params, X, L2NormX, centroids, workspace, sampleCountInCluster); } -/* +/** * @brief Selects 'n_clusters' samples from the input X using kmeans++ algorithm. - - * @note This is the algorithm described in - * "k-means++: the advantages of careful seeding". 2007, Arthur, D. and Vassilvitskii, S. + * + * @see "k-means++: the advantages of careful seeding". 2007, Arthur, D. and Vassilvitskii, S. * ACM-SIAM symposium on Discrete algorithms. * * @tparam DataT the type of data used for weights, distances. @@ -446,10 +465,10 @@ template void init_plus_plus(const raft::handle_t& handle, const KMeansParams& params, raft::device_matrix_view X, - raft::device_matrix_view centroidsRawData, + raft::device_matrix_view centroids, rmm::device_uvector& workspace) { - detail::kmeansPlusPlus(handle, params, X, centroidsRawData, workspace); + detail::kmeansPlusPlus(handle, params, X, centroids, workspace); } /* @@ -480,13 +499,13 @@ void fit_main(const raft::handle_t& handle, const KMeansParams& params, raft::device_matrix_view X, raft::device_vector_view weight, - raft::device_matrix_view centroidsRawData, + raft::device_matrix_view centroids, raft::host_scalar_view inertia, raft::host_scalar_view n_iter, rmm::device_uvector& workspace) { detail::kmeans_fit_main( - handle, params, X, weight, centroidsRawData, inertia, n_iter, workspace); + handle, params, X, weight, centroids, inertia, n_iter, workspace); } }; // end namespace raft::cluster::kmeans diff --git a/cpp/include/raft/cluster/kmeans_params.hpp b/cpp/include/raft/cluster/kmeans_params.hpp index 433e32f5ff..a1532d9dd4 100644 --- a/cpp/include/raft/cluster/kmeans_params.hpp +++ b/cpp/include/raft/cluster/kmeans_params.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,15 +13,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -/** - * This file is deprecated and will be removed in release 22.06. - * Please use the cuh version instead. - */ - -/** - * DISCLAIMER: this file is deprecated: use lap.cuh instead - */ - #pragma once #pragma message(__FILE__ \ diff --git a/cpp/include/raft/cluster/kmeans_types.hpp b/cpp/include/raft/cluster/kmeans_types.hpp index d6eadd1ba6..f411b12b5c 100644 --- a/cpp/include/raft/cluster/kmeans_types.hpp +++ b/cpp/include/raft/cluster/kmeans_types.hpp @@ -20,14 +20,34 @@ namespace raft::cluster::kmeans { +/** + * Simple object to specify hyper-parameters to the kmeans algorithm. + */ struct KMeansParams { - enum InitMethod { KMeansPlusPlus, Random, Array }; - - // The number of clusters to form as well as the number of centroids to - // generate (default:8). + enum InitMethod { + + /** + * Sample the centroids using the kmeans++ strategy + */ + KMeansPlusPlus, + + /** + * Sample the centroids uniformly at random + */ + Random, + + /** + * User provides the array of initial centroids + */ + Array + }; + + /** + * The number of clusters to form as well as the number of centroids to generate (default:8). + */ int n_clusters = 8; - /* + /** * Method for initialization, defaults to k-means++: * - InitMethod::KMeansPlusPlus (k-means++): Use scalable k-means++ algorithm * to select the initial cluster centers. @@ -37,34 +57,52 @@ struct KMeansParams { */ InitMethod init = KMeansPlusPlus; - // Maximum number of iterations of the k-means algorithm for a single run. + /** + * Maximum number of iterations of the k-means algorithm for a single run. + */ int max_iter = 300; - // Relative tolerance with regards to inertia to declare convergence. 
+ /** + * Relative tolerance with regards to inertia to declare convergence. + */ double tol = 1e-4; - // verbosity level. + /** + * verbosity level. + */ int verbosity = RAFT_LEVEL_INFO; - // Seed to the random number generator. + /** + * Seed to the random number generator. + */ raft::random::RngState rng_state = raft::random::RngState(0, raft::random::GeneratorType::GenPhilox); - // Metric to use for distance computation. + /** + * Metric to use for distance computation. + */ raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded; - // Number of instance k-means algorithm will be run with different seeds. + /** + * Number of instance k-means algorithm will be run with different seeds. + */ int n_init = 1; - // Oversampling factor for use in the k-means|| algorithm. + /** + * Oversampling factor for use in the k-means|| algorithm + */ double oversampling_factor = 2.0; // batch_samples and batch_centroids are used to tile 1NN computation which is // useful to optimize/control the memory footprint // Default tile is [batch_samples x n_clusters] i.e. when batch_centroids is 0 // then don't tile the centroids - int batch_samples = 1 << 15; - int batch_centroids = 0; // if 0 then batch_centroids = n_clusters + int batch_samples = 1 << 15; + + /** + * if 0 then batch_centroids = n_clusters + */ + int batch_centroids = 0; // bool inertia_check = false; }; diff --git a/cpp/include/raft/cluster/single_linkage_types.hpp b/cpp/include/raft/cluster/single_linkage_types.hpp index 55239ff6d6..c8d6e4c8a6 100644 --- a/cpp/include/raft/cluster/single_linkage_types.hpp +++ b/cpp/include/raft/cluster/single_linkage_types.hpp @@ -19,17 +19,35 @@ #include namespace raft::cluster::hierarchy { -enum LinkageDistance { PAIRWISE = 0, KNN_GRAPH = 1 }; + +/** + * Determines the method for computing the minimum spanning tree (MST) + */ +enum LinkageDistance { + + /** + * Use a pairwise distance matrix as input to the mst. 
This + * is very fast and the best option for fairly small datasets (~50k data points) + */ + PAIRWISE = 0, + + /** + * Construct a KNN graph as input to the mst and provide additional + * edges if the mst does not converge. This is slower but scales + * to very large datasets. + */ + KNN_GRAPH = 1 +}; }; // end namespace raft::cluster::hierarchy -// The code below is legacy +// The code below is now considered legacy namespace raft::cluster { using hierarchy::LinkageDistance; /** - * Simple POCO for consolidating linkage results. This closely + * Simple container object for consolidating linkage results. This closely * mirrors the trained instance variables populated in * Scikit-learn's AgglomerativeClustering estimator. * @tparam value_idx diff --git a/cpp/include/raft/neighbors/ball_cover.cuh b/cpp/include/raft/neighbors/ball_cover.cuh index 780a9cfce2..28ff8491b6 100644 --- a/cpp/include/raft/neighbors/ball_cover.cuh +++ b/cpp/include/raft/neighbors/ball_cover.cuh @@ -30,6 +30,23 @@ namespace raft::neighbors::ball_cover { /** * Builds and populates a previously unbuilt BallCoverIndex + * + * Usage example: + * @code{.cpp} + * + * #include + * #include + * #include + * using namespace raft::neighbors; + * + * raft::handle_t handle; + * ... + * auto metric = raft::distance::DistanceType::L2Expanded; + * BallCoverIndex index(handle, X, metric); + * + * ball_cover::build_index(handle, index); + * @endcode + * * @tparam idx_t knn index type * @tparam value_t knn value type * @tparam int_t integral type for knn params @@ -130,10 +147,31 @@ void all_knn_query(const raft::handle_t& handle, * the index and query are the same array. This function will * build the index and assumes rbc_build_index() has not already * been called. + * + * Usage example: + * @code{.cpp} + * + * #include + * #include + * #include + * using namespace raft::neighbors; + * + * raft::handle_t handle; + * ... 
+ * auto metric = raft::distance::DistanceType::L2Expanded; + * + * // Construct a ball cover index + * BallCoverIndex index(handle, X, metric); + * + * // Perform all neighbors knn query + * ball_cover::all_knn_query(handle, index, inds, dists, k); + * @endcode + * * @tparam idx_t knn index type * @tparam value_t knn distance type * @tparam int_t type for integers, such as number of rows/cols * @tparam matrix_idx_t matrix indexing type + * * @param[in] handle raft handle for resource management * @param[in] index ball cover index which has not yet been built * @param[out] inds output knn indices @@ -250,6 +288,28 @@ void knn_query(const raft::handle_t& handle, * function does not build the index and assumes rbc_build_index() has * already been called. Use this function when the index and * query arrays are different, otherwise use rbc_all_knn_query(). + * + * Usage example: + * @code{.cpp} + * + * #include + * #include + * #include + * using namespace raft::neighbors; + * + * raft::handle_t handle; + * ... 
+ * auto metric = raft::distance::DistanceType::L2Expanded; + * + * // Build a ball cover index + * BallCoverIndex index(handle, X, metric); + * ball_cover::build_index(handle, index); + * + * // Perform all neighbors knn query + * ball_cover::knn_query(handle, index, inds, dists, k); + * @endcode + + * * @tparam idx_t index type * @tparam value_t distances type * @tparam int_t integer type for size info diff --git a/cpp/include/raft/neighbors/brute_force.cuh b/cpp/include/raft/neighbors/brute_force.cuh index 3641a38991..772ccb67d2 100644 --- a/cpp/include/raft/neighbors/brute_force.cuh +++ b/cpp/include/raft/neighbors/brute_force.cuh @@ -23,26 +23,52 @@ namespace raft::neighbors::brute_force { /** - * @brief Performs a k-select across row partitioned index/distance + * @brief Performs a k-select across several (contiguous) row-partitioned index/distance * matrices formatted like the following: - * row1: k0, k1, k2 - * row2: k0, k1, k2 - * row3: k0, k1, k2 - * row1: k0, k1, k2 - * row2: k0, k1, k2 - * row3: k0, k1, k2 * + * part1row1: k0, k1, k2, k3 + * part1row2: k0, k1, k2, k3 + * part1row3: k0, k1, k2, k3 + * part2row1: k0, k1, k2, k3 + * part2row2: k0, k1, k2, k3 + * part2row3: k0, k1, k2, k3 * etc... * + * The example above shows what an aggregated index/distance matrix + * would look like with two partitions when n_samples=3 and k=4. + * + * When working with extremely large data sets that have been broken + * over multiple indexes, such as when computing over multiple GPUs, + * the ids will often start at 0 for each local knn index but the + * global ids need to be used when merging them together. An optional + * translations vector can be supplied to map the starting id of + * each partition to its global id so that the final merged knn + * is based on the global ids. + * + * Usage example: + * @code{.cpp} + * #include + * #include + * using namespace raft::neighbors; + * + * raft::handle_t handle; + * ... 
+ * compute multiple knn graphs and aggregate row-wise + * (see detailed description above) + * ... + * brute_force::knn_merge_parts(handle, in_keys, in_values, out_keys, out_values, n_samples); + * @endcode + * * @tparam idx_t * @tparam value_t + * * @param[in] handle * @param[in] in_keys matrix of input keys (size n_samples * n_parts * k) * @param[in] in_values matrix of input values (size n_samples * n_parts * k) * @param[out] out_keys matrix of output keys (size n_samples * k) * @param[out] out_values matrix of output values (size n_samples * k) - * @param[in] n_samples number of rows in each part - * @param[in] translations optional vector of starting index mappings for each partition + * @param[in] n_samples number of rows in each partition + * @param[in] translations optional vector of starting global id mappings for each local partition */ template inline void knn_merge_parts( @@ -81,17 +107,31 @@ inline void knn_merge_parts( * row- or column-major but the output matrices will always be in * row-major format. * - * @param[in] handle the cuml handle to use - * @param[in] index vector of device matrices (each size m_i*d) to be used as the knn index - * @param[in] search matrix (size n*d) to be used for searching the index - * @param[out] indices matrix (size n*k) to store output knn indices - * @param[out] distances matrix (size n*k) to store the output knn distance - * @param[in] k the number of nearest neighbors to return - * @param[in] metric distance metric to use. Euclidean (L2) is used by default - * @param[in] metric_arg the value of `p` for Minkowski (l-p) distances. This + * Usage example: + * @code{.cpp} + * #include + * #include + * #include + * using namespace raft::neighbors; + * + * raft::handle_t handle; + * ... 
+ * int k = 10; + * auto metric = raft::distance::DistanceType::L2SqrtExpanded; + * brute_force::knn(handle, index, search, indices, distances, k, metric); + * @endcode + * + * @param[in] handle: the cuml handle to use + * @param[in] index: vector of device matrices (each size m_i*d) to be used as the knn index + * @param[in] search: matrix (size n*d) to be used for searching the index + * @param[out] indices: matrix (size n*k) to store output knn indices + * @param[out] distances: matrix (size n*k) to store the output knn distance + * @param[in] k: the number of nearest neighbors to return + * @param[in] metric: distance metric to use. Euclidean (L2) is used by default + * @param[in] metric_arg: the value of `p` for Minkowski (l-p) distances. This * is ignored if the metric_type is not Minkowski. - * @param[in] translations starting offsets for partitions. should be the same size - * as input vector. + * @param[in] global_id_offset: optional starting global id mapping for the local partition + * (assumes the index contains contiguous ids in the global id space) */ template indices, raft::device_matrix_view distances, value_int k, - distance::DistanceType metric = distance::DistanceType::L2Unexpanded, - std::optional metric_arg = std::make_optional(2.0f), - std::optional> translations = std::nullopt) + distance::DistanceType metric = distance::DistanceType::L2Unexpanded, + std::optional metric_arg = std::make_optional(2.0f), + std::optional global_id_offset = std::nullopt) { RAFT_EXPECTS(index[0].extent(1) == search.extent(1), "Number of dimensions for both index and search matrices must be equal"); @@ -129,7 +169,10 @@ void knn(raft::handle_t const& handle, sizes.push_back(index[i].extent(0)); } - std::vector* trans = translations.has_value() ? &(*translations) : nullptr; + std::vector trans; + if (global_id_offset.has_value()) { trans.push_back(global_id_offset.value()); } + + std::vector* trans_arg = global_id_offset.has_value() ? 
&trans : nullptr; raft::spatial::knn::detail::brute_force_knn_impl(handle, inputs, @@ -143,7 +186,7 @@ void knn(raft::handle_t const& handle, k, rowMajorIndex, rowMajorQuery, - trans, + trans_arg, metric, metric_arg.value_or(2.0f)); } diff --git a/cpp/include/raft/neighbors/epsilon_neighborhood.cuh b/cpp/include/raft/neighbors/epsilon_neighborhood.cuh index b0e9b842ec..114216fc50 100644 --- a/cpp/include/raft/neighbors/epsilon_neighborhood.cuh +++ b/cpp/include/raft/neighbors/epsilon_neighborhood.cuh @@ -60,7 +60,22 @@ void epsUnexpL2SqNeighborhood(bool* adj, } /** - * @brief Computes epsilon neighborhood for the L2-Squared distance metric + * @brief Computes epsilon neighborhood for the L2-Squared distance metric and given ball size. + * The epsilon neighborhood is represented by a dense boolean adjacency matrix of size m * n and + * an array of degrees for each vertex, which can be used as a compressed sparse row (CSR) + * indptr array. + * + * @code{.cpp} + * #include + * #include + * #include + * using namespace raft::neighbors; + * raft::handle_t handle; + * ...
+ * auto adj = raft::make_device_matrix(handle, m * n); + * auto vd = raft::make_device_vector(handle, m+1); + * epsilon_neighborhood::eps_neighbors_l2sq(handle, x, y, adj.view(), vd.view(), eps); + * @endcode * * @tparam value_t IO and math type * @tparam idx_t Index type diff --git a/cpp/include/raft/neighbors/ivf_flat.cuh b/cpp/include/raft/neighbors/ivf_flat.cuh index 23ae6c42bf..87400a9b93 100644 --- a/cpp/include/raft/neighbors/ivf_flat.cuh +++ b/cpp/include/raft/neighbors/ivf_flat.cuh @@ -38,7 +38,7 @@ namespace raft::neighbors::ivf_flat { * * Usage example: * @code{.cpp} - * using namespace raft::spatial::knn; + * using namespace raft::neighbors; * // use default index parameters * ivf_flat::index_params index_params; * // create and fill the index from a [N, D] dataset @@ -61,7 +61,7 @@ namespace raft::neighbors::ivf_flat { * @return the constructed ivf-flat index */ template -inline auto build( +auto build( const handle_t& handle, const index_params& params, const T* dataset, IdxT n_rows, uint32_t dim) -> index { @@ -78,15 +78,15 @@ inline auto build( * * Usage example: * @code{.cpp} - * using namespace raft::spatial::knn; + * using namespace raft::neighbors; * // use default index parameters * ivf_flat::index_params index_params; * // create and fill the index from a [N, D] dataset - * auto index = ivf_flat::build(handle, index_params, dataset, N, D); + * auto index = ivf_flat::build(handle, dataset, index_params); * // use default search parameters * ivf_flat::search_params search_params; * // search K nearest neighbours for each of the N queries - * ivf_flat::search(handle, search_params, index, queries, N, K, out_inds, out_dists); + * ivf_flat::search(handle, index, queries, out_inds, out_dists, search_params, k); * @endcode * * @tparam value_t data element type @@ -101,9 +101,9 @@ inline auto build( * @return the constructed ivf-flat index */ template -auto build_index(const handle_t& handle, - raft::device_matrix_view dataset, - const index_params& 
params) -> index +auto build(const handle_t& handle, + raft::device_matrix_view dataset, + const index_params& params) -> index { return raft::spatial::knn::ivf_flat::detail::build(handle, params, @@ -145,11 +145,11 @@ auto build_index(const handle_t& handle, * @return the constructed extended ivf-flat index */ template -inline auto extend(const handle_t& handle, - const index& orig_index, - const T* new_vectors, - const IdxT* new_indices, - IdxT n_rows) -> index +auto extend(const handle_t& handle, + const index& orig_index, + const T* new_vectors, + const IdxT* new_indices, + IdxT n_rows) -> index { return raft::spatial::knn::ivf_flat::detail::extend( handle, orig_index, new_vectors, new_indices, n_rows); @@ -169,9 +169,9 @@ inline auto extend(const handle_t& handle, * index_params.add_data_on_build = false; // don't populate index on build * index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training * // train the index from a [N, D] dataset - * auto index_empty = ivf_flat::build(handle, index_params, dataset, N, D); + * auto index_empty = ivf_flat::build(handle, dataset, index_params, dataset); * // fill the index with the data - * auto index = ivf_flat::extend(handle, index_empty, dataset, nullptr, N); + * auto index = ivf_flat::extend(handle, index_empty, dataset); * @endcode * * @tparam value_t data element type @@ -204,8 +204,20 @@ auto extend(const handle_t& handle, } /** - * @brief Extend the index with the new data. - * * + * @brief Extend the index in-place with the new data. 
+ * + * Usage example: + * @code{.cpp} + * using namespace raft::spatial::knn; + * ivf_flat::index_params index_params; + * index_params.add_data_on_build = false; // don't populate index on build + * index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training + * // train the index from a [N, D] dataset + * auto index_empty = ivf_flat::build(handle, index_params, dataset, N, D); + * // fill the index with the data + * ivf_flat::extend(handle, index_empty, dataset, nullptr, N); + * @endcode + * * @tparam T data element type * @tparam IdxT type of the indices in the source dataset * @@ -218,18 +230,30 @@ auto extend(const handle_t& handle, * @param[in] n_rows the number of samples */ template -inline void extend(const handle_t& handle, - index* index, - const T* new_vectors, - const IdxT* new_indices, - IdxT n_rows) +void extend(const handle_t& handle, + index* index, + const T* new_vectors, + const IdxT* new_indices, + IdxT n_rows) { *index = extend(handle, *index, new_vectors, new_indices, n_rows); } /** - * @brief Extend the index with the new data. - * * + * @brief Extend the index in-place with the new data. + * + * Usage example: + * @code{.cpp} + * using namespace raft::spatial::knn; + * ivf_flat::index_params index_params; + * index_params.add_data_on_build = false; // don't populate index on build + * index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training + * // train the index from a [N, D] dataset + * auto index_empty = ivf_flat::build(handle, dataset, index_params, dataset); + * // fill the index with the data + * ivf_flat::extend(handle, index_empty, dataset); + * @endcode + * * @tparam value_t data element type * @tparam idx_t type of the indices in the source dataset * @tparam int_t precision / type of integral arguments @@ -298,15 +322,15 @@ void extend(const handle_t& handle, * enough memory pool here to avoid memory allocations within search). 
*/ template -inline void search(const handle_t& handle, - const search_params& params, - const index& index, - const T* queries, - uint32_t n_queries, - uint32_t k, - IdxT* neighbors, - float* distances, - rmm::mr::device_memory_resource* mr = nullptr) +void search(const handle_t& handle, + const search_params& params, + const index& index, + const T* queries, + uint32_t n_queries, + uint32_t k, + IdxT* neighbors, + float* distances, + rmm::mr::device_memory_resource* mr = nullptr) { return raft::spatial::knn::ivf_flat::detail::search( handle, params, index, queries, n_queries, k, neighbors, distances, mr); @@ -323,21 +347,15 @@ inline void search(const handle_t& handle, * eliminate entirely allocations happening within `search`: * @code{.cpp} * ... - * // Create a pooling memory resource with a pre-defined initial size. - * rmm::mr::pool_memory_resource mr( - * rmm::mr::get_current_device_resource(), 1024 * 1024); * // use default search parameters * ivf_flat::search_params search_params; * // Use the same allocator across multiple searches to reduce the number of * // cuda memory allocations - * ivf_flat::search(handle, search_params, index, queries1, N1, K, out_inds1, out_dists1, &mr); - * ivf_flat::search(handle, search_params, index, queries2, N2, K, out_inds2, out_dists2, &mr); - * ivf_flat::search(handle, search_params, index, queries3, N3, K, out_inds3, out_dists3, &mr); + * ivf_flat::search(handle, index, queries1, out_inds1, out_dists1, search_params, K); + * ivf_flat::search(handle, index, queries2, out_inds2, out_dists2, search_params, K); + * ivf_flat::search(handle, index, queries3, out_inds3, out_dists3, search_params, K); * ... * @endcode - * The exact size of the temporary buffer depends on multiple factors and is an implementation - * detail. However, you can safely specify a small initial size for the memory pool, so that only a - * few allocations happen to grow it during the first invocations of the `search`. 
* * @tparam value_t data element type * @tparam idx_t type of the indices diff --git a/cpp/include/raft/neighbors/ivf_pq.cuh b/cpp/include/raft/neighbors/ivf_pq.cuh index 1e32d5d7ba..5d619c5bec 100644 --- a/cpp/include/raft/neighbors/ivf_pq.cuh +++ b/cpp/include/raft/neighbors/ivf_pq.cuh @@ -37,7 +37,7 @@ namespace raft::neighbors::ivf_pq { * * Usage example: * @code{.cpp} - * using namespace raft::spatial::knn; + * using namespace raft::neighbors; * // use default index parameters * ivf_pq::index_params index_params; * // create and fill the index from a [N, D] dataset diff --git a/cpp/include/raft/solver/linear_assignment.cuh b/cpp/include/raft/solver/linear_assignment.cuh index 4c24dcbc29..3e17b557f2 100644 --- a/cpp/include/raft/solver/linear_assignment.cuh +++ b/cpp/include/raft/solver/linear_assignment.cuh @@ -39,8 +39,19 @@ namespace raft::solver { +/** + * @brief CUDA Implementation of O(n^3) alternating tree Hungarian Algorithm + * @note This is a port to RAFT from original authors Ketan Date and Rakesh Nagi + * + * @see Date, Ketan, and Rakesh Nagi. "GPU-accelerated Hungarian algorithms + * for the Linear Assignment Problem." Parallel Computing 57 (2016): 52-72. + * + * @tparam vertex_t + * @tparam weight_t + */ template class LinearAssignmentProblem { + private: vertex_t size_; vertex_t batchsize_; weight_t epsilon_; @@ -66,6 +77,13 @@ class LinearAssignmentProblem { rmm::device_uvector obj_val_dual_v; public: + /** + * @brief Constructor + * @param handle raft handle for managing resources + * @param size size of square matrix + * @param batchsize + * @param epsilon + */ LinearAssignmentProblem(raft::handle_t const& handle, vertex_t size, vertex_t batchsize, @@ -91,7 +109,12 @@ class LinearAssignmentProblem { { } - // Executes Hungarian algorithm on the input cost matrix. + /** + * Executes Hungarian algorithm on the input cost matrix. 
+ * @param d_cost_matrix + * @param d_row_assignment + * @param d_col_assignment + */ void solve(weight_t const* d_cost_matrix, vertex_t* d_row_assignment, vertex_t* d_col_assignment) { initializeDevice(); @@ -118,19 +141,31 @@ class LinearAssignmentProblem { d_costs_ = nullptr; } - // Function for getting optimal row dual vector for subproblem spId. + /** + * Function for getting optimal row dual vector for subproblem spId. + * @param spId + * @return + */ std::pair getRowDualVector(int spId) const { return std::make_pair(row_duals_v.data() + spId * size_, size_); } - // Function for getting optimal col dual vector for subproblem spId. + /** + * Function for getting optimal col dual vector for subproblem spId. + * @param spId + * @return + */ std::pair getColDualVector(int spId) { return std::make_pair(col_duals_v.data() + spId * size_, size_); } - // Function for getting optimal primal objective value for subproblem spId. + /** + * Function for getting optimal primal objective value for subproblem spId. + * @param spId + * @return + */ weight_t getPrimalObjectiveValue(int spId) { weight_t result; @@ -139,7 +174,11 @@ class LinearAssignmentProblem { return result; } - // Function for getting optimal dual objective value for subproblem spId. + /** + * Function for getting optimal dual objective value for subproblem spId. + * @param spId + * @return + */ weight_t getDualObjectiveValue(int spId) { weight_t result; diff --git a/cpp/include/raft/sparse/solver/mst.cuh b/cpp/include/raft/sparse/solver/mst.cuh index 33beeb1915..5f55a567ca 100644 --- a/cpp/include/raft/sparse/solver/mst.cuh +++ b/cpp/include/raft/sparse/solver/mst.cuh @@ -20,6 +20,29 @@ namespace raft::sparse::solver { +/** + * Compute the minimum spanning tree (MST) or minimum spanning forest (MSF) depending on + * the connected components of the given graph.
+ * + * @tparam vertex_t integral type for precision of vertex indexing + * @tparam edge_t integral type for precision of edge indexing + * @tparam weight_t type of weights array + * @tparam alteration_t type to use for random alteration + * + * @param handle + * @param offsets csr indptr array of row offsets (size v+1) + * @param indices csr array of column indices (size e) + * @param weights csr array of weights (size e) + * @param v number of vertices in graph + * @param e number of edges in graph + * @param color array to store resulting colors for MSF + * @param stream cuda stream for ordering operations + * @param symmetrize_output should the resulting output edge list be symmetrized? + * @param initialize_colors should the colors array be initialized inside the MST? + * @param iterations maximum number of iterations to perform + * @return a list of edges containing the mst (or a subset of the edges guaranteed to be in the mst + * when an msf is encountered) + */ template Graph_COO mst(const raft::handle_t& handle, edge_t const* offsets, diff --git a/cpp/include/raft/spatial/knn/ivf_flat.cuh b/cpp/include/raft/spatial/knn/ivf_flat.cuh index d7c3d80fb5..65b6f5ed4b 100644 --- a/cpp/include/raft/spatial/knn/ivf_flat.cuh +++ b/cpp/include/raft/spatial/knn/ivf_flat.cuh @@ -33,7 +33,6 @@ namespace raft::spatial::knn::ivf_flat { using raft::neighbors::ivf_flat::build; -using raft::neighbors::ivf_flat::build_index; using raft::neighbors::ivf_flat::extend; using raft::neighbors::ivf_flat::search; diff --git a/cpp/include/raft/stats/adjusted_rand_index.cuh b/cpp/include/raft/stats/adjusted_rand_index.cuh index e1b6a241c4..93fd07eb0b 100644 --- a/cpp/include/raft/stats/adjusted_rand_index.cuh +++ b/cpp/include/raft/stats/adjusted_rand_index.cuh @@ -31,8 +31,8 @@ namespace raft { namespace stats { /** - * @brief Function to calculate Adjusted RandIndex as described - * here + * @brief Function to calculate Adjusted RandIndex + * @see
https://en.wikipedia.org/wiki/Rand_index * @tparam T data-type for input label arrays * @tparam MathT integral data-type used for computing n-choose-r * @param firstClusterArray: the array of classes @@ -50,8 +50,8 @@ double adjusted_rand_index(const T* firstClusterArray, } /** - * @brief Function to calculate Adjusted RandIndex as described - * here + * @brief Function to calculate Adjusted RandIndex + * @see https://en.wikipedia.org/wiki/Rand_index * @tparam value_t data-type for input label arrays * @tparam math_t integral data-type used for computing n-choose-r * @tparam idx_t Index type of matrix extent. diff --git a/cpp/include/raft/stats/common.hpp b/cpp/include/raft/stats/common.hpp index 8392bd50fe..724ca224c6 100644 --- a/cpp/include/raft/stats/common.hpp +++ b/cpp/include/raft/stats/common.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,59 +13,10 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - #pragma once -#include - -// This file is a shameless amalgamation of independent works done by -// Lars Nyland and Andy Adinets - -///@todo: add cub's histogram as another option - -namespace raft { -namespace stats { - -/** Default mapper which just returns the value of the data itself */ -template -struct IdentityBinner { - DI int operator()(DataT val, IdxT row, IdxT col) { return int(val); } -}; - -/** Types of support histogram implementations */ -enum HistType { - /** shared mem atomics but with bins to be 1b int's */ - HistTypeSmemBits1 = 1, - /** shared mem atomics but with bins to be 2b int's */ - HistTypeSmemBits2 = 2, - /** shared mem atomics but with bins to be 4b int's */ - HistTypeSmemBits4 = 4, - /** shared mem atomics but with bins to ba 1B int's */ - HistTypeSmemBits8 = 8, - /** shared mem atomics but with bins to be 2B int's */ - HistTypeSmemBits16 = 16, - /** use only global atomics */ - HistTypeGmem, - /** uses shared mem atomics to reduce global traffic */ - HistTypeSmem, - /** - * uses shared mem atomics with match_any intrinsic to further reduce shared - * memory traffic. This can only be enabled on Volta and later architectures. - * If one tries to enable this for older arch's, it will fall back to - * `HistTypeSmem`. - * @note This is to be used only when the input dataset leads to a lot of - * repetitions in a given warp, else, this algo can be much slower than - * `HistTypeSmem`! - */ - HistTypeSmemMatchAny, - /** builds a hashmap of active bins in shared mem */ - HistTypeSmemHash, - /** decide at runtime the best algo for the given inputs */ - HistTypeAuto -}; - -/// Supported types of information criteria -enum IC_Type { AIC, AICc, BIC }; +#pragma message(__FILE__ \ + " is deprecated and will be removed in a future release." 
\ + " Please use the raft/stats/stats_types.hpp version instead.") -}; // end namespace stats -}; // end namespace raft +#include diff --git a/cpp/include/raft/stats/detail/histogram.cuh b/cpp/include/raft/stats/detail/histogram.cuh index 777e0b7816..69bd721ded 100644 --- a/cpp/include/raft/stats/detail/histogram.cuh +++ b/cpp/include/raft/stats/detail/histogram.cuh @@ -32,6 +32,12 @@ namespace raft { namespace stats { namespace detail { +/** Default mapper which just returns the value of the data itself */ +template +struct IdentityBinner { + DI int operator()(DataT val, IdxT row, IdxT col) { return int(val); } +}; + static const int ThreadsPerBlock = 256; template diff --git a/cpp/include/raft/stats/histogram.cuh b/cpp/include/raft/stats/histogram.cuh index df1c2772f1..8efb2e8df8 100644 --- a/cpp/include/raft/stats/histogram.cuh +++ b/cpp/include/raft/stats/histogram.cuh @@ -31,6 +31,14 @@ namespace raft { namespace stats { +/** + * Default mapper which just returns the value of the data itself + */ +template +struct IdentityBinner : public detail::IdentityBinner { + IdentityBinner() : detail::IdentityBinner() {} +}; + /** * @brief Perform histogram on the input data. It chooses the right load size * based on the input data vector length. It also supports large-bin cases diff --git a/cpp/include/raft/stats/stats_types.hpp b/cpp/include/raft/stats/stats_types.hpp new file mode 100644 index 0000000000..5db5ef1c57 --- /dev/null +++ b/cpp/include/raft/stats/stats_types.hpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft::stats { + +/** + * @brief Types of supported histogram implementations + */ +enum HistType { + /** shared mem atomics but with bins to be 1b int's */ + HistTypeSmemBits1 = 1, + /** shared mem atomics but with bins to be 2b int's */ + HistTypeSmemBits2 = 2, + /** shared mem atomics but with bins to be 4b int's */ + HistTypeSmemBits4 = 4, + /** shared mem atomics but with bins to be 1B int's */ + HistTypeSmemBits8 = 8, + /** shared mem atomics but with bins to be 2B int's */ + HistTypeSmemBits16 = 16, + /** use only global atomics */ + HistTypeGmem, + /** uses shared mem atomics to reduce global traffic */ + HistTypeSmem, + /** + * uses shared mem atomics with match_any intrinsic to further reduce shared + * memory traffic. This can only be enabled on Volta and later architectures. + * If one tries to enable this for older arch's, it will fall back to + * `HistTypeSmem`. + * @note This is to be used only when the input dataset leads to a lot of + * repetitions in a given warp, else, this algo can be much slower than + * `HistTypeSmem`! 
+ */ + HistTypeSmemMatchAny, + /** builds a hashmap of active bins in shared mem */ + HistTypeSmemHash, + /** decide at runtime the best algo for the given inputs */ + HistTypeAuto +}; + +/** + * @brief Supported types of information criteria + */ +enum IC_Type { AIC, AICc, BIC }; + +}; // end namespace raft::stats diff --git a/cpp/test/neighbors/ann_ivf_flat.cu b/cpp/test/neighbors/ann_ivf_flat.cu index 01af7ea0bd..3a5daff4bb 100644 --- a/cpp/test/neighbors/ann_ivf_flat.cu +++ b/cpp/test/neighbors/ann_ivf_flat.cu @@ -154,7 +154,7 @@ class AnnIVFFlatTest : public ::testing::TestWithParam> { auto database_view = raft::make_device_matrix_view( (const DataT*)database.data(), ps.num_db_vecs, ps.dim); - auto index = ivf_flat::build_index(handle_, database_view, index_params); + auto index = ivf_flat::build(handle_, database_view, index_params); rmm::device_uvector vector_indices(ps.num_db_vecs, stream_); thrust::sequence(handle_.get_thrust_policy(), diff --git a/cpp/test/neighbors/knn.cu b/cpp/test/neighbors/knn.cu index 710950e312..eb5ecf663f 100644 --- a/cpp/test/neighbors/knn.cu +++ b/cpp/test/neighbors/knn.cu @@ -94,7 +94,8 @@ class KNNTest : public ::testing::TestWithParam { auto distances = raft::make_device_matrix_view(distances_.data(), rows_, k_); - knn(handle, index, search, indices, distances, k_); + auto metric = raft::distance::DistanceType::L2Unexpanded; + knn(handle, index, search, indices, distances, k_, metric, std::make_optional(0)); build_actual_output<<>>( actual_labels_.data(), rows_, k_, search_labels_.data(), indices_.data()); diff --git a/docs/source/cpp_api.rst b/docs/source/cpp_api.rst index d10d9773a5..e3f650563d 100644 --- a/docs/source/cpp_api.rst +++ b/docs/source/cpp_api.rst @@ -1,6 +1,7 @@ -~~~~~~~~~~~~~~~~~~~~~~ -RAFT C++ API Reference -~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~ +C++ API Reference +~~~~~~~~~~~~~~~~~ + .. 
_api: diff --git a/docs/source/cpp_api/cluster.rst b/docs/source/cpp_api/cluster.rst index 41816482cc..0ecfe81bc3 100644 --- a/docs/source/cpp_api/cluster.rst +++ b/docs/source/cpp_api/cluster.rst @@ -1,7 +1,8 @@ Cluster ======= -This page provides C++ class references for the publicly-exposed elements of the cluster package. +This page provides C++ class references for the publicly-exposed elements of the `raft/cluster` headers. RAFT provides +fundamental clustering algorithms which are, themselves, considered reusable building blocks for other algorithms. K-Means ------- diff --git a/docs/source/cpp_api/core.rst b/docs/source/cpp_api/core.rst index d4891bf0b3..7228213d9b 100644 --- a/docs/source/cpp_api/core.rst +++ b/docs/source/cpp_api/core.rst @@ -1,7 +1,10 @@ Core ==== -This page provides C++ class references for the publicly-exposed elements of the core package. +This page provides C++ class references for the publicly-exposed elements of the `raft/core` package. The `raft/core` headers +require minimal dependencies, can be compiled without `nvcc`, and thus are safe to expose on your own public APIs. Aside from +the headers in the `raft/core` include directory, any headers in the codebase with the suffix `_types.hpp` are also safe to +expose in public APIs. handle_t @@ -34,30 +37,67 @@ mdarray :project: RAFT :members: -.. doxygenclass:: raft::make_device_matrix + +Device Factories +---------------- + +.. doxygenfunction:: raft::make_device_matrix :project: RAFT -.. doxygenclass:: raft::make_device_vector +.. doxygenfunction:: raft::make_device_vector :project: RAFT -.. doxygenclass:: raft::make_device_scalar +.. doxygenfunction:: raft::make_device_scalar :project: RAFT -.. doxygenclass:: raft::make_host_matrix +Host Factories +---------------- + +.. doxygenfunction:: raft::make_host_matrix :project: RAFT -.. doxygenclass:: raft::make_host_vector +.. doxygenfunction:: raft::make_host_vector :project: RAFT -.. doxygenclass:: raft::make_device_scalar +.. 
doxygenfunction:: raft::make_host_scalar :project: RAFT mdspan ####### -.. doxygenfunction:: raft::make_device_mdspan - :project: RAFT +Device Vocabulary +----------------- + +.. doxygentypedef:: raft::device_mdspan + :project: RAFT + +.. doxygentypedef:: raft::device_matrix_view + :project: RAFT + +.. doxygentypedef:: raft::device_vector_view + :project: RAFT + +.. doxygentypedef:: raft::device_scalar_view + :project: RAFT + +Host Vocabulary +--------------- + +.. doxygentypedef:: raft::host_mdspan + :project: RAFT + +.. doxygentypedef:: raft::host_matrix_view + :project: RAFT + +.. doxygentypedef:: raft::host_vector_view + :project: RAFT + +.. doxygentypedef:: raft::host_scalar_view + :project: RAFT + +Device Factories +---------------- .. doxygenfunction:: raft::make_device_matrix_view :project: RAFT @@ -68,6 +108,9 @@ mdspan .. doxygenfunction:: raft::make_device_scalar_view :project: RAFT +Host Factories +-------------- + .. doxygenfunction:: raft::make_host_matrix_view :project: RAFT @@ -80,18 +123,22 @@ mdspan span #### -.. doxygenclass:: raft::device_span - :project: RAFT - :members: +.. doxygentypedef:: raft::device_span + :project: RAFT + +.. doxygentypedef:: raft::host_span + :project: RAFT -.. doxygenclass:: raft::host_span +.. doxygenclass:: raft::span :project: RAFT :members: + + Key-Value Pair ############## -.. doxygenclass:: raft::KeyValuePair +.. doxygenstruct:: raft::KeyValuePair :project: RAFT :members: diff --git a/docs/source/cpp_api/distance.rst b/docs/source/cpp_api/distance.rst index c2bce860d5..2596361f6a 100644 --- a/docs/source/cpp_api/distance.rst +++ b/docs/source/cpp_api/distance.rst @@ -1,7 +1,8 @@ Distance ======== -This page provides C++ class references for the publicly-exposed elements of the distance package. +This page provides C++ class references for the publicly-exposed elements of the `raft/distance` package. RAFT's +distances have been highly optimized and support a wide assortment of different distance measures. 
Distance ######## diff --git a/docs/source/cpp_api/linalg.rst b/docs/source/cpp_api/linalg.rst index f9986fd2ce..5664e5b3dc 100644 --- a/docs/source/cpp_api/linalg.rst +++ b/docs/source/cpp_api/linalg.rst @@ -1,7 +1,10 @@ Linear Algebra ============== -This page provides C++ class references for the publicly-exposed elements of the (dense) linear algebra package. +This page provides C++ class references for the publicly-exposed elements of the `raft/linalg` (dense) linear algebra headers. +In addition to providing highly optimized arithmetic and matrix/vector operations, RAFT provides a consistent user experience +by providing common BLAS routines, standard linear system solvers, factorization and eigenvalue solvers. Some of these routines +hide the complexities of lower-level C-based libraries provided in the CUDA toolkit. .. doxygennamespace:: raft::linalg :project: RAFT diff --git a/docs/source/cpp_api/matrix.rst b/docs/source/cpp_api/matrix.rst index 65534aa6ee..945658eb7b 100644 --- a/docs/source/cpp_api/matrix.rst +++ b/docs/source/cpp_api/matrix.rst @@ -1,7 +1,8 @@ Matrix ====== -This page provides C++ class references for the publicly-exposed elements of the matrix package. +This page provides C++ class references for the publicly-exposed elements of the `raft/matrix` headers. The `raft/matrix` +headers cover many operations on matrices that are otherwise not covered by `raft/linalg`. .. doxygennamespace:: raft::matrix :project: RAFT diff --git a/docs/source/cpp_api/solver.rst b/docs/source/cpp_api/solver.rst index a8b93ca046..f7ca244dc8 100644 --- a/docs/source/cpp_api/solver.rst +++ b/docs/source/cpp_api/solver.rst @@ -1,7 +1,7 @@ -Optimization -============ +Solvers +======= -This page provides C++ class references for the publicly-exposed elements of the optimization package. +This page provides C++ class references for the publicly-exposed elements of the iterative and combinatorial solvers package. 
Linear Assignment Problem diff --git a/docs/source/cpp_api/sparse.rst b/docs/source/cpp_api/sparse.rst index c0ea61c6f7..a7c32cc65d 100644 --- a/docs/source/cpp_api/sparse.rst +++ b/docs/source/cpp_api/sparse.rst @@ -4,7 +4,6 @@ Sparse This page provides C++ class references for the publicly-exposed elements of the sparse package. - Conversion ########## @@ -26,20 +25,16 @@ Linear Algebra :project: RAFT :members: -Misc Operations -############### +Matrix Operations +################# .. doxygennamespace:: raft::sparse::op :project: RAFT :members: -Selection -######### - -.. doxygennamespace:: raft::sparse::selection - :project: RAFT - :members: +Nearest Neighbors +################# -.. doxygennamespace:: raft::linkage +.. doxygennamespace:: raft::sparse::neighbors :project: RAFT :members: diff --git a/docs/source/index.rst b/docs/source/index.rst index 0d7ab295f4..fb7ce310c8 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,15 +1,48 @@ Welcome to RAFT's documentation! ================================= -RAFT contains fundamental widely-used algorithms and primitives for data science and machine learning. +RAFT contains fundamental widely-used algorithms and primitives for scientific computing, data science and machine learning. The algorithms are CUDA-accelerated and form building-blocks for rapidly composing analytics. + +By taking a primitives-based approach to algorithm development, RAFT + +- accelerates algorithm construction time +- reduces the maintenance burden by maximizing reuse across projects, and +- centralizes core reusable computations, allowing future optimizations to benefit all algorithms that use them. + + +While not exhaustive, the following general categories help summarize the accelerated building blocks that RAFT contains: + +.. 
list-table:: + :widths: 25 50 + :header-rows: 1 + + * - Category + - Examples + * - Data Formats + - sparse & dense, conversions, data generation + * - Dense Operations + - linear algebra, matrix and vector operations, slicing, norms, factorization, least squares, svd & eigenvalue problems + * - Sparse Operations + - linear algebra, arithmetic, eigenvalue problems, slicing, symmetrization, components & labeling + * - Spatial + - pairwise distances, nearest neighbors, neighborhood graph construction + * - Basic Clustering + - spectral clustering, hierarchical clustering, k-means + * - Solvers + - combinatorial optimization, iterative solvers + * - Statistics + - sampling, moments and summary statistics, metrics + * - Tools & Utilities + - common utilities for developing CUDA applications, multi-node multi-gpu infrastructure .. toctree:: :maxdepth: 2 :caption: Contents: + quick_start.md cpp_api.rst - raft_dask_api.rst pylibraft_api.rst + raft_dask_api.rst Indices and tables diff --git a/docs/source/quick_start.md b/docs/source/quick_start.md new file mode 100644 index 0000000000..a09f7beceb --- /dev/null +++ b/docs/source/quick_start.md @@ -0,0 +1,129 @@ +# Quick Start + + +This guide is meant to provide a quick-start tutorial for interacting with RAFT's C++ APIs. + +## RAPIDS Memory Manager (RMM) + +RAFT relies heavily on RMM which eases the burden of configuring different allocation strategies globally across the libraries that use it. + +## Multi-dimensional Spans and Arrays + +The APIs in RAFT currently accept raw pointers to device memory and we are in the process of simplifying the APIs with the [mdspan](https://arxiv.org/abs/2010.06474) multi-dimensional array view for representing data in higher dimensions similar to the `ndarray` in the Numpy Python library. RAFT also contains the corresponding owning `mdarray` structure, which simplifies the allocation and management of multi-dimensional data in both host and device (GPU) memory. 
+ +The `mdarray` forms a convenience layer over RMM and can be constructed in RAFT using a number of different helper functions: + +```c++ +#include + +int n_rows = 10; +int n_cols = 10; + +auto scalar = raft::make_device_scalar(handle, 1.0); +auto vector = raft::make_device_vector(handle, n_cols); +auto matrix = raft::make_device_matrix(handle, n_rows, n_cols); +``` + +The `mdspan` is a lightweight non-owning view that can wrap around any pointer, maintaining shape, layout, and indexing information for accessing elements. + + +We can construct `mdspan` instances directly from the above `mdarray` instances: + +```c++ +// Scalar mdspan on device +auto scalar_view = scalar.view(); + +// Vector mdspan on device +auto vector_view = vector.view(); + +// Matrix mdspan on device +auto matrix_view = matrix.view(); +``` +Since the `mdspan` is just a lightweight wrapper, we can also construct it from the underlying data handles in the `mdarray` instances above. We use the extent to get information about the `mdarray` or `mdspan`'s shape. + +```c++ +#include + +auto scalar_view = raft::make_device_scalar_view(scalar.data_handle()); +auto vector_view = raft::make_device_vector_view(vector.data_handle(), vector.extent(0)); +auto matrix_view = raft::make_device_matrix_view(matrix.data_handle(), matrix.extent(0), matrix.extent(1)); +``` + +Of course, RAFT's `mdspan`/`mdarray` APIs aren't just limited to the `device`. 
You can also create `host` variants: + +```c++ +#include +#include + +int n_rows = 10; +int n_cols = 10; + +auto scalar = raft::make_host_scalar(handle, 1.0); +auto vector = raft::make_host_vector(handle, n_cols); +auto matrix = raft::make_host_matrix(handle, n_rows, n_cols); + +auto scalar_view = raft::make_host_scalar_view(scalar.data_handle()); +auto vector_view = raft::make_host_vector_view(vector.data_handle(), vector.extent(0)); +auto matrix_view = raft::make_host_matrix_view(matrix.data_handle(), matrix.extent(0), matrix.extent(1)); +``` + +And `managed` variants: + +```c++ +#include + +int n_rows = 10; +int n_cols = 10; + +auto matrix = raft::make_managed_mdspan(managed_ptr, raft::make_matrix_extents(n_rows, n_cols)); +``` + + +## C++ Example + +Most of the primitives in RAFT accept a `raft::handle_t` object for the management of resources which are expensive to create, such as CUDA streams, stream pools, and handles to other CUDA libraries like `cublas` and `cusolver`. + +The example below demonstrates creating a RAFT handle and using it with `device_matrix` and `device_vector` to allocate memory, generating random clusters, and computing +pairwise Euclidean distances: + +```c++ +#include +#include +#include +#include + +raft::handle_t handle; + +int n_samples = 5000; +int n_features = 50; + +auto input = raft::make_device_matrix(handle, n_samples, n_features); +auto labels = raft::make_device_vector(handle, n_samples); +auto output = raft::make_device_matrix(handle, n_samples, n_samples); + +raft::random::make_blobs(handle, input.view(), labels.view()); + +auto metric = raft::distance::DistanceType::L2SqrtExpanded; +raft::distance::pairwise_distance(handle, input.view(), input.view(), output.view(), metric); +``` + +## Python Example + +The `pylibraft` package contains a Python API for RAFT algorithms and primitives. 
`pylibraft` integrates nicely into other libraries by being very lightweight with minimal dependencies and accepting any object that supports the `__cuda_array_interface__`, such as [CuPy's ndarray](https://docs.cupy.dev/en/stable/user_guide/interoperability.html#rmm). The package is currently limited to pairwise distances and RMAT graph generation, but we will continue adding more in future releases. + +The example below demonstrates computing the pairwise Euclidean distances between CuPy arrays. `pylibraft` is a low-level API that prioritizes efficiency and simplicity over being pythonic, which is shown here by pre-allocating the output memory before invoking the `pairwise_distance` function. Note that CuPy is not a required dependency for `pylibraft`. + +```python +import cupy as cp + +from pylibraft.distance import pairwise_distance + +n_samples = 5000 +n_features = 50 + +in1 = cp.random.random_sample((n_samples, n_features), dtype=cp.float32) +in2 = cp.random.random_sample((n_samples, n_features), dtype=cp.float32) +output = cp.empty((n_samples, n_samples), dtype=cp.float32) + +pairwise_distance(in1, in2, output, metric="euclidean") +``` From 004af044465a1be73c5c0e56b680fee771845605 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 21 Oct 2022 18:53:50 -0400 Subject: [PATCH 10/14] Fixing style --- cpp/include/raft/cluster/kmeans.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/raft/cluster/kmeans.cuh b/cpp/include/raft/cluster/kmeans.cuh index 9d19e24806..c109cba713 100644 --- a/cpp/include/raft/cluster/kmeans.cuh +++ b/cpp/include/raft/cluster/kmeans.cuh @@ -982,7 +982,7 @@ void fit_main(const raft::handle_t& handle, handle, params, X, weight, centroidsRawData, inertia, n_iter, workspace); } -}; // end namespace raft::cluster::kmeans +}; // namespace raft::cluster namespace raft::cluster { From cd2b0b448309fc951790c5941202d3226d0a7584 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Fri, 21 Oct 2022 21:00:38 -0400 Subject: [PATCH 11/14] Fixing bad merge --- cpp/include/raft/cluster/kmeans.cuh | 495 +--------------------------- 1 file changed, 14 insertions(+), 481 deletions(-) diff --git a/cpp/include/raft/cluster/kmeans.cuh b/cpp/include/raft/cluster/kmeans.cuh index c109cba713..ef1fb44dfd 100644 --- a/cpp/include/raft/cluster/kmeans.cuh +++ b/cpp/include/raft/cluster/kmeans.cuh @@ -23,6 +23,19 @@ namespace raft::cluster::kmeans { +/** + * Functor used for sampling centroids + */ +template +using SamplingOp = detail::SamplingOp; + +/** + * Functor used to extract the index from a KeyValue pair + * storing both index and a distance. + */ +template +using KeyValueIndexOp = detail::KeyValueIndexOp; + /** * @brief Find clusters with k-means algorithm. * Initial centroids are chosen with k-means++ algorithm. Empty @@ -247,12 +260,6 @@ void transform(const raft::handle_t& handle, handle, params, X, centroids, n_samples, n_features, X_new); } -template -using SamplingOp = detail::SamplingOp; - -template -using KeyValueIndexOp = detail::KeyValueIndexOp; - /** * @brief Select centroids according to a sampling operation * @@ -512,480 +519,6 @@ void fit_main(const raft::handle_t& handle, namespace raft::cluster { -/** - * Note: All of the functions below in raft::cluster are deprecated and will - * be removed in a future release. Please use raft::cluster::kmeans instead. - */ - -/** - * @brief Find clusters with k-means algorithm. - * Initial centroids are chosen with k-means++ algorithm. Empty - * clusters are reinitialized by choosing new centroids with - * k-means++ algorithm. - * @tparam DataT the type of data used for weights, distances. - * @tparam IndexT the type of data used for indexing. - * @param[in] handle The raft handle. - * @param[in] params Parameters for KMeans model. - * @param[in] X Training instances to cluster. The data must - * be in row-major format. 
- * [dim = n_samples x n_features] - * @param[in] sample_weight Optional weights for each observation in X. - * [len = n_samples] - * @param[inout] centroids [in] When init is InitMethod::Array, use - * centroids as the initial cluster centers. - * [out] The generated centroids from the - * kmeans algorithm are stored at the address - * pointed by 'centroids'. - * [dim = n_clusters x n_features] - * @param[out] inertia Sum of squared distances of samples to their - * closest cluster center. - * @param[out] n_iter Number of iterations run. - */ -template -void fit(handle_t const& handle, - const KMeansParams& params, - raft::device_matrix_view X, - std::optional> sample_weight, - raft::device_matrix_view centroids, - raft::host_scalar_view inertia, - raft::host_scalar_view n_iter) -{ - kmeans::fit(handle, params, X, sample_weight, centroids, inertia, n_iter); -} - -template -void fit(handle_t const& handle, - const KMeansParams& params, - const DataT* X, - const DataT* sample_weight, - DataT* centroids, - IndexT n_samples, - IndexT n_features, - DataT& inertia, - IndexT& n_iter) -{ - kmeans::fit( - handle, params, X, sample_weight, centroids, n_samples, n_features, inertia, n_iter); -} - -/** - * @brief Predict the closest cluster each sample in X belongs to. - * @tparam DataT the type of data used for weights, distances. - * @tparam IndexT the type of data used for indexing. - * @param[in] handle The raft handle. - * @param[in] params Parameters for KMeans model. - * @param[in] X New data to predict. - * [dim = n_samples x n_features] - * @param[in] sample_weight Optional weights for each observation in X. - * [len = n_samples] - * @param[in] centroids Cluster centroids. The data must be in - * row-major format. - * [dim = n_clusters x n_features] - * @param[in] normalize_weight True if the weights should be normalized - * @param[out] labels Index of the cluster each sample in X - * belongs to. 
- * [len = n_samples] - * @param[out] inertia Sum of squared distances of samples to - * their closest cluster center. - */ -template -void predict(handle_t const& handle, - const KMeansParams& params, - raft::device_matrix_view X, - std::optional> sample_weight, - raft::device_matrix_view centroids, - raft::device_vector_view labels, - bool normalize_weight, - raft::host_scalar_view inertia) -{ - kmeans::predict( - handle, params, X, sample_weight, centroids, labels, normalize_weight, inertia); -} - -template -void predict(handle_t const& handle, - const KMeansParams& params, - const DataT* X, - const DataT* sample_weight, - const DataT* centroids, - IndexT n_samples, - IndexT n_features, - IndexT* labels, - bool normalize_weight, - DataT& inertia) -{ - kmeans::predict(handle, - params, - X, - sample_weight, - centroids, - n_samples, - n_features, - labels, - normalize_weight, - inertia); -} - -/** - * @brief Compute k-means clustering and predicts cluster index for each sample - * in the input. - * - * @tparam DataT the type of data used for weights, distances. - * @tparam IndexT the type of data used for indexing. - * @param[in] handle The raft handle. - * @param[in] params Parameters for KMeans model. - * @param[in] X Training instances to cluster. The data must be - * in row-major format. - * [dim = n_samples x n_features] - * @param[in] sample_weight Optional weights for each observation in X. - * [len = n_samples] - * @param[inout] centroids Optional - * [in] When init is InitMethod::Array, use - * centroids as the initial cluster centers - * [out] The generated centroids from the - * kmeans algorithm are stored at the address - * pointed by 'centroids'. - * [dim = n_clusters x n_features] - * @param[out] labels Index of the cluster each sample in X belongs - * to. - * [len = n_samples] - * @param[out] inertia Sum of squared distances of samples to their - * closest cluster center. - * @param[out] n_iter Number of iterations run. 
- */ -template -void fit_predict(handle_t const& handle, - const KMeansParams& params, - raft::device_matrix_view X, - std::optional> sample_weight, - std::optional> centroids, - raft::device_vector_view labels, - raft::host_scalar_view inertia, - raft::host_scalar_view n_iter) -{ - kmeans::fit_predict( - handle, params, X, sample_weight, centroids, labels, inertia, n_iter); -} - -template -void fit_predict(handle_t const& handle, - const KMeansParams& params, - const DataT* X, - const DataT* sample_weight, - DataT* centroids, - IndexT n_samples, - IndexT n_features, - IndexT* labels, - DataT& inertia, - IndexT& n_iter) -{ - kmeans::fit_predict( - handle, params, X, sample_weight, centroids, n_samples, n_features, labels, inertia, n_iter); -} - -/** - * @brief Transform X to a cluster-distance space. - * - * @tparam DataT the type of data used for weights, distances. - * @tparam IndexT the type of data used for indexing. - * @param[in] handle The raft handle. - * @param[in] params Parameters for KMeans model. - * @param[in] X Training instances to cluster. The data must - * be in row-major format - * [dim = n_samples x n_features] - * @param[in] centroids Cluster centroids. The data must be in row-major format. - * [dim = n_clusters x n_features] - * @param[out] X_new X transformed in the new space. 
- * [dim = n_samples x n_features] - */ -template -void transform(const raft::handle_t& handle, - const KMeansParams& params, - raft::device_matrix_view X, - raft::device_matrix_view centroids, - raft::device_matrix_view X_new) -{ - kmeans::transform(handle, params, X, centroids, X_new); -} - -template -void transform(const raft::handle_t& handle, - const KMeansParams& params, - const DataT* X, - const DataT* centroids, - IndexT n_samples, - IndexT n_features, - DataT* X_new) -{ - kmeans::transform(handle, params, X, centroids, n_samples, n_features, X_new); -} - -template -using SamplingOp = kmeans::SamplingOp; - -template -using KeyValueIndexOp = kmeans::KeyValueIndexOp; - -/** - * @brief Select centroids according to a sampling operation - * - * @tparam DataT the type of data used for weights, distances. - * @tparam IndexT the type of data used for indexing. - * - * @param[in] handle The raft handle - * @param[in] X The data in row-major format - * [dim = n_samples x n_features] - * @param[in] minClusterDistance Distance for every sample to it's nearest centroid - * [dim = n_samples] - * @param[in] isSampleCentroid Flag the sample choosen as initial centroid - * [dim = n_samples] - * @param[in] select_op The sampling operation used to select the centroids - * @param[out] inRankCp The sampled centroids - * [dim = n_selected_centroids x n_features] - * @param[in] workspace Temporary workspace buffer which can get resized - * - */ -template -void sample_centroids(const raft::handle_t& handle, - raft::device_matrix_view X, - raft::device_vector_view minClusterDistance, - raft::device_vector_view isSampleCentroid, - SamplingOp& select_op, - rmm::device_uvector& inRankCp, - rmm::device_uvector& workspace) -{ - detail::sampleCentroids( - handle, X, minClusterDistance, isSampleCentroid, select_op, inRankCp, workspace); -} - -/** - * @brief Compute cluster cost - * - * @tparam DataT the type of data used for weights, distances. 
- * @tparam ReductionOpT the type of data used for the reduction operation. - * - * @param[in] handle The raft handle - * @param[in] minClusterDistance Distance for every sample to it's nearest centroid - * [dim = n_samples] - * @param[in] workspace Temporary workspace buffer which can get resized - * @param[out] clusterCost Resulting cluster cost - * @param[in] reduction_op The reduction operation used for the cost - * - */ -template -void cluster_cost(const raft::handle_t& handle, - raft::device_vector_view minClusterDistance, - rmm::device_uvector workspace, - raft::device_scalar_view clusterCost, - ReductionOpT reduction_op) -{ - detail::computeClusterCost( - handle, minClusterDistance, workspace, clusterCost, reduction_op); -} - -/** - * @brief Compute distance for every sample to it's nearest centroid - * - * @tparam DataT the type of data used for weights, distances. - * @tparam IndexT the type of data used for indexing. - * - * @param[in] handle The raft handle - * @param[in] params The parameters for KMeans - * @param[in] X The data in row-major format - * [dim = n_samples x n_features] - * @param[in] centroids Centroids data - * [dim = n_cluster x n_features] - * @param[out] minClusterDistance Distance for every sample to it's nearest centroid - * [dim = n_samples] - * @param[in] L2NormX L2 norm of X : ||x||^2 - * [dim = n_samples] - * @param[out] L2NormBuf_OR_DistBuf Resizable buffer to store L2 norm of centroids or distance - * matrix - * @param[in] workspace Temporary workspace buffer which can get resized - * - */ -template -void min_cluster_distance(const raft::handle_t& handle, - const KMeansParams& params, - raft::device_matrix_view X, - raft::device_matrix_view centroids, - raft::device_vector_view minClusterDistance, - raft::device_vector_view L2NormX, - rmm::device_uvector& L2NormBuf_OR_DistBuf, - rmm::device_uvector& workspace) -{ - detail::minClusterDistanceCompute( - handle, params, X, centroids, minClusterDistance, L2NormX, 
L2NormBuf_OR_DistBuf, workspace); -} - -/** - * @brief Calculates a pair for every sample in input 'X' where key is an - * index of one of the 'centroids' (index of the nearest centroid) and 'value' - * is the distance between the sample and the 'centroid[key]' - * - * @tparam DataT the type of data used for weights, distances. - * @tparam IndexT the type of data used for indexing. - * - * @param[in] handle The raft handle - * @param[in] params The parameters for KMeans - * @param[in] X The data in row-major format - * [dim = n_samples x n_features] - * @param[in] centroids Centroids data - * [dim = n_cluster x n_features] - * @param[out] minClusterAndDistance Distance vector that contains for every sample, the nearest - * centroid and it's distance - * [dim = n_samples] - * @param[in] L2NormX L2 norm of X : ||x||^2 - * [dim = n_samples] - * @param[out] L2NormBuf_OR_DistBuf Resizable buffer to store L2 norm of centroids or distance - * matrix - * @param[in] workspace Temporary workspace buffer which can get resized - * - */ -template -void min_cluster_and_distance( - const raft::handle_t& handle, - const KMeansParams& params, - raft::device_matrix_view X, - raft::device_matrix_view centroids, - raft::device_vector_view, IndexT> minClusterAndDistance, - raft::device_vector_view L2NormX, - rmm::device_uvector& L2NormBuf_OR_DistBuf, - rmm::device_uvector& workspace) -{ - detail::minClusterAndDistanceCompute( - handle, params, X, centroids, minClusterAndDistance, L2NormX, L2NormBuf_OR_DistBuf, workspace); -} - -/** - * @brief Shuffle and randomly select 'n_samples_to_gather' from input 'in' and stores - * in 'out' does not modify the input - * - * @tparam DataT the type of data used for weights, distances. - * @tparam IndexT the type of data used for indexing. 
- * - * @param[in] handle The raft handle - * @param[in] in The data to shuffle and gather - * [dim = n_samples x n_features] - * @param[out] out The sampled data - * [dim = n_samples_to_gather x n_features] - * @param[in] n_samples_to_gather Number of sample to gather - * @param[in] seed Seed for the shuffle - * @param[in] workspace Temporary workspace buffer which can get resized - * - */ -template -void shuffle_and_gather(const raft::handle_t& handle, - raft::device_matrix_view in, - raft::device_matrix_view out, - uint32_t n_samples_to_gather, - uint64_t seed, - rmm::device_uvector* workspace = nullptr) -{ - detail::shuffleAndGather(handle, in, out, n_samples_to_gather, seed, workspace); -} - -/** - * @brief Count the number of samples in each cluster - * - * @tparam DataT the type of data used for weights, distances. - * @tparam IndexT the type of data used for indexing. - * - * @param[in] handle The raft handle - * @param[in] params The parameters for KMeans - * @param[in] X The data in row-major format - * [dim = n_samples x n_features] - * @param[in] L2NormX L2 norm of X : ||x||^2 - * [dim = n_samples] - * @param[in] centroids Centroids data - * [dim = n_cluster x n_features] - * @param[in] workspace Temporary workspace buffer which can get resized - * @param[out] sampleCountInCluster The count for each centroid - * [dim = n_cluster] - * - */ -template -void count_samples_in_cluster(const raft::handle_t& handle, - const KMeansParams& params, - raft::device_matrix_view X, - raft::device_vector_view L2NormX, - raft::device_matrix_view centroids, - rmm::device_uvector& workspace, - raft::device_vector_view sampleCountInCluster) -{ - detail::countSamplesInCluster( - handle, params, X, L2NormX, centroids, workspace, sampleCountInCluster); -} - -/* - * @brief Selects 'n_clusters' samples from the input X using kmeans++ algorithm. - - * @note This is the algorithm described in - * "k-means++: the advantages of careful seeding". 2007, Arthur, D. 
and Vassilvitskii, S. - * ACM-SIAM symposium on Discrete algorithms. - * - * @tparam DataT the type of data used for weights, distances. - * @tparam IndexT the type of data used for indexing. - * - * @param[in] handle The raft handle - * @param[in] params The parameters for KMeans - * @param[in] X The data in row-major format - * [dim = n_samples x n_features] - * @param[out] centroids Centroids data - * [dim = n_cluster x n_features] - * @param[in] workspace Temporary workspace buffer which can get resized - */ -template -void init_plus_plus(const raft::handle_t& handle, - const KMeansParams& params, - raft::device_matrix_view X, - raft::device_matrix_view centroidsRawData, - rmm::device_uvector& workspace) -{ - detail::kmeansPlusPlus(handle, params, X, centroidsRawData, workspace); -} - -/* - * @brief Main function used to fit KMeans (after cluster initialization) - * - * @tparam DataT the type of data used for weights, distances. - * @tparam IndexT the type of data used for indexing. - * - * @param[in] handle The raft handle. - * @param[in] params Parameters for KMeans model. - * @param[in] X Training instances to cluster. The data must - * be in row-major format. - * [dim = n_samples x n_features] - * @param[in] sample_weight Weights for each observation in X. - * [len = n_samples] - * @param[inout] centroids [in] Initial cluster centers. - * [out] The generated centroids from the - * kmeans algorithm are stored at the address - * pointed by 'centroids'. - * [dim = n_clusters x n_features] - * @param[out] inertia Sum of squared distances of samples to their - * closest cluster center. - * @param[out] n_iter Number of iterations run. 
- * @param[in] workspace Temporary workspace buffer which can get resized - */ -template -void fit_main(const raft::handle_t& handle, - const KMeansParams& params, - raft::device_matrix_view X, - raft::device_vector_view weight, - raft::device_matrix_view centroidsRawData, - raft::host_scalar_view inertia, - raft::host_scalar_view n_iter, - rmm::device_uvector& workspace) -{ - detail::kmeans_fit_main( - handle, params, X, weight, centroidsRawData, inertia, n_iter, workspace); -} - -}; // namespace raft::cluster - -namespace raft::cluster { - /** * Note: All of the functions below in raft::cluster are deprecated and will * be removed in a future release. Please use raft::cluster::kmeans instead. @@ -1194,7 +727,7 @@ void kmeans_transform(const raft::handle_t& handle, kmeans::transform(handle, params, X, centroids, n_samples, n_features, X_new); } -template +template using SamplingOp = kmeans::SamplingOp; template From 41445e5136a687b9c7c54ce68fed990c4c5462ef Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Sat, 22 Oct 2022 14:42:33 -0400 Subject: [PATCH 12/14] moving build instructions to main docs. This should make them much more accessable for the users. --- README.md | 8 ++--- docs/README.md | 2 +- BUILD.md => docs/source/build.md | 60 +++++++++++--------------------- docs/source/index.rst | 1 + docs/source/quick_start.md | 3 +- 5 files changed, 28 insertions(+), 46 deletions(-) rename BUILD.md => docs/source/build.md (88%) diff --git a/README.md b/README.md index cc32e4d404..79ab874c27 100755 --- a/README.md +++ b/README.md @@ -99,7 +99,7 @@ pairwise_distance(in1, in2, output, metric="euclidean") ## Installing -RAFT itself can be installed through conda, [Cmake Package Manager (CPM)](https://github.com/cpm-cmake/CPM.cmake), or by building the repository from source. Please refer to the [build instructions](BUILD.md) for more a comprehensive guide on building RAFT and using it in downstream projects. 
+RAFT itself can be installed through conda, [Cmake Package Manager (CPM)](https://github.com/cpm-cmake/CPM.cmake), or by building the repository from source. Please refer to the [build instructions](docs/source/build.md) for a more comprehensive guide on building RAFT and using it in downstream projects. ### Conda @@ -119,7 +119,7 @@ You can also install the `libraft-*` conda packages individually using the `mamb After installing RAFT, `find_package(raft COMPONENTS nn distance)` can be used in your CUDA/C++ cmake build to compile and/or link against needed dependencies in your raft target. `COMPONENTS` are optional and will depend on the packages installed. -### CPM +### Cmake & CPM RAFT uses the [RAPIDS-CMake](https://github.com/rapidsai/rapids-cmake) library, which makes it simple to include in downstream cmake projects. RAPIDS CMake provides a convenience layer around CPM. @@ -186,7 +186,7 @@ mamba activate raft_dev_env ./build.sh raft-dask pylibraft libraft tests bench --compile-libs ``` -The [build](BUILD.md) instructions contain more details on building RAFT from source and including it in downstream projects. You can also find a more comprehensive version of the above CPM code snippet the [Building RAFT C++ from source](BUILD.md#build_cxx_source) section of the build instructions. +The [build](docs/source/build.md) instructions contain more details on building RAFT from source and including it in downstream projects. You can also find a more comprehensive version of the above CPM code snippet in the [Building RAFT C++ from source](docs/source/build.md#building-raft-c-from-source-in-cmake) section of the build instructions.
## Folder Structure and Contents @@ -220,7 +220,7 @@ The folder structure mirrors other RAPIDS repos, with the following folders: - `scripts`: Helpful scripts for development - `src`: Compiled APIs and template specializations for the shared libraries - `test`: Googletests source code -- `docs`: Source code and scripts for building library documentation (doxygen + pydocs) +- `docs`: Source code and scripts for building library documentation (Uses breathe, doxygen, & pydocs) - `python`: Source code for Python libraries. - `pylibraft`: Python build and source code for pylibraft library - `raft-dask`: Python build and source code for raft-dask library diff --git a/docs/README.md b/docs/README.md index ced8e63938..a09ccf41eb 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,7 +1,7 @@ # Building Documentation ## Building locally: -#### [Build and install RAFT](../BUILD.md) +#### [Build and install RAFT](source/build.md) #### Generate the docs ```shell script diff --git a/BUILD.md b/docs/source/build.md similarity index 88% rename from BUILD.md rename to docs/source/build.md index d38db90249..b75e67d82f 100644 --- a/BUILD.md +++ b/docs/source/build.md @@ -1,38 +1,21 @@ -# RAFT Build and Development Guide - -- [Building and installing RAFT](#build_install) - - [CUDA/GPU Requirements](#cuda_gpu_req) - - [Build Dependencies](#required_depenencies) - - [Header-only C++](#install_header_only_cpp) - - [C++ Shared Libraries](#shared_cpp_libs) - - [Improving Rebuild Times](#ccache) - - [Googletests](#gtests) - - [Googlebench](#gbench) - - [C++ Using Cmake](#cpp_using_cmake) - - [Python](#python) - - [Documentation](#docs) -- [Using RAFT in downstream projects](#use_raft) - - [Cmake Header-only Integration](#cxx_integration) - - [Using Shared Libraries in Cmake](#use_shared_libs) - - [Building RAFT C++ from source](#build_cxx_source) - - [Python/Cython Integration](#py_integration) - -## Building and installing RAFT - -### CUDA/GPU Requirements +# Install Guide + +## Building
and installing RAFT + +### CUDA/GPU Requirements - CUDA Toolkit 11.0+ - NVIDIA driver 450.80.02+ - Pascal architecture of better (compute capability >= 6.0) -### Build Dependencies +### Build Dependencies In addition to the libraries included with cudatoolkit 11.0+, there are some other dependencies below for building RAFT from source. Many of the dependencies are optional and depend only on the primitives being used. All of these can be installed with cmake or [rapids-cpm](https://github.com/rapidsai/rapids-cmake#cpm) and many of them can be installed with [conda](https://anaconda.org). #### Required - [RMM](https://github.com/rapidsai/rmm) corresponding to RAFT version. +- [Thrust](https://github.com/NVIDIA/thrust) v1.17 / [CUB](https://github.com/NVIDIA/cub) #### Optional -- [Thrust](https://github.com/NVIDIA/thrust) v1.15 / [CUB](https://github.com/NVIDIA/cub) - On by default but can be disabled. - [cuCollections](https://github.com/NVIDIA/cuCollections) - Used in `raft::sparse::distance` API. - [Libcu++](https://github.com/NVIDIA/libcudacxx) v1.7.0 - [FAISS](https://github.com/facebookresearch/faiss) v1.7.0 - Used in `raft::spatial::knn` API and needed to build tests. @@ -46,7 +29,7 @@ C++ RAFT is a header-only library but provides the option of building shared lib The recommended way to build and install RAFT is to use the `build.sh` script in the root of the repository. This script can build both the C++ and Python artifacts and provides options for building and installing the headers, tests, benchmarks, and individual shared libraries. -### Header-only C++ +### Header-only C++ `build.sh` uses [rapids-cmake](https://github.com/rapidsai/rapids-cmake), which will automatically download any dependencies which are not already installed. It's important to note that while all the headers will be installed and available, some parts of the RAFT API depend on libraries like `FAISS`, which will need to be explicitly enabled in `build.sh`. 
@@ -55,7 +38,7 @@ The following example will download the needed dependencies and install the RAFT ./build.sh libraft --install ``` -### C++ Shared Libraries (optional) +### C++ Shared Libraries (optional) For larger projects which make heavy use of the pairwise distances or nearest neighbors APIs, shared libraries can be built to speed up compile times. These shared libraries can also significantly improve re-compile times both while developing RAFT and developing against the APIs. Build all of the available shared libraries by passing `--compile-libs` flag to `build.sh`: ```bash @@ -69,7 +52,7 @@ Individual shared libraries have their own flags and multiple can be used (thoug Add the `--install` flag to the above example to also install the shared libraries into `$INSTALL_PREFIX/lib`. -### `ccache` and `sccache` +### ccache and sccache `ccache` and `sccache` can be used to better cache parts of the build when rebuilding frequently, such as when working on a new feature. You can also use `ccache` or `sccache` with `build.sh`: @@ -77,7 +60,7 @@ Add the `--install` flag to the above example to also install the shared librari ./build.sh libraft --cache-tool=ccache ``` -### Tests +### Tests Compile the tests using the `tests` target in `build.sh`. @@ -104,7 +87,7 @@ It can take sometime to compile all of the tests. You can build individual tests ./build.sh libraft tests --limit-tests=NEIGHBORS_TEST;DISTANCE_TEST;MATRIX_TEST ``` -### Benchmarks +### Benchmarks The benchmarks are broken apart by algorithm category, so you will find several binaries in `cpp/build/` named `*_BENCH`. ```bash @@ -117,7 +100,7 @@ It can take sometime to compile all of the benchmarks. You can build individual ./build.sh libraft bench --limit-bench=NEIGHBORS_BENCH;DISTANCE_BENCH;LINALG_BENCH ``` -### C++ Using Cmake +### C++ Using Cmake Use `CMAKE_INSTALL_PREFIX` to install RAFT into a specific location. 
The snippet below will install it into the current conda environment: ```bash @@ -139,7 +122,6 @@ RAFT's cmake has the following configurable flags available:. | RAFT_COMPILE_NN_LIBRARY | ON, OFF | OFF | Compiles the `libraft-nn` shared library | | RAFT_COMPILE_DIST_LIBRARY | ON, OFF | OFF | Compiles the `libraft-distance` shared library | | RAFT_ENABLE_NN_DEPENDENCIES | ON, OFF | OFF | Searches for dependencies of nearest neighbors API, such as FAISS, and compiles them if not found. Needed for `raft::spatial::knn` | -| RAFT_ENABLE_thrust_DEPENDENCY | ON, OFF | ON | Enables the Thrust dependency. This can be disabled when using many simple utilities or to override with a different Thrust version. | | RAFT_USE_FAISS_STATIC | ON, OFF | OFF | Statically link FAISS into `libraft-nn` | | RAFT_STATIC_LINK_LIBRARIES | ON, OFF | ON | Build static link libraries instead of shared libraries | | DETECT_CONDA_ENV | ON, OFF | ON | Enable detection of conda environment for dependencies | @@ -150,7 +132,7 @@ RAFT's cmake has the following configurable flags available:. Currently, shared libraries are provided for the `libraft-nn` and `libraft-distance` components. The `libraft-nn` component depends upon [FAISS](https://github.com/facebookresearch/faiss) and the `RAFT_ENABLE_NN_DEPENDENCIES` option will build it from source if it is not already installed. -### Python +### Python Conda environment scripts are provided for installing the necessary dependencies for building and using the Python APIs. It is preferred to use `mamba`, as it provides significant speedup over `conda`. In addition you will have to manually install `nvcc` as it will not be installed as part of the conda environment. 
The following example will install create and install dependencies for a CUDA 11.5 conda environment: @@ -189,9 +171,9 @@ cd python/pylibraft py.test -s -v ``` -### Documentation +### Documentation -The documentation requires that the C++ headers and python packages have been built and installed. +The documentation requires that the C++ headers and python packages have been built and installed. The following will build the docs along with the C++ and Python packages: @@ -201,11 +183,11 @@ The following will build the docs along with the C++ and Python packages: -## Using RAFT in downstream projects +## Using RAFT in downstream projects There are two different strategies for including RAFT in downstream projects, depending on whether or not the required dependencies are already installed and available on the `lib` and `include` paths. -### C++ header-only integration using cmake +### C++ header-only integration using cmake When the needed [build dependencies](#required_depenencies) are already satisfied, RAFT can be trivially integrated into downstream projects by cloning the repository and adding `cpp/include` from RAFT to the include path: ```cmake @@ -222,7 +204,7 @@ set(RAFT_INCLUDE_DIR ${RAFT_GIT_DIR}/raft/cpp/include CACHE STRING "RAFT include If RAFT has already been installed, such as by using the `build.sh` script, use `find_package(raft)` and the `raft::raft` target if using RAFT to interact only with the public APIs of consuming projects. -### Using pre-compiled shared libraries +### Using pre-compiled shared libraries Use `find_package(raft COMPONENTS nn distance)` to enable the shared libraries and transitively pass dependencies through separate targets for each component. In this example, the `raft::distance` and `raft::nn` targets will be available for configuring linking paths in addition to `raft::raft`. These targets will also pass through any transitive dependencies (such as FAISS for the `nn` package). 
@@ -234,7 +216,7 @@ The following example tells the compiler to ignore the pre-compiled templates fo #include ``` -### Building RAFT C++ from source in cmake +### Building RAFT C++ from source in cmake RAFT uses the [RAPIDS-CMake](https://github.com/rapidsai/rapids-cmake) library so it can be more easily included into downstream projects. RAPIDS cmake provides a convenience layer around the [CMake Package Manager (CPM)](https://github.com/cpm-cmake/CPM.cmake). @@ -324,6 +306,6 @@ find_and_configure_raft(VERSION ${RAFT_VERSION}.00 If using the nearest neighbors APIs without the shared libraries, set `ENABLE_NN_DEPENDENCIES=ON` and keep `USE_NN_LIBRARY=OFF` -### Python/Cython Integration +### Python/Cython Integration Once installed, RAFT's Python library can be added to downstream conda recipes, imported and used directly. diff --git a/docs/source/index.rst b/docs/source/index.rst index fb7ce310c8..c46f08aac6 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -40,6 +40,7 @@ While not exhaustive, the following general categories help summarize the accele :caption: Contents: quick_start.md + build.md cpp_api.rst pylibraft_api.rst raft_dask_api.rst diff --git a/docs/source/quick_start.md b/docs/source/quick_start.md index a09f7beceb..e73f9b8a7a 100644 --- a/docs/source/quick_start.md +++ b/docs/source/quick_start.md @@ -1,11 +1,10 @@ # Quick Start - This guide is meant to provide a quick-start tutorial for interacting with RAFT's C++ APIs. ## RAPIDS Memory Manager (RMM) -RAFT relies heavily on RMM which eases the burden of configuring different allocation strategies globally across the libraries that use it. +RAFT relies heavily on the [RMM](https://github.com/rapidsai/rmm) library which eases the burden of configuring different allocation strategies globally across the libraries that use it. ## Multi-dimensional Spans and Arrays From 1ac3538f1a967401a173df07423d1b3aef71399b Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Sat, 22 Oct 2022 15:26:14 -0400 Subject: [PATCH 13/14] Separating out mdspan docs into their own section. Documenting more of the mdspan infra --- .../raft/core/detail/device_mdarray.hpp | 2 +- cpp/include/raft/core/device_mdspan.hpp | 6 +- .../{detail => }/host_device_accessor.hpp | 8 +- cpp/include/raft/core/host_mdspan.hpp | 4 +- cpp/include/raft/core/mdarray.hpp | 2 +- cpp/include/raft/core/mdspan.hpp | 2 +- docs/source/cpp_api.rst | 1 + docs/source/cpp_api/core.rst | 149 +------- docs/source/cpp_api/mdspan.rst | 344 ++++++++++++++++++ 9 files changed, 359 insertions(+), 159 deletions(-) rename cpp/include/raft/core/{detail => }/host_device_accessor.hpp (86%) create mode 100644 docs/source/cpp_api/mdspan.rst diff --git a/cpp/include/raft/core/detail/device_mdarray.hpp b/cpp/include/raft/core/detail/device_mdarray.hpp index ff7c31000d..ad6831794e 100644 --- a/cpp/include/raft/core/detail/device_mdarray.hpp +++ b/cpp/include/raft/core/detail/device_mdarray.hpp @@ -25,8 +25,8 @@ #include #include -#include #include // dynamic_extent +#include #include #include diff --git a/cpp/include/raft/core/device_mdspan.hpp b/cpp/include/raft/core/device_mdspan.hpp index 2fc43e2a05..ffbbe43d01 100644 --- a/cpp/include/raft/core/device_mdspan.hpp +++ b/cpp/include/raft/core/device_mdspan.hpp @@ -16,16 +16,16 @@ #pragma once -#include +#include #include namespace raft { template -using device_accessor = detail::host_device_accessor; +using device_accessor = host_device_accessor; template -using managed_accessor = detail::host_device_accessor; +using managed_accessor = host_device_accessor; /** * @brief std::experimental::mdspan with device tag to avoid accessing incorrect memory location. 
diff --git a/cpp/include/raft/core/detail/host_device_accessor.hpp b/cpp/include/raft/core/host_device_accessor.hpp similarity index 86% rename from cpp/include/raft/core/detail/host_device_accessor.hpp rename to cpp/include/raft/core/host_device_accessor.hpp index 3a71e6366b..4f6f559be4 100644 --- a/cpp/include/raft/core/detail/host_device_accessor.hpp +++ b/cpp/include/raft/core/host_device_accessor.hpp @@ -16,10 +16,12 @@ #pragma once -namespace raft::detail { +namespace raft { /** - * @brief A mixin to distinguish host and device memory. + * @brief A mixin to distinguish host and device memory. This is the primary + * accessor used throughout RAFT's APIs to denote whether an underlying pointer + * is accessible from device, host, or both. */ template struct host_device_accessor : public AccessorPolicy { @@ -36,4 +38,4 @@ struct host_device_accessor : public AccessorPolicy { host_device_accessor(AccessorPolicy const& that) : AccessorPolicy{that} {} // NOLINT }; -} // namespace raft::detail +} // namespace raft diff --git a/cpp/include/raft/core/host_mdspan.hpp b/cpp/include/raft/core/host_mdspan.hpp index fc2a9bbd6d..3e76dbb9ce 100644 --- a/cpp/include/raft/core/host_mdspan.hpp +++ b/cpp/include/raft/core/host_mdspan.hpp @@ -18,12 +18,12 @@ #include -#include +#include namespace raft { template -using host_accessor = detail::host_device_accessor; +using host_accessor = host_device_accessor; /** * @brief std::experimental::mdspan with host tag to avoid accessing incorrect memory location.
diff --git a/cpp/include/raft/core/mdarray.hpp b/cpp/include/raft/core/mdarray.hpp index 44730d901e..e425e0013f 100644 --- a/cpp/include/raft/core/mdarray.hpp +++ b/cpp/include/raft/core/mdarray.hpp @@ -24,8 +24,8 @@ #include -#include #include +#include #include #include #include diff --git a/cpp/include/raft/core/mdspan.hpp b/cpp/include/raft/core/mdspan.hpp index a858633e07..44011a33f4 100644 --- a/cpp/include/raft/core/mdspan.hpp +++ b/cpp/include/raft/core/mdspan.hpp @@ -18,9 +18,9 @@ #include #include -#include #include #include +#include #include diff --git a/docs/source/cpp_api.rst b/docs/source/cpp_api.rst index e3f650563d..cf3829422d 100644 --- a/docs/source/cpp_api.rst +++ b/docs/source/cpp_api.rst @@ -14,6 +14,7 @@ C++ API Reference cpp_api/distance.rst cpp_api/linalg.rst cpp_api/matrix.rst + cpp_api/mdspan.rst cpp_api/neighbors.rst cpp_api/solver.rst cpp_api/random.rst diff --git a/docs/source/cpp_api/core.rst b/docs/source/cpp_api/core.rst index f5914f5d52..9e4ef412f7 100644 --- a/docs/source/cpp_api/core.rst +++ b/docs/source/cpp_api/core.rst @@ -14,7 +14,7 @@ handle_t :members: -interruptible +Interruptible ############# .. doxygenclass:: raft::interruptible @@ -29,153 +29,6 @@ NVTX :members: -mdarray -####### - -.. doxygenclass:: raft::mdarray - :project: RAFT - :members: - -.. doxygenclass:: raft::make_device_matrix - :project: RAFT - -.. doxygenclass:: raft::make_device_vector - :project: RAFT - -.. doxygenclass:: raft::make_device_scalar - :project: RAFT - -.. doxygenclass:: raft::make_host_matrix - :project: RAFT - -.. doxygenclass:: raft::make_host_vector - :project: RAFT - -.. doxygenclass:: raft::make_device_scalar - :project: RAFT - - -mdspan -####### - -.. doxygenfunction:: raft::make_device_mdspan - :project: RAFT - -.. doxygenfunction:: raft::make_device_matrix_view - :project: RAFT - -.. doxygenfunction:: raft::make_device_vector_view - :project: RAFT - -.. doxygenfunction:: raft::make_device_scalar_view - :project: RAFT - -.. 
doxygenfunction:: raft::make_host_matrix_view - :project: RAFT - -.. doxygenfunction:: raft::make_host_vector_view - :project: RAFT - -.. doxygenfunction:: raft::make_device_scalar_view - :project: RAFT - -Device Factories ----------------- - -.. doxygenfunction:: raft::make_device_matrix - :project: RAFT - -.. doxygenfunction:: raft::make_device_vector - :project: RAFT - -.. doxygenfunction:: raft::make_device_scalar - :project: RAFT - -Host Factories ----------------- - -.. doxygenfunction:: raft::make_host_matrix - :project: RAFT - -.. doxygenfunction:: raft::make_host_vector - :project: RAFT - -.. doxygenfunction:: raft::make_device_scalar - :project: RAFT - - -mdspan -####### - -Device Vocabulary ------------------ - -.. doxygentypedef:: raft::device_mdspan - :project: RAFT - -.. doxygentypedef:: raft::device_matrix_view - :project: RAFT - -.. doxygentypedef:: raft::device_vector_view - :project: RAFT - -.. doxygentypedef:: raft::device_scalar_view - :project: RAFT - -Host Vocabulary ---------------- - -.. doxygentypedef:: raft::host_mdspan - :project: RAFT - -.. doxygentypedef:: raft::host_matrix_view - :project: RAFT - -.. doxygentypedef:: raft::host_vector_view - :project: RAFT - -.. doxygentypedef:: raft::host_scalar_view - :project: RAFT - -Device Factories ----------------- - -.. doxygenfunction:: raft::make_device_matrix_view - :project: RAFT - -.. doxygenfunction:: raft::make_device_vector_view - :project: RAFT - -.. doxygenfunction:: raft::make_device_scalar_view - :project: RAFT - -Host Factories --------------- - -.. doxygenfunction:: raft::make_host_matrix_view - :project: RAFT - -.. doxygenfunction:: raft::make_host_vector_view - :project: RAFT - -.. doxygenfunction:: raft::make_device_scalar_view - :project: RAFT - -span -#### - -.. doxygentypedef:: raft::device_span - :project: RAFT - -.. doxygentypedef:: raft::host_span - :project: RAFT - -.. 
doxygenclass:: raft::span - :project: RAFT - :members: - - - Key-Value Pair ############## diff --git a/docs/source/cpp_api/mdspan.rst b/docs/source/cpp_api/mdspan.rst new file mode 100644 index 0000000000..a283da967b --- /dev/null +++ b/docs/source/cpp_api/mdspan.rst @@ -0,0 +1,344 @@ +Multi-dimensional Span / Array +============================== + +This page provides C++ class references for RAFT's 1d span and multi-dimensional owning (mdarray) and non-owning (mdspan) APIs. These headers can be found in the `raft/core` directory. + +Representation +############## + +.. doxygenstruct:: raft::host_device_accessor + :project: RAFT + :members: + +.. doxygentypedef:: raft::host_accessor + :project: RAFT + +.. doxygentypedef:: raft::device_accessor + :project: RAFT + +.. doxygentypedef:: raft::managed_accessor + :project: RAFT + +.. doxygentypedef:: raft::row_major + :project: RAFT + +.. doxygentypedef:: raft::col_major + :project: RAFT + +.. doxygentypedef:: raft::matrix_extent + :project: RAFT + +.. doxygentypedef:: raft::vector_extent + :project: RAFT + +.. doxygentypedef:: raft::scalar_extent + :project: RAFT + +.. doxygentypedef:: raft::extent_3d + :project: RAFT + +.. doxygentypedef:: raft::extent_4d + :project: RAFT + +.. doxygentypedef:: raft::extent_5d + :project: RAFT + +.. doxygentypedef:: raft::dynamic_extent + :project: RAFT + +.. doxygentypedef:: raft::extents + :project: RAFT + +.. doxygenfunction:: raft::flatten + :project: RAFT + + +.. doxygenfunction:: raft::reshape + :project: RAFT + + +mdarray +####### + +.. doxygenclass:: raft::mdarray + :project: RAFT + :members: + +.. doxygenclass:: raft::array_interface + :project: RAFT + :members: + +.. doxygenstruct:: raft::is_array_interface + :project: RAFT + :members: + +.. doxygentypedef:: raft::is_array_interface_t + :project: RAFT + +Device Vocabulary +----------------- + +.. doxygentypedef:: raft::device_mdarray + :project: RAFT + + +.. doxygentypedef:: raft::device_matrix + :project: RAFT + +..
doxygentypedef:: raft::device_vector + :project: RAFT + +.. doxygentypedef:: raft::device_scalar + :project: RAFT + + +Device Factories +---------------- + +.. doxygenfunction:: raft::make_device_matrix + :project: RAFT + +.. doxygenfunction:: raft::make_device_vector + :project: RAFT + +.. doxygenfunction:: raft::make_device_scalar + :project: RAFT + + +Host Vocabulary +--------------- + +.. doxygentypedef:: raft::host_matrix + :project: RAFT + +.. doxygentypedef:: raft::host_vector + :project: RAFT + +.. doxygentypedef:: raft::host_scalar + :project: RAFT + + +Host Factories +-------------- + +.. doxygenfunction:: raft::make_host_matrix + :project: RAFT + +.. doxygenfunction:: raft::make_host_vector + :project: RAFT + +.. doxygenfunction:: raft::make_host_scalar + :project: RAFT + +mdspan +###### + +.. doxygentypedef:: raft::mdspan + :project: RAFT + +.. doxygenstruct:: raft::is_mdspan + :project: RAFT + :members: + +.. doxygentypedef:: raft::is_mdspan_t + :project: RAFT + +.. doxygenstruct:: raft::is_input_mdspan + :project: RAFT + :members: + +.. doxygentypedef:: raft::is_input_mdspan_t + :project: RAFT + +.. doxygenstruct:: raft::is_output_mdspan + :project: RAFT + :members: + +.. doxygentypedef:: raft::is_output_mdspan_t + :project: RAFT + +.. doxygentypedef:: raft::enable_if_mdspan + :project: RAFT + +.. doxygentypedef:: raft::enable_if_input_mdspan + :project: RAFT + +.. doxygentypedef:: raft::enable_if_output_mdspan + :project: RAFT + +.. doxygenfunction:: raft::make_mdspan + :project: RAFT + +.. doxygenfunction:: raft::make_extents + :project: RAFT + +.. doxygenfunction:: raft::unravel_index + :project: RAFT + + +Device Vocabulary +----------------- + +.. doxygentypedef:: raft::device_mdspan + :project: RAFT + +.. doxygenstruct:: raft::is_device_mdspan + :project: RAFT + +.. doxygenstruct:: raft::is_device_mdspan_t + :project: RAFT + +.. doxygenstruct:: raft::is_input_device_mdspan_t + :project: RAFT + +..
doxygenstruct:: raft::is_output_device_mdspan_t + :project: RAFT + +.. doxygentypedef:: raft::enable_if_device_mdspan + :project: RAFT + +.. doxygentypedef:: raft::enable_if_input_device_mdspan + :project: RAFT + +.. doxygentypedef:: raft::enable_if_output_device_mdspan + :project: RAFT + +.. doxygentypedef:: raft::device_matrix_view + :project: RAFT + +.. doxygentypedef:: raft::device_vector_view + :project: RAFT + +.. doxygentypedef:: raft::device_scalar_view + :project: RAFT + + +Device Factories +---------------- + +.. doxygenfunction:: raft::make_device_mdspan + :project: RAFT + +.. doxygenfunction:: raft::make_device_matrix_view + :project: RAFT + +.. doxygenfunction:: raft::make_device_vector_view + :project: RAFT + +.. doxygenfunction:: raft::make_device_scalar_view + :project: RAFT + + +Managed Vocabulary +------------------ + +.. doxygentypedef:: raft::managed_mdspan + :project: RAFT + +.. doxygenstruct:: raft::is_managed_mdspan + :project: RAFT + +.. doxygenstruct:: raft::is_managed_mdspan_t + :project: RAFT + +.. doxygenstruct:: raft::is_input_managed_mdspan_t + :project: RAFT + +.. doxygenstruct:: raft::is_output_managed_mdspan_t + :project: RAFT + +.. doxygentypedef:: raft::enable_if_managed_mdspan + :project: RAFT + +.. doxygentypedef:: raft::enable_if_input_managed_mdspan + :project: RAFT + +.. doxygentypedef:: raft::enable_if_output_managed_mdspan + :project: RAFT + +.. doxygentypedef:: raft::managed_matrix_view + :project: RAFT + +.. doxygentypedef:: raft::managed_vector_view + :project: RAFT + +.. doxygentypedef:: raft::managed_scalar_view + :project: RAFT + + +Managed Factories +----------------- + +.. doxygenfunction:: raft::make_managed_mdspan + :project: RAFT + +.. doxygenfunction:: raft::make_managed_matrix_view + :project: RAFT + +.. doxygenfunction:: raft::make_managed_vector_view + :project: RAFT + +.. doxygenfunction:: raft::make_managed_scalar_view + :project: RAFT + + +Host Vocabulary +--------------- + +..
doxygentypedef:: raft::host_mdspan + :project: RAFT + +.. doxygenstruct:: raft::is_host_mdspan + :project: RAFT + +.. doxygenstruct:: raft::is_host_mdspan_t + :project: RAFT + +.. doxygenstruct:: raft::is_input_host_mdspan_t + :project: RAFT + +.. doxygenstruct:: raft::is_output_host_mdspan_t + :project: RAFT + +.. doxygentypedef:: raft::enable_if_host_mdspan + :project: RAFT + +.. doxygentypedef:: raft::enable_if_input_host_mdspan + :project: RAFT + +.. doxygentypedef:: raft::enable_if_output_host_mdspan + :project: RAFT + +.. doxygentypedef:: raft::host_matrix_view + :project: RAFT + +.. doxygentypedef:: raft::host_vector_view + :project: RAFT + +.. doxygentypedef:: raft::host_scalar_view + :project: RAFT + +Host Factories +-------------- + +.. doxygenfunction:: raft::make_host_matrix_view + :project: RAFT + +.. doxygenfunction:: raft::make_host_vector_view + :project: RAFT + +.. doxygenfunction:: raft::make_host_scalar_view + :project: RAFT + +span +#### + +.. doxygentypedef:: raft::device_span + :project: RAFT + +.. doxygentypedef:: raft::host_span + :project: RAFT + +.. doxygenclass:: raft::span + :project: RAFT + :members: From 6d41d8a69d667e693a0bb10f6ad380d75e5fc4ab Mon Sep 17 00:00:00 2001 From: "Corey J.
Nolet" Date: Sat, 22 Oct 2022 15:28:45 -0400 Subject: [PATCH 14/14] Removing last references to detail::host_device_accessor (it's been moved to the public) --- cpp/include/raft/core/mdarray.hpp | 13 ++++++------- cpp/include/raft/core/mdspan.hpp | 7 +++---- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/cpp/include/raft/core/mdarray.hpp b/cpp/include/raft/core/mdarray.hpp index e425e0013f..ae5d236395 100644 --- a/cpp/include/raft/core/mdarray.hpp +++ b/cpp/include/raft/core/mdarray.hpp @@ -154,13 +154,12 @@ class mdarray std::conditional_t, typename container_policy_type::const_accessor_policy, typename container_policy_type::accessor_policy>> - using view_type_impl = - mdspan>; + using view_type_impl = mdspan>; public: /** diff --git a/cpp/include/raft/core/mdspan.hpp b/cpp/include/raft/core/mdspan.hpp index 44011a33f4..1b98a7a937 100644 --- a/cpp/include/raft/core/mdspan.hpp +++ b/cpp/include/raft/core/mdspan.hpp @@ -149,10 +149,9 @@ template auto make_mdspan(ElementType* ptr, extents exts) { - using accessor_type = - detail::host_device_accessor, - is_host_accessible, - is_device_accessible>; + using accessor_type = host_device_accessor, + is_host_accessible, + is_device_accessible>; return mdspan{ptr, exts}; }