Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement raft::stats API with mdspan #802

Merged
merged 50 commits into from
Oct 7, 2022
Merged
Show file tree
Hide file tree
Changes from 37 commits
Commits
Show all changes
50 commits
Select commit Hold shift + click to select a range
853d984
Add mdspan for cov and mean_center
lowener Jul 25, 2022
2b5218c
Change only public API
lowener Jul 25, 2022
9c97e01
Add accuracy, randIndex, completeness and contingency
lowener Jul 25, 2022
426c201
Merge branch 'branch-22.10' into 22.10-stats-api
lowener Aug 11, 2022
1591389
Update meanvar
lowener Aug 11, 2022
a6cdb62
Start using vanilla mdspan
lowener Aug 11, 2022
16c1d03
Add vanilla mdspan for stats public api
lowener Aug 15, 2022
3b262cf
Fix comments, start adding tests
lowener Aug 18, 2022
b199ef9
Merge branch 'branch-22.10' into 22.10-stats-api
lowener Aug 31, 2022
21f374a
Remove constness, add tests
lowener Sep 3, 2022
82a9bd7
Add remaining tests, fix style
lowener Sep 5, 2022
0d11ec7
Merge branch 'branch-22.10' into 22.10-stats-api
lowener Sep 5, 2022
7ff8a2f
Using device_*_view instead of vanilla mdspan
lowener Sep 20, 2022
1e52d85
Template fix, add static_assert and fix tests
lowener Sep 21, 2022
37894d7
Add optional argument to contingency matrix
lowener Sep 21, 2022
8e95d2f
Prefer extent over size, change workspace type of contingency
lowener Sep 21, 2022
b85044c
Merge branch 'branch-22.10' into 22.10-stats-api
lowener Sep 22, 2022
e9a929c
Add device_mdspan include. Fix parameter order
lowener Sep 22, 2022
bbdf6dd
Fix copyright
lowener Sep 22, 2022
55c0c91
Fix tests
lowener Sep 23, 2022
b5cf18b
Update remaining stats function and their tests with mdspan
lowener Sep 26, 2022
ef94359
Use snake case for variables, parameters and templates
lowener Sep 27, 2022
0479c11
fix style
lowener Sep 27, 2022
f8ae9e1
Remove workspace from public api
lowener Sep 28, 2022
093dc4c
Add [in] [out] to parameter documentation
lowener Sep 28, 2022
2a4b5b8
Adding const specifier when possible
lowener Sep 28, 2022
669163e
Merge branch 'branch-22.10' into 22.10-stats-api
lowener Sep 29, 2022
00448a1
Remove default template, rename dispersion, fix silhouette_score
lowener Oct 3, 2022
36f066f
Fix silhouette test file
lowener Oct 3, 2022
39cc643
Add overload for std::nullopt
lowener Oct 3, 2022
2d4d285
Merge branch 'branch-22.10' into 22.10-stats-api
lowener Oct 3, 2022
2a97ef2
Add cluster dispersion definition
lowener Oct 3, 2022
461eee5
Fix bcast_along_rows
lowener Oct 4, 2022
8b9d840
Merge branch 'branch-22.10' into 22.10-stats-api
lowener Oct 4, 2022
9c4adf4
mean_center and weighted_mean correction for along_rows parameter
lowener Oct 5, 2022
29d360e
Updating row weighted mean
cjnolet Oct 5, 2022
333e596
iRemoving weighted mean mdspanification for now.
cjnolet Oct 5, 2022
b489e3b
Updating weighted mean test.
cjnolet Oct 6, 2022
a78d38a
Weighted mean
cjnolet Oct 6, 2022
c3d11a3
Skipping re-install of raft-dask for docs build
cjnolet Oct 6, 2022
5a7307f
Adding missing semicolon
cjnolet Oct 6, 2022
a7790e3
Adding pylibraft to docs build.
cjnolet Oct 6, 2022
374c91c
Reverting changes to build.sh
cjnolet Oct 6, 2022
be1baa1
Merge branch 'branch-22.10' into 22.10-stats-api
cjnolet Oct 6, 2022
faf86cc
Merge branch 'branch-22.10' into 22.10-stats-api
cjnolet Oct 6, 2022
72d6c80
enabling verbose logging in build.sh for docs
cjnolet Oct 7, 2022
ff2b9b0
Removing --build from cmake --build
cjnolet Oct 7, 2022
db02b2d
Merge branch 'fea-2212-increase_docs_build_logging' into 22.10-stats-api
cjnolet Oct 7, 2022
dd89e81
Fixing doxygen build
cjnolet Oct 7, 2022
4a3ad93
Fixing style
cjnolet Oct 7, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions cpp/include/raft/stats/accuracy.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

#pragma once

#include <raft/core/device_mdspan.hpp>
#include <raft/stats/detail/scores.cuh>

namespace raft {
Expand All @@ -39,6 +40,29 @@ float accuracy(const math_t* predictions, const math_t* ref_predictions, int n,
return detail::accuracy_score(predictions, ref_predictions, n, stream);
}

/**
 * @brief Compute accuracy of predictions against reference labels. Useful for classification.
 * @tparam value_t element type of both label vectors (e.g., int for classification)
 * @tparam idx_t index type of the vector extents
 * @param[in] handle the raft handle; its stream is handed to the implementation
 * @param[in] predictions device vector view holding the predictions
 * @param[in] ref_predictions device vector view holding the reference (ground-truth) predictions
 * @return accuracy score in [0, 1]; higher is better
 */
template <typename value_t, typename idx_t>
float accuracy(const raft::handle_t& handle,
               raft::device_vector_view<const value_t, idx_t> predictions,
               raft::device_vector_view<const value_t, idx_t> ref_predictions)
{
  // The implementation only receives two raw pointers and one length, so both
  // views have to agree on their size and be densely packed.
  RAFT_EXPECTS(predictions.size() == ref_predictions.size(), "Size mismatch");
  RAFT_EXPECTS(predictions.is_exhaustive(), "predictions must be contiguous");
  RAFT_EXPECTS(ref_predictions.is_exhaustive(), "ref_predictions must be contiguous");

  const auto* predicted = predictions.data_handle();
  const auto* reference = ref_predictions.data_handle();
  const auto n_samples  = predictions.extent(0);

  return detail::accuracy_score(predicted, reference, n_samples, handle.get_stream());
}
} // namespace stats
} // namespace raft

Expand Down
27 changes: 27 additions & 0 deletions cpp/include/raft/stats/adjusted_rand_index.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

#pragma once

#include <raft/core/device_mdspan.hpp>
#include <raft/stats/detail/adjusted_rand_index.cuh>

namespace raft {
Expand All @@ -48,6 +49,32 @@ double adjusted_rand_index(const T* firstClusterArray,
return detail::compute_adjusted_rand_index(firstClusterArray, secondClusterArray, size, stream);
}

/**
 * @brief Calculate the Adjusted RandIndex of two labelings, as described
 *        <a href="https://en.wikipedia.org/wiki/Rand_index">here</a>
 * @tparam value_t data-type of the input label arrays
 * @tparam math_t integral data-type used for computing n-choose-r
 * @tparam idx_t index type of the vector extents
 * @param[in] handle the raft handle; its stream is handed to the implementation
 * @param[in] first_cluster_array device vector view with the first array of classes
 * @param[in] second_cluster_array device vector view with the second array of classes
 * @return the Adjusted RandIndex
 */
template <typename value_t, typename math_t, typename idx_t>
double adjusted_rand_index(const raft::handle_t& handle,
                           raft::device_vector_view<const value_t, idx_t> first_cluster_array,
                           raft::device_vector_view<const value_t, idx_t> second_cluster_array)
{
  // Only raw pointers and a single length reach the implementation, hence the
  // matching-size and contiguity requirements on both views.
  RAFT_EXPECTS(first_cluster_array.size() == second_cluster_array.size(), "Size mismatch");
  RAFT_EXPECTS(first_cluster_array.is_exhaustive(), "first_cluster_array must be contiguous");
  RAFT_EXPECTS(second_cluster_array.is_exhaustive(), "second_cluster_array must be contiguous");

  const auto* first_labels  = first_cluster_array.data_handle();
  const auto* second_labels = second_cluster_array.data_handle();
  const auto n_labels       = first_cluster_array.extent(0);

  return detail::compute_adjusted_rand_index<value_t, math_t>(
    first_labels, second_labels, n_labels, handle.get_stream());
}

}; // end namespace stats
}; // end namespace raft

Expand Down
41 changes: 36 additions & 5 deletions cpp/include/raft/stats/completeness_score.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

#pragma once

#include <raft/core/device_mdspan.hpp>
#include <raft/stats/detail/homogeneity_score.cuh>

namespace raft {
Expand All @@ -30,20 +31,50 @@ namespace stats {
* @param truthClusterArray: the array of truth classes of type T
* @param predClusterArray: the array of predicted classes of type T
* @param size: the size of the data points of type int
* @param lowerLabelRange: the lower bound of the range of labels
* @param upperLabelRange: the upper bound of the range of labels
* @param lower_label_range: the lower bound of the range of labels
* @param upper_label_range: the upper bound of the range of labels
* @param stream: the cudaStream object
*/
template <typename T>
double completeness_score(const T* truthClusterArray,
const T* predClusterArray,
int size,
T lowerLabelRange,
T upperLabelRange,
T lower_label_range,
T upper_label_range,
cudaStream_t stream)
{
return detail::homogeneity_score(
predClusterArray, truthClusterArray, size, lowerLabelRange, upperLabelRange, stream);
predClusterArray, truthClusterArray, size, lower_label_range, upper_label_range, stream);
}

/**
 * @brief Calculate the completeness score between two clusterings
 *
 * @tparam value_t the label data type
 * @tparam idx_t index type of the vector extents
 * @param[in] handle the raft handle; its stream is handed to the implementation
 * @param[in] truth_cluster_array device vector view of truth classes of type value_t
 * @param[in] pred_cluster_array device vector view of predicted classes of type value_t
 * @param[in] lower_label_range the lower bound of the range of labels
 * @param[in] upper_label_range the upper bound of the range of labels
 * @return the cluster completeness score
 */
template <typename value_t, typename idx_t>
double completeness_score(const raft::handle_t& handle,
                          raft::device_vector_view<const value_t, idx_t> truth_cluster_array,
                          raft::device_vector_view<const value_t, idx_t> pred_cluster_array,
                          value_t lower_label_range,
                          value_t upper_label_range)
{
  RAFT_EXPECTS(truth_cluster_array.size() == pred_cluster_array.size(), "Size mismatch");
  RAFT_EXPECTS(truth_cluster_array.is_exhaustive(), "truth_cluster_array must be contiguous");
  RAFT_EXPECTS(pred_cluster_array.is_exhaustive(), "pred_cluster_array must be contiguous");

  const auto* predicted = pred_cluster_array.data_handle();
  const auto* truth     = truth_cluster_array.data_handle();
  const auto n_samples  = truth_cluster_array.extent(0);

  // The score is obtained from the homogeneity implementation with the
  // prediction array passed first and the truth array second, exactly as the
  // pointer-based completeness_score overload does.
  return detail::homogeneity_score(
    predicted, truth, n_samples, lower_label_range, upper_label_range, handle.get_stream());
}

}; // end namespace stats
Expand Down
107 changes: 106 additions & 1 deletion cpp/include/raft/stats/contingency_matrix.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@

#pragma once

#include <raft/core/device_mdarray.hpp>
lowener marked this conversation as resolved.
Show resolved Hide resolved
#include <raft/core/device_mdspan.hpp>
#include <raft/core/handle.hpp>
#include <raft/core/host_mdspan.hpp>
#include <raft/stats/detail/contingencyMatrix.cuh>

namespace raft {
Expand All @@ -40,6 +44,31 @@ void getInputClassCardinality(
detail::getInputClassCardinality(groundTruth, nSamples, stream, minLabel, maxLabel);
}

/**
 * @brief Compute the min and max label of the input array. Use the result to
 *        allocate the contingency output matrix:
 *        size of matrix = (maxLabel - minLabel + 1)^2 * sizeof(int)
 * @tparam value_t label type
 * @tparam idx_t index type of the vector extent
 * @param[in] handle the raft handle; its stream is handed to the implementation
 * @param[in] groundTruth device 1-d array for ground truth (num of rows)
 * @param[out] minLabel host scalar receiving the calculated min value in the input array
 * @param[out] maxLabel host scalar receiving the calculated max value in the input array
 */
template <typename value_t, typename idx_t>
void get_input_class_cardinality(const raft::handle_t& handle,
                                 raft::device_vector_view<const value_t, idx_t> groundTruth,
                                 raft::host_scalar_view<value_t> minLabel,
                                 raft::host_scalar_view<value_t> maxLabel)
{
  // Both outputs are dereferenced below, so reject views that wrap a null pointer.
  auto* const min_out = minLabel.data_handle();
  auto* const max_out = maxLabel.data_handle();
  RAFT_EXPECTS(min_out != nullptr, "Invalid minLabel pointer");
  RAFT_EXPECTS(max_out != nullptr, "Invalid maxLabel pointer");

  detail::getInputClassCardinality(
    groundTruth.data_handle(), groundTruth.extent(0), handle.get_stream(), *min_out, *max_out);
}

/**
* @brief Calculate workspace size for running contingency matrix calculations
* @tparam T label type
Expand Down Expand Up @@ -71,7 +100,7 @@ size_t getContingencyMatrixWorkspaceSize(int nSamples,
* @param groundTruth: device 1-d array for ground truth (num of rows)
* @param predictedLabel: device 1-d array for prediction (num of columns)
* @param nSamples: number of elements in input array
* @param outMat: output buffer for contingecy matrix
* @param outMat: output buffer for contingency matrix
* @param stream: cuda stream for execution
* @param workspace: Optional, workspace memory allocation
* @param workspaceSize: Optional, size of workspace memory
Expand Down Expand Up @@ -100,6 +129,82 @@ void contingencyMatrix(const T* groundTruth,
maxLabel);
}

/**
 * @brief Construct the contingency matrix for the given ground truth and
 *        prediction labels. The scratch workspace is sized and allocated
 *        inside this function; the caller only provides the output matrix.
 * @tparam value_t label type
 * @tparam out_t output matrix type
 * @tparam idx_t index type of the vector/matrix extents
 * @tparam layout_t layout type of the output matrix
 * @param[in] handle the raft handle; used for the stream and the workspace allocation
 * @param[in] ground_truth device 1-d array for ground truth (num of rows)
 * @param[in] predicted_label device 1-d array for prediction (num of columns)
 * @param[out] out_mat output buffer for the contingency matrix
 * @param[in] min_label Optional, min value in input ground truth array
 * @param[in] max_label Optional, max value in input ground truth array
 */
template <typename value_t, typename out_t, typename idx_t, typename layout_t>
void contingency_matrix(const raft::handle_t& handle,
                        raft::device_vector_view<const value_t, idx_t> ground_truth,
                        raft::device_vector_view<const value_t, idx_t> predicted_label,
                        raft::device_matrix_view<out_t, idx_t, layout_t> out_mat,
                        std::optional<value_t> min_label = std::nullopt,
                        std::optional<value_t> max_label = std::nullopt)
{
  RAFT_EXPECTS(ground_truth.size() == predicted_label.size(), "Size mismatch");
  RAFT_EXPECTS(ground_truth.is_exhaustive(), "ground_truth must be contiguous");
  RAFT_EXPECTS(predicted_label.is_exhaustive(), "predicted_label must be contiguous");
  RAFT_EXPECTS(out_mat.is_exhaustive(), "out_mat must be contiguous");

  // A bound the caller did not supply is forwarded as numeric_limits::max().
  // NOTE(review): presumably the detail layer treats that value as "derive the
  // range from the data" -- confirm against detail::contingencyMatrix.
  const value_t unset_label = std::numeric_limits<value_t>::max();
  value_t min_label_value   = min_label.value_or(unset_label);
  value_t max_label_value   = max_label.value_or(unset_label);

  const auto n_samples  = ground_truth.extent(0);
  const auto* truth_ptr = ground_truth.data_handle();
  auto stream           = handle.get_stream();

  // Ask the implementation how much scratch space it needs, then allocate it
  // as a byte buffer that lives until this function returns.
  auto workspace_sz = detail::getContingencyMatrixWorkspaceSize(
    n_samples, truth_ptr, stream, min_label_value, max_label_value);
  auto workspace = raft::make_device_vector<char>(handle, workspace_sz);

  detail::contingencyMatrix<value_t, out_t>(truth_ptr,
                                            predicted_label.data_handle(),
                                            n_samples,
                                            out_mat.data_handle(),
                                            stream,
                                            workspace.data_handle(),
                                            workspace_sz,
                                            min_label_value,
                                            max_label_value);
}

/**
 * @brief Convenience overload of `contingency_matrix` that accepts anything
 *        implicitly convertible to `std::optional<value_t>` for the label
 *        bounds, so that callers can pass `std::nullopt` directly.
 *
 * The two bounds are converted to `std::optional<value_t>` and forwarded to
 * the `contingency_matrix` overload taking `std::optional<value_t>`; see that
 * overload for the full parameter documentation.
 */
template <typename value_t,
          typename out_t,
          typename idx_t,
          typename layout_t,
          typename opt_min_label_t,
          typename opt_max_label_t>
void contingency_matrix(const raft::handle_t& handle,
                        raft::device_vector_view<const value_t, idx_t> ground_truth,
                        raft::device_vector_view<const value_t, idx_t> predicted_label,
                        raft::device_matrix_view<out_t, idx_t, layout_t> out_mat,
                        opt_min_label_t&& min_label = std::nullopt,
                        opt_max_label_t&& max_label = std::nullopt)
{
  // Materialize both bounds with the exact optional type first, so the call
  // below matches the std::optional<value_t> overload.
  std::optional<value_t> lower_bound = std::forward<opt_min_label_t>(min_label);
  std::optional<value_t> upper_bound = std::forward<opt_max_label_t>(max_label);

  contingency_matrix(handle, ground_truth, predicted_label, out_mat, lower_bound, upper_bound);
}
}; // namespace stats
}; // namespace raft

Expand Down
50 changes: 50 additions & 0 deletions cpp/include/raft/stats/cov.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

#pragma once

#include <raft/core/device_mdspan.hpp>
#include <raft/stats/detail/cov.cuh>
namespace raft {
namespace stats {
Expand Down Expand Up @@ -57,6 +58,55 @@ void cov(const raft::handle_t& handle,
{
detail::cov(handle, covar, data, mu, D, N, sample, rowMajor, stable, stream);
}

/**
 * @brief Compute covariance of the input matrix
 *
 * Mean operation is assumed to be performed on a given column.
 *
 * @tparam value_t the data type
 * @tparam idx_t the index type
 * @tparam layout_t Layout type of the input data (raft::row_major or raft::col_major).
 * @param[in] handle the raft handle
 * @param[in,out] data the input matrix (this will get mean-centered at the end!)
 *    (length = nrows * ncols)
 * @param[in] mu mean vector of the input matrix (length = ncols)
 * @param[out] covar the output covariance matrix (length = ncols * ncols)
 * @param[in] sample whether to evaluate sample covariance or not. In other words,
 *    whether to normalize the output using N-1 or N, for true or false,
 *    respectively
 * @param[in] stable whether to run the slower-but-numerically-stable version or not
 * @note if stable=true, then the input data will be mean centered after this
 *    function returns!
 */
template <typename value_t, typename idx_t, typename layout_t>
void cov(const raft::handle_t& handle,
         raft::device_matrix_view<value_t, idx_t, layout_t> data,
         raft::device_vector_view<const value_t, idx_t> mu,
         raft::device_matrix_view<value_t, idx_t, layout_t> covar,
         bool sample,
         bool stable)
{
  static_assert(
    std::is_same_v<layout_t, raft::row_major> || std::is_same_v<layout_t, raft::col_major>,
    "Data layout not supported");
  RAFT_EXPECTS(data.extent(1) == covar.extent(0) && data.extent(1) == covar.extent(1),
               "Size mismatch");
  RAFT_EXPECTS(data.is_exhaustive(), "data must be contiguous");
  RAFT_EXPECTS(covar.is_exhaustive(), "covar must be contiguous");
  RAFT_EXPECTS(mu.is_exhaustive(), "mu must be contiguous");

  constexpr bool is_row_major = std::is_same_v<layout_t, raft::row_major>;

  // The pointer-based cov overload forwards its flags to detail::cov in the
  // order (sample, rowMajor, stable). Both flags are plain bools, so passing
  // the layout flag first would compile but silently exchange `sample` and the
  // layout; keep the order aligned with that overload.
  detail::cov(handle,
              covar.data_handle(),
              data.data_handle(),
              mu.data_handle(),
              data.extent(1),  // D: number of columns
              data.extent(0),  // N: number of rows
              sample,
              is_row_major,
              stable,
              handle.get_stream());
}
}; // end namespace stats
}; // end namespace raft

Expand Down
14 changes: 7 additions & 7 deletions cpp/include/raft/stats/detail/batched/silhouette_score.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ __global__ void compute_chunked_a_b_kernel(value_t* a,

template <typename value_idx, typename label_idx>
rmm::device_uvector<value_idx> get_cluster_counts(const raft::handle_t& handle,
label_idx* y,
const label_idx* y,
value_idx& n_rows,
label_idx& n_labels)
{
Expand All @@ -129,8 +129,8 @@ rmm::device_uvector<value_idx> get_cluster_counts(const raft::handle_t& handle,

template <typename value_t, typename value_idx>
rmm::device_uvector<value_t> get_pairwise_distance(const raft::handle_t& handle,
value_t* left_begin,
value_t* right_begin,
const value_t* left_begin,
const value_t* right_begin,
value_idx& n_left_rows,
value_idx& n_right_rows,
value_idx& n_cols,
Expand Down Expand Up @@ -170,10 +170,10 @@ void compute_chunked_a_b(const raft::handle_t& handle,
template <typename value_t, typename value_idx, typename label_idx>
value_t silhouette_score(
const raft::handle_t& handle,
value_t* X,
const value_t* X,
value_idx n_rows,
value_idx n_cols,
label_idx* y,
const label_idx* y,
label_idx n_labels,
value_t* scores,
value_idx chunk,
Expand Down Expand Up @@ -221,8 +221,8 @@ value_t silhouette_score(

auto chunk_stream = handle.get_next_usable_stream(i + chunk * j);

auto* left_begin = X + (i * n_cols);
auto* right_begin = X + (j * n_cols);
const auto* left_begin = X + (i * n_cols);
const auto* right_begin = X + (j * n_cols);

auto n_left_rows = (i + chunk) < n_rows ? chunk : (n_rows - i);
auto n_right_rows = (j + chunk) < n_rows ? chunk : (n_rows - j);
Expand Down
2 changes: 1 addition & 1 deletion cpp/include/raft/stats/detail/histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -465,7 +465,7 @@ HistType selectBestHistAlgo(IdxT nbins)
* @param nbins number of bins
* @param data input data (length = ncols * nrows)
* @param nrows data array length in each column (or batch)
* @param ncols number of columsn (or batch size)
* @param ncols number of columns (or batch size)
* @param stream cuda stream
* @param binner the operation that computes the bin index of the input data
*
Expand Down
Loading