diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index cddfa4b38d..5fe02ec794 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -323,6 +323,10 @@ if(RAFT_COMPILE_LIBRARY) src/neighbors/detail/selection_faiss_size_t_double.cu src/neighbors/detail/selection_faiss_size_t_float.cu src/neighbors/detail/selection_faiss_uint32_t_float.cu + src/neighbors/detail/selection_faiss_int64_t_double.cu + src/neighbors/detail/selection_faiss_int64_t_half.cu + src/neighbors/detail/selection_faiss_uint32_t_double.cu + src/neighbors/detail/selection_faiss_uint32_t_half.cu src/neighbors/ivf_flat_build_float_int64_t.cu src/neighbors/ivf_flat_build_int8_t_int64_t.cu src/neighbors/ivf_flat_build_uint8_t_int64_t.cu diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt index 505ca32886..c90886841b 100644 --- a/cpp/bench/prims/CMakeLists.txt +++ b/cpp/bench/prims/CMakeLists.txt @@ -116,8 +116,16 @@ if(BUILD_PRIMS_BENCH) ) ConfigureBench( - NAME MATRIX_BENCH PATH bench/prims/matrix/argmin.cu bench/prims/matrix/gather.cu - bench/prims/matrix/select_k.cu bench/prims/main.cpp OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY + NAME + MATRIX_BENCH + PATH + bench/prims/matrix/argmin.cu + bench/prims/matrix/gather.cu + bench/prims/matrix/select_k.cu + bench/prims/matrix/main.cpp + OPTIONAL + LIB + EXPLICIT_INSTANTIATE_ONLY ) ConfigureBench( diff --git a/cpp/bench/prims/matrix/main.cpp b/cpp/bench/prims/matrix/main.cpp new file mode 100644 index 0000000000..9cdb1c2546 --- /dev/null +++ b/cpp/bench/prims/matrix/main.cpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +namespace raft::matrix { +void add_select_k_dataset_benchmarks(); +} + +int main(int argc, char** argv) +{ + // if we're passed a 'select_k_dataset' flag, add in extra benchmarks + for (int i = 1; i < argc; ++i) { + if (strcmp(argv[i], "--select_k_dataset") == 0) { + raft::matrix::add_select_k_dataset_benchmarks(); + + // pop off the cmdline argument from argc/argv + for (int j = i; j < argc - 1; ++j) + argv[j] = argv[j + 1]; + argc--; + break; + } + } + benchmark::Initialize(&argc, argv); + if (::benchmark::ReportUnrecognizedArguments(argc, argv)) return 1; + benchmark::RunSpecifiedBenchmarks(); +} diff --git a/cpp/bench/prims/matrix/select_k.cu b/cpp/bench/prims/matrix/select_k.cu index d0bc993cc1..22ec998f4f 100644 --- a/cpp/bench/prims/matrix/select_k.cu +++ b/cpp/bench/prims/matrix/select_k.cu @@ -36,7 +36,6 @@ #include namespace raft::matrix { - using namespace raft::bench; // NOLINT template @@ -72,17 +71,16 @@ struct selection : public fixture { void run_benchmark(::benchmark::State& state) override // NOLINT { - device_resources handle{stream}; try { std::ostringstream label_stream; label_stream << params_.batch_size << "#" << params_.len << "#" << params_.k; if (params_.use_same_leading_bits) { label_stream << "#same-leading-bits"; } state.SetLabel(label_stream.str()); - loop_on_state(state, [this, &handle]() { + loop_on_state(state, [this]() { select::select_k_impl(handle, Algo, in_dists_.data(), - in_ids_.data(), + params_.use_index_input ? in_ids_.data() : NULL, params_.batch_size, params_.len, params_.k, @@ -182,4 +180,91 @@ SELECTION_REGISTER(double, int64_t, kWarpFiltered); // NOLINT SELECTION_REGISTER(double, int64_t, kWarpDistributed); // NOLINT SELECTION_REGISTER(double, int64_t, kWarpDistributedShm); // NOLINT +// For learning a heuristic of which selection algorithm to use, we +// have a couple of additional constraints when generating the dataset: +// 1. We want these benchmarks to be optionally enabled from the commandline - +// there are thousands of them, and the run-time is non-trivial. This should be opt-in only +// 2. We test out larger k values - that won't work for all algorithms. This requires filtering +// the input parameters per algorithm. +// This makes the code to generate this dataset different from the code above to +// register other benchmarks +#define SELECTION_REGISTER_ALGO_INPUT(KeyT, IdxT, A, input) \ + { \ + using SelectK = selection; \ + std::stringstream name; \ + name << "SelectKDataset/" << #KeyT "/" #IdxT "/" #A << "/" << input.batch_size << "/" \ + << input.len << "/" << input.k << "/" << input.use_index_input; \ + auto* b = ::benchmark::internal::RegisterBenchmarkInternal( \ + new raft::bench::internal::Fixture(name.str(), input)); \ + b->UseManualTime(); \ + b->Unit(benchmark::kMillisecond); \ + } + +const static size_t MAX_MEMORY = 16 * 1024 * 1024 * 1024ULL; + +// registers the input for all algorithms +#define SELECTION_REGISTER_INPUT(KeyT, IdxT, input) \ + { \ + size_t mem = input.batch_size * input.len * (sizeof(KeyT) + sizeof(IdxT)); \ + if (mem < MAX_MEMORY) { \ + SELECTION_REGISTER_ALGO_INPUT(KeyT, IdxT, kRadix8bits, input) \ + SELECTION_REGISTER_ALGO_INPUT(KeyT, IdxT, kRadix11bits, input) \ + SELECTION_REGISTER_ALGO_INPUT(KeyT, IdxT, kRadix11bitsExtraPass, input) \ + if (input.k <= raft::matrix::detail::select::warpsort::kMaxCapacity) { \ + SELECTION_REGISTER_ALGO_INPUT(KeyT, IdxT, kWarpImmediate, input) \ + SELECTION_REGISTER_ALGO_INPUT(KeyT, IdxT, kWarpFiltered, input) \ + SELECTION_REGISTER_ALGO_INPUT(KeyT, IdxT, kWarpDistributed, input) \ + SELECTION_REGISTER_ALGO_INPUT(KeyT, IdxT, kWarpDistributedShm, input) \ + } \ + if (input.k <= raft::neighbors::detail::kFaissMaxK()) { \ + SELECTION_REGISTER_ALGO_INPUT(KeyT, IdxT, kFaissBlockSelect, input) \ + } \ + } \ + } + +void add_select_k_dataset_benchmarks() +{ + // define a uniform grid + std::vector inputs; + + size_t grid_increment = 1; + std::vector k_vals; + for (size_t k = 0; k < 13; k += grid_increment) { + k_vals.push_back(1 << k); + } + // Add in values just past the limit for warp/faiss select + k_vals.push_back(257); + k_vals.push_back(2049); + + const static bool select_min = true; + const static bool use_ids = false; + + for (size_t row = 0; row < 13; row += grid_increment) { + for (size_t col = 10; col < 28; col += grid_increment) { + for (auto k : k_vals) { + inputs.push_back( + select::params{size_t(1 << row), size_t(1 << col), k, select_min, use_ids}); + } + } + } + + // also add in some random values + std::default_random_engine rng(42); + std::uniform_real_distribution<> row_dist(0, 13); + std::uniform_real_distribution<> col_dist(10, 28); + std::uniform_real_distribution<> k_dist(0, 13); + for (size_t i = 0; i < 1024; ++i) { + auto row = static_cast(pow(2, row_dist(rng))); + auto col = static_cast(pow(2, col_dist(rng))); + auto k = static_cast(pow(2, k_dist(rng))); + inputs.push_back(select::params{row, col, k, select_min, use_ids}); + } + + for (auto& input : inputs) { + SELECTION_REGISTER_INPUT(double, int64_t, input); + SELECTION_REGISTER_INPUT(double, uint32_t, input); + SELECTION_REGISTER_INPUT(float, int64_t, input); + SELECTION_REGISTER_INPUT(float, uint32_t, input); + } +} } // namespace raft::matrix diff --git a/cpp/include/raft/neighbors/detail/selection_faiss-ext.cuh b/cpp/include/raft/neighbors/detail/selection_faiss-ext.cuh index 8636ee9596..c000a4810b 100644 --- a/cpp/include/raft/neighbors/detail/selection_faiss-ext.cuh +++ b/cpp/include/raft/neighbors/detail/selection_faiss-ext.cuh @@ -18,6 +18,7 @@ #include // size_t #include // uint32_t +#include // __half #include // kFaissMaxK #include // RAFT_EXPLICIT @@ -58,4 +59,9 @@ instantiate_raft_neighbors_detail_select_k(size_t, double); instantiate_raft_neighbors_detail_select_k(int, double); instantiate_raft_neighbors_detail_select_k(size_t, float); +instantiate_raft_neighbors_detail_select_k(uint32_t, double); +instantiate_raft_neighbors_detail_select_k(int64_t, double); +instantiate_raft_neighbors_detail_select_k(uint32_t, __half); +instantiate_raft_neighbors_detail_select_k(int64_t, __half); + #undef instantiate_raft_neighbors_detail_select_k diff --git a/cpp/internal/raft_internal/matrix/select_k.cuh b/cpp/internal/raft_internal/matrix/select_k.cuh index 3d7a11e91e..1af3859ce7 100644 --- a/cpp/internal/raft_internal/matrix/select_k.cuh +++ b/cpp/internal/raft_internal/matrix/select_k.cuh @@ -20,6 +20,7 @@ #include #include #include +#include namespace raft::matrix::select { @@ -52,7 +53,8 @@ enum class Algo { kWarpImmediate, kWarpFiltered, kWarpDistributed, - kWarpDistributedShm + kWarpDistributedShm, + kFaissBlockSelect }; inline auto operator<<(std::ostream& os, const Algo& algo) -> std::ostream& @@ -67,6 +69,7 @@ inline auto operator<<(std::ostream& os, const Algo& algo) -> std::ostream& case Algo::kWarpFiltered: return os << "kWarpFiltered"; case Algo::kWarpDistributed: return os << "kWarpDistributed"; case Algo::kWarpDistributedShm: return os << "kWarpDistributedShm"; + case Algo::kFaissBlockSelect: return os << "kFaissBlockSelect"; default: return os << "unknown enum value"; } } @@ -154,7 +157,9 @@ void select_k_impl(const device_resources& handle, return detail::select::warpsort:: select_k_impl( in, in_idx, batch_size, len, k, out, out_idx, select_min, stream); + case Algo::kFaissBlockSelect: + return neighbors::detail::select_k( + in, in_idx, batch_size, len, out, out_idx, select_min, k, stream); } } - } // namespace raft::matrix::select diff --git a/cpp/src/neighbors/detail/selection_faiss_00_generate.py b/cpp/src/neighbors/detail/selection_faiss_00_generate.py index 36ba56c9b3..386dd18e0c 100644 --- a/cpp/src/neighbors/detail/selection_faiss_00_generate.py +++ b/cpp/src/neighbors/detail/selection_faiss_00_generate.py @@ -57,6 +57,10 @@ types = dict( uint32_t_float=("uint32_t", "float"), + uint32_t_double=("uint32_t", "double"), + uint32_t_half=("uint32_t", "half"), + int64_t_double=("int64_t", "double"), + int64_t_half=("int64_t", "half"), int32_t_float=("int32_t", "float"), long_float=("long", "float"), size_t_double=("size_t", "double"), diff --git a/cpp/src/neighbors/detail/selection_faiss_int64_t_double.cu b/cpp/src/neighbors/detail/selection_faiss_int64_t_double.cu new file mode 100644 index 0000000000..f824fdd479 --- /dev/null +++ b/cpp/src/neighbors/detail/selection_faiss_int64_t_double.cu @@ -0,0 +1,44 @@ + +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by selection_faiss_00_generate.py + * + * Make changes there and run in this directory: + * + * > python selection_faiss_00_generate.py + * + */ + +#include // size_t +#include // uint32_t +#include + +#define instantiate_raft_neighbors_detail_select_k(payload_t, key_t) \ + template void raft::neighbors::detail::select_k(const key_t* inK, \ + const payload_t* inV, \ + size_t n_rows, \ + size_t n_cols, \ + key_t* outK, \ + payload_t* outV, \ + bool select_min, \ + int k, \ + cudaStream_t stream) + +instantiate_raft_neighbors_detail_select_k(int64_t, double); + +#undef instantiate_raft_neighbors_detail_select_k diff --git a/cpp/src/neighbors/detail/selection_faiss_int64_t_half.cu b/cpp/src/neighbors/detail/selection_faiss_int64_t_half.cu new file mode 100644 index 0000000000..34ca525c64 --- /dev/null +++ b/cpp/src/neighbors/detail/selection_faiss_int64_t_half.cu @@ -0,0 +1,44 @@ + +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by selection_faiss_00_generate.py + * + * Make changes there and run in this directory: + * + * > python selection_faiss_00_generate.py + * + */ + +#include // size_t +#include // uint32_t +#include + +#define instantiate_raft_neighbors_detail_select_k(payload_t, key_t) \ + template void raft::neighbors::detail::select_k(const key_t* inK, \ + const payload_t* inV, \ + size_t n_rows, \ + size_t n_cols, \ + key_t* outK, \ + payload_t* outV, \ + bool select_min, \ + int k, \ + cudaStream_t stream) + +instantiate_raft_neighbors_detail_select_k(int64_t, half); + +#undef instantiate_raft_neighbors_detail_select_k diff --git a/cpp/src/neighbors/detail/selection_faiss_uint32_t_double.cu b/cpp/src/neighbors/detail/selection_faiss_uint32_t_double.cu new file mode 100644 index 0000000000..e39edbb031 --- /dev/null +++ b/cpp/src/neighbors/detail/selection_faiss_uint32_t_double.cu @@ -0,0 +1,44 @@ + +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by selection_faiss_00_generate.py + * + * Make changes there and run in this directory: + * + * > python selection_faiss_00_generate.py + * + */ + +#include // size_t +#include // uint32_t +#include + +#define instantiate_raft_neighbors_detail_select_k(payload_t, key_t) \ + template void raft::neighbors::detail::select_k(const key_t* inK, \ + const payload_t* inV, \ + size_t n_rows, \ + size_t n_cols, \ + key_t* outK, \ + payload_t* outV, \ + bool select_min, \ + int k, \ + cudaStream_t stream) + +instantiate_raft_neighbors_detail_select_k(uint32_t, double); + +#undef instantiate_raft_neighbors_detail_select_k diff --git a/cpp/src/neighbors/detail/selection_faiss_uint32_t_half.cu b/cpp/src/neighbors/detail/selection_faiss_uint32_t_half.cu new file mode 100644 index 0000000000..549509f6da --- /dev/null +++ b/cpp/src/neighbors/detail/selection_faiss_uint32_t_half.cu @@ -0,0 +1,44 @@ + +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by selection_faiss_00_generate.py + * + * Make changes there and run in this directory: + * + * > python selection_faiss_00_generate.py + * + */ + +#include // size_t +#include // uint32_t +#include + +#define instantiate_raft_neighbors_detail_select_k(payload_t, key_t) \ + template void raft::neighbors::detail::select_k(const key_t* inK, \ + const payload_t* inV, \ + size_t n_rows, \ + size_t n_cols, \ + key_t* outK, \ + payload_t* outV, \ + bool select_min, \ + int k, \ + cudaStream_t stream) + +instantiate_raft_neighbors_detail_select_k(uint32_t, half); + +#undef instantiate_raft_neighbors_detail_select_k