Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Replace dots_along_rows with rowNorm and improve coalescedReduction performance #1011

Merged
merged 11 commits into from
Nov 22, 2022
1 change: 1 addition & 0 deletions cpp/bench/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ if(BUILD_BENCH)
bench/linalg/add.cu
bench/linalg/map_then_reduce.cu
bench/linalg/matrix_vector_op.cu
bench/linalg/norm.cu
bench/linalg/normalize.cu
bench/linalg/reduce_rows_by_key.cu
bench/linalg/reduce.cu
Expand Down
84 changes: 84 additions & 0 deletions cpp/bench/linalg/norm.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <common/benchmark.hpp>
#include <raft/linalg/matrix_vector_op.cuh>
#include <raft/linalg/norm.cuh>
#include <raft/random/rng.cuh>
#include <raft/util/itertools.hpp>

#include <rmm/device_uvector.hpp>

namespace raft::bench::linalg {

/**
 * Problem size for one norm benchmark case.
 *
 * @tparam IdxT integer type used for both dimensions
 */
template <typename IdxT>
struct norm_input {
  IdxT rows;  // number of rows of the benchmarked matrix
  IdxT cols;  // number of columns of the benchmarked matrix
};

/**
 * Writes a norm_input to a stream as `<rows>#<cols>`.
 *
 * @param os stream to write to
 * @param p  problem size to print
 * @return the stream, so that insertions can be chained
 */
template <typename IdxT>
inline auto operator<<(std::ostream& os, const norm_input<IdxT>& p) -> std::ostream&
{
  return os << p.rows << "#" << p.cols;
}

template <typename T, typename IdxT>
struct rowNorm : public fixture {
rowNorm(const norm_input<IdxT>& p) : params(p), in(p.rows * p.cols, stream), dots(p.rows, stream)
{
raft::random::RngState rng{1234};
raft::random::uniform(rng, in.data(), p.rows * p.cols, (T)-10.0, (T)10.0, stream);
}

void run_benchmark(::benchmark::State& state) override
{
std::ostringstream label_stream;
label_stream << params;
state.SetLabel(label_stream.str());

loop_on_state(state, [this]() {
auto input_view = raft::make_device_matrix_view<const T, IdxT, raft::row_major>(
in.data(), params.rows, params.cols);
auto output_view =
raft::make_device_vector_view<T, IdxT, raft::row_major>(dots.data(), params.rows);
raft::linalg::norm(handle,
input_view,
output_view,
raft::linalg::L2Norm,
raft::linalg::Apply::ALONG_ROWS,
raft::SqrtOp<T>());
});
}

private:
norm_input<IdxT> params;
rmm::device_uvector<T> in, dots;
}; // struct rowNorm

// Problem sizes for the rowNorm benchmarks, one list per index type.
// Both lists are built from the same two value sets; presumably
// itertools::product yields every (rows, cols) combination, with the first
// set feeding `rows` and the second `cols` -- confirm against
// raft/util/itertools.hpp.
const std::vector<norm_input<int>> norm_inputs_i32 =
raft::util::itertools::product<norm_input<int>>({10, 100, 1000, 10000, 100000},
{16, 32, 64, 128, 256, 512, 1024});
const std::vector<norm_input<int64_t>> norm_inputs_i64 =
raft::util::itertools::product<norm_input<int64_t>>({10, 100, 1000, 10000, 100000},
{16, 32, 64, 128, 256, 512, 1024});

// Register rowNorm for each combination of element type (float, double) and
// index type (int, int64_t); each registration is given the input list whose
// index type matches. The second argument is an empty string -- NOTE(review):
// its meaning is defined by RAFT_BENCH_REGISTER in common/benchmark.hpp.
RAFT_BENCH_REGISTER((rowNorm<float, int>), "", norm_inputs_i32);
RAFT_BENCH_REGISTER((rowNorm<double, int>), "", norm_inputs_i32);
RAFT_BENCH_REGISTER((rowNorm<float, int64_t>), "", norm_inputs_i64);
RAFT_BENCH_REGISTER((rowNorm<double, int64_t>), "", norm_inputs_i64);

} // namespace raft::bench::linalg
14 changes: 7 additions & 7 deletions cpp/include/raft/linalg/coalesced_reduction.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -112,21 +112,21 @@ void coalescedReduction(OutType* dots,
template <typename InValueType,
typename LayoutPolicy,
typename OutValueType,
typename IndexType,
typename MainLambda = raft::Nop<InValueType>,
typename IdxType,
typename MainLambda = raft::Nop<InValueType, IdxType>,
typename ReduceLambda = raft::Sum<OutValueType>,
typename FinalLambda = raft::Nop<OutValueType>>
void coalesced_reduction(const raft::handle_t& handle,
raft::device_matrix_view<const InValueType, IndexType, LayoutPolicy> data,
raft::device_vector_view<OutValueType, IndexType> dots,
raft::device_matrix_view<const InValueType, IdxType, LayoutPolicy> data,
raft::device_vector_view<OutValueType, IdxType> dots,
OutValueType init,
bool inplace = false,
MainLambda main_op = raft::Nop<InValueType>(),
MainLambda main_op = raft::Nop<InValueType, IdxType>(),
ReduceLambda reduce_op = raft::Sum<OutValueType>(),
FinalLambda final_op = raft::Nop<OutValueType>())
{
if constexpr (std::is_same_v<LayoutPolicy, raft::row_major>) {
RAFT_EXPECTS(static_cast<IndexType>(dots.size()) == data.extent(0),
RAFT_EXPECTS(static_cast<IdxType>(dots.size()) == data.extent(0),
"Output should be equal to number of rows in Input");

coalescedReduction(dots.data_handle(),
Expand All @@ -140,7 +140,7 @@ void coalesced_reduction(const raft::handle_t& handle,
reduce_op,
final_op);
} else if constexpr (std::is_same_v<LayoutPolicy, raft::col_major>) {
RAFT_EXPECTS(static_cast<IndexType>(dots.size()) == data.extent(1),
RAFT_EXPECTS(static_cast<IdxType>(dots.size()) == data.extent(1),
"Output should be equal to number of columns in Input");

coalescedReduction(dots.data_handle(),
Expand Down
Loading