Replace map_along_rows with matrixVectorOp (#911)
The prim `matrixVectorOp` is better optimized than the ANN util function `map_along_rows`. However, in order to replace the latter, I needed to add better support for differing matrix and vector types.

This PR adds support for arbitrary matrix and vector types in matrix-vector ops. The input and output matrices must have the same type, but the vectors can each have a different type.
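
As an illustration, here is a minimal sketch of the kind of mixed-type call this enables (a hypothetical helper, assuming the public `matrix_vector_op.cuh` header and a device lambda compiled with `--extended-lambda`):

```cpp
#include <raft/linalg/matrix_vector_op.cuh>

// Hypothetical helper: scale a float matrix by a float vector and shift it
// by an int32 vector in one pass. The input/output matrices share one type
// (float); the two vectors may each have a different type.
void scale_then_shift(float* out, const float* in,
                      const float* scale, const int32_t* shift,
                      int D, int N, cudaStream_t stream)
{
  raft::linalg::matrixVectorOp(
    out, in, scale, shift, D, N,
    /*rowMajor=*/true, /*bcastAlongRows=*/true,
    [] __device__(float m, float s, int32_t t) { return m * s + t; },
    stream);
}
```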

Authors:
  - Louis Sugy (https://github.com/Nyrio)

Approvers:
  - Tamas Bela Feher (https://github.com/tfeher)
  - Corey J. Nolet (https://github.com/cjnolet)

URL: #911
Nyrio authored Nov 8, 2022
1 parent 5ed3e1a commit 1923c87
Showing 13 changed files with 628 additions and 498 deletions.
cpp/bench/linalg/matrix_vector_op.cu (108 additions, 32 deletions)
@@ -20,61 +20,137 @@

 namespace raft::bench::linalg {

+template <typename IdxT>
 struct mat_vec_op_inputs {
-  int rows, cols;
+  IdxT rows, cols;
   bool rowMajor, bcastAlongRows;
+  IdxT inAlignOffset, outAlignOffset;
 };  // struct mat_vec_op_inputs

-template <typename T>
+template <typename IdxT>
+inline auto operator<<(std::ostream& os, const mat_vec_op_inputs<IdxT>& p) -> std::ostream&
+{
+  os << p.rows << "#" << p.cols << "#" << p.rowMajor << "#" << p.bcastAlongRows << "#"
+     << p.inAlignOffset << "#" << p.outAlignOffset;
+  return os;
+}
+
+template <typename OpT, typename T, typename IdxT>
 struct mat_vec_op : public fixture {
-  mat_vec_op(const mat_vec_op_inputs& p)
+  mat_vec_op(const mat_vec_op_inputs<IdxT>& p)
     : params(p),
-      out(p.rows * p.cols, stream),
-      in(p.rows * p.cols, stream),
-      vec(p.bcastAlongRows ? p.cols : p.rows, stream)
+      out(p.rows * p.cols + params.outAlignOffset, stream),
+      in(p.rows * p.cols + params.inAlignOffset, stream),
+      vec1(p.bcastAlongRows ? p.cols : p.rows, stream),
+      vec2(p.bcastAlongRows ? p.cols : p.rows, stream)
   {
   }

   void run_benchmark(::benchmark::State& state) override
   {
+    std::ostringstream label_stream;
+    label_stream << params;
+    state.SetLabel(label_stream.str());
+
     loop_on_state(state, [this]() {
-      raft::linalg::matrixVectorOp(out.data(),
-                                   in.data(),
-                                   vec.data(),
-                                   params.cols,
-                                   params.rows,
-                                   params.rowMajor,
-                                   params.bcastAlongRows,
-                                   raft::Sum<T>(),
-                                   stream);
+      if constexpr (OpT::useTwoVectors) {
+        raft::linalg::matrixVectorOp(out.data() + params.outAlignOffset,
+                                     in.data() + params.inAlignOffset,
+                                     vec1.data(),
+                                     vec2.data(),
+                                     params.cols,
+                                     params.rows,
+                                     params.rowMajor,
+                                     params.bcastAlongRows,
+                                     OpT{},
+                                     stream);
+      } else {
+        raft::linalg::matrixVectorOp(out.data() + params.outAlignOffset,
+                                     in.data() + params.inAlignOffset,
+                                     vec1.data(),
+                                     params.cols,
+                                     params.rows,
+                                     params.rowMajor,
+                                     params.bcastAlongRows,
+                                     OpT{},
+                                     stream);
+      }
     });
   }

  private:
-  mat_vec_op_inputs params;
-  rmm::device_uvector<T> out, in, vec;
+  mat_vec_op_inputs<IdxT> params;
+  rmm::device_uvector<T> out, in, vec1, vec2;
 };  // struct MatVecOp

-const std::vector<mat_vec_op_inputs> mat_vec_op_input_vecs{
-  {1024, 128, true, true},      {1024 * 1024, 128, true, true},
-  {1024, 128 + 2, true, true},  {1024 * 1024, 128 + 2, true, true},
-  {1024, 128 + 1, true, true},  {1024 * 1024, 128 + 1, true, true},
-
-  {1024, 128, true, false},     {1024 * 1024, 128, true, false},
-  {1024, 128 + 2, true, false}, {1024 * 1024, 128 + 2, true, false},
-  {1024, 128 + 1, true, false}, {1024 * 1024, 128 + 1, true, false},
-
-  {1024, 128, false, false},     {1024 * 1024, 128, false, false},
-  {1024, 128 + 2, false, false}, {1024 * 1024, 128 + 2, false, false},
-  {1024, 128 + 1, false, false}, {1024 * 1024, 128 + 1, false, false},
-
-  {1024, 128, false, true},     {1024 * 1024, 128, false, true},
-  {1024, 128 + 2, false, true}, {1024 * 1024, 128 + 2, false, true},
-  {1024, 128 + 1, false, true}, {1024 * 1024, 128 + 1, false, true},
+template <typename IdxT>
+std::vector<mat_vec_op_inputs<IdxT>> get_mv_inputs()
+{
+  std::vector<mat_vec_op_inputs<IdxT>> out;
+
+  // Scalability benchmark with round dimensions
+  std::vector<IdxT> rows = {1000, 100000, 1000000};
+  std::vector<IdxT> cols = {8, 64, 256, 1024};
+  for (bool rowMajor : {true, false}) {
+    for (bool alongRows : {true, false}) {
+      for (IdxT rows_ : rows) {
+        for (IdxT cols_ : cols) {
+          out.push_back({rows_, cols_, rowMajor, alongRows, 0, 0});
+        }
+      }
+    }
+  }
+
+  // Odd dimensions, misalignment
+  std::vector<std::tuple<IdxT, IdxT>> rowcols = {
+    {44739207, 7},
+    {44739207, 15},
+    {44739207, 16},
+    {44739207, 17},
+    {2611236, 256},
+    {2611236, 257},
+    {2611236, 263},
+  };
+  for (bool rowMajor : {true, false}) {
+    for (bool alongRows : {true, false}) {
+      for (auto rc : rowcols) {
+        for (IdxT inAlignOffset : {0, 1}) {
+          for (IdxT outAlignOffset : {0, 1}) {
+            out.push_back({std::get<0>(rc),
+                           std::get<1>(rc),
+                           rowMajor,
+                           alongRows,
+                           inAlignOffset,
+                           outAlignOffset});
+          }
+        }
+      }
+    }
+  }
+  return out;
+}
+
+const std::vector<mat_vec_op_inputs<int>> mv_input_i32     = get_mv_inputs<int>();
+const std::vector<mat_vec_op_inputs<int64_t>> mv_input_i64 = get_mv_inputs<int64_t>();
+
+template <typename T>
+struct Add1Vec {
+  static constexpr bool useTwoVectors = false;
+  HDI T operator()(T a, T b) const { return a + b; };
+};
+template <typename T>
+struct Add2Vec {
+  static constexpr bool useTwoVectors = true;
+  HDI T operator()(T a, T b, T c) const { return a + b + c; };
 };

-RAFT_BENCH_REGISTER(mat_vec_op<float>, "", mat_vec_op_input_vecs);
-RAFT_BENCH_REGISTER(mat_vec_op<double>, "", mat_vec_op_input_vecs);
+RAFT_BENCH_REGISTER((mat_vec_op<Add1Vec<float>, float, int>), "", mv_input_i32);
+RAFT_BENCH_REGISTER((mat_vec_op<Add1Vec<double>, double, int>), "", mv_input_i32);
+RAFT_BENCH_REGISTER((mat_vec_op<Add2Vec<float>, float, int>), "", mv_input_i32);
+RAFT_BENCH_REGISTER((mat_vec_op<Add2Vec<double>, double, int>), "", mv_input_i32);
+RAFT_BENCH_REGISTER((mat_vec_op<Add1Vec<float>, float, int64_t>), "", mv_input_i64);
+RAFT_BENCH_REGISTER((mat_vec_op<Add1Vec<double>, double, int64_t>), "", mv_input_i64);
+RAFT_BENCH_REGISTER((mat_vec_op<Add2Vec<float>, float, int64_t>), "", mv_input_i64);
+RAFT_BENCH_REGISTER((mat_vec_op<Add2Vec<double>, double, int64_t>), "", mv_input_i64);

 }  // namespace raft::bench::linalg
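
The `useTwoVectors` flag on each op functor is what the fixture's `if constexpr` dispatch keys on, so every benchmark op selects the one- or two-vector overload at compile time. A hypothetical additional op (not part of this PR) would follow the same pattern:

```cpp
// Hypothetical benchmark op: multiply by one vector, then add another.
template <typename T>
struct MulAddVec {
  static constexpr bool useTwoVectors = true;  // routes to the two-vector overload
  HDI T operator()(T a, T b, T c) const { return a * b + c; }
};

// It would be registered like the ops above, e.g.:
// RAFT_BENCH_REGISTER((mat_vec_op<MulAddVec<float>, float, int>), "", mv_input_i32);
```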
cpp/include/raft/linalg/detail/matrix_vector_op.cuh (14 additions, 79 deletions)
@@ -22,80 +22,10 @@ namespace raft {
 namespace linalg {
 namespace detail {

-namespace {
-template <size_t VecBytes>
-struct AlignedAccess {
-  template <typename T>
-  static inline bool test(const T* matrix, size_t strideBytes)
-  {
-    return Pow2<VecBytes>::isAligned(matrix) && Pow2<VecBytes>::isAligned(strideBytes) &&
-           Pow2<sizeof(T)>::isAligned(VecBytes);
-  }
-};
-};  // namespace
-
-template <typename Type, int veclen_, typename Lambda, typename IdxType>
-__global__ void matrixVectorOpKernel(Type* out,
-                                     const Type* matrix,
-                                     const Type* vector,
-                                     IdxType D,
-                                     IdxType N,
-                                     bool rowMajor,
-                                     bool bcastAlongRows,
-                                     Lambda op)
-{
-  typedef TxN_t<Type, veclen_> VecType;
-  IdxType len = N * D;
-  IdxType idx = threadIdx.x;
-  idx += (IdxType)blockIdx.x * (IdxType)blockDim.x;
-  idx *= VecType::Ratio;
-  if (idx >= len) return;
-  IdxType vIdx;
-  VecType mat, vec;
-  ///@todo: yikes! use fast-int-div here.
-  ///@todo: shared mem for vector could help with perf
-  if (rowMajor && bcastAlongRows) {
-    vIdx = idx % D;
-    vec.load(vector, vIdx);
-  } else if (!rowMajor && !bcastAlongRows) {
-    vIdx = idx % N;
-    vec.load(vector, vIdx);
-  } else if (rowMajor && !bcastAlongRows) {
-    vIdx = idx / D;
-    vec.fill(vector[vIdx]);
-  } else {
-    vIdx = idx / N;
-    vec.fill(vector[vIdx]);
-  }
-  mat.load(matrix, idx);
-#pragma unroll
-  for (int i = 0; i < VecType::Ratio; ++i)
-    mat.val.data[i] = op(mat.val.data[i], vec.val.data[i]);
-  mat.store(out, idx);
-}
-
-template <typename Type, int veclen_, typename Lambda, typename IdxType, int TPB>
-void matrixVectorOpImpl(Type* out,
-                        const Type* matrix,
-                        const Type* vec,
-                        IdxType D,
-                        IdxType N,
-                        bool rowMajor,
-                        bool bcastAlongRows,
-                        Lambda op,
-                        cudaStream_t stream)
-{
-  IdxType len   = N * D;
-  IdxType nblks = raft::ceildiv(veclen_ ? len / veclen_ : veclen_, (IdxType)TPB);
-  matrixVectorOpKernel<Type, veclen_, Lambda, IdxType>
-    <<<nblks, TPB, 0, stream>>>(out, matrix, vec, D, N, rowMajor, bcastAlongRows, op);
-  RAFT_CUDA_TRY(cudaPeekAtLastError());
-}
-
-template <typename Type, typename Lambda, typename IdxType = int, int TPB = 256>
-void matrixVectorOp(Type* out,
-                    const Type* matrix,
-                    const Type* vec,
+template <typename MatT, typename Lambda, typename VecT, typename IdxType = int, int TPB = 256>
+void matrixVectorOp(MatT* out,
+                    const MatT* matrix,
+                    const VecT* vec,
                     IdxType D,
                     IdxType N,
                     bool rowMajor,
@@ -109,11 +39,16 @@ void matrixVectorOp(Type* out,
     out, matrix, stride, nLines, rowMajor == bcastAlongRows, op, stream, vec);
 }

-template <typename Type, typename Lambda, typename IdxType = int, int TPB = 256>
-void matrixVectorOp(Type* out,
-                    const Type* matrix,
-                    const Type* vec1,
-                    const Type* vec2,
+template <typename MatT,
+          typename Lambda,
+          typename Vec1T,
+          typename Vec2T,
+          typename IdxType = int,
+          int TPB = 256>
+void matrixVectorOp(MatT* out,
+                    const MatT* matrix,
+                    const Vec1T* vec1,
+                    const Vec2T* vec2,
                     IdxType D,
                     IdxType N,
                     bool rowMajor,
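
For context, both overloads now reduce to a single call into the `linewise` matrix prim; the visible call above takes `stride` and `nLines` from the collapsed part of the body. A sketch of that mapping, reconstructed under the assumption that the collapsed lines only derive the layout:

```cpp
// The matrix is viewed as nLines contiguous lines of length stride;
// the vector is applied along each line when rowMajor == bcastAlongRows,
// and across lines otherwise.
IdxType stride = rowMajor ? D : N;  // length of one contiguous line
IdxType nLines = rowMajor ? N : D;  // number of lines
return matrix::linewiseOp(
  out, matrix, stride, nLines, rowMajor == bcastAlongRows, op, stream, vec);
```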
