
Replace map_along_rows with matrixVectorOp #911

Merged: 30 commits, merged Nov 8, 2022

Changes from 22 commits

Commits (30):
7f11225  Integrate accumulate_into_selected into raft prims (Nyrio, Oct 7, 2022)
529931d  Remove accumulate_into_selected (Nyrio, Oct 10, 2022)
0f284b0  Merge remote-tracking branch 'origin/branch-22.12' into enh-ann-accum… (Nyrio, Oct 10, 2022)
6a61537  Replace map_along_rows with matrixVectorOp (Nyrio, Oct 10, 2022)
e184ebf  Merge remote-tracking branch 'origin/branch-22.12' into enh-map-along… (Nyrio, Oct 17, 2022)
0a98482  Start adding support for arbitrary types in linewiseOp (Nyrio, Oct 18, 2022)
eafa61e  Allow different types for output, matrix and vector(s) in mdspan-base… (Nyrio, Oct 18, 2022)
74309ed  Call cub histogram with signed type to avoid a warning breaking compi… (Nyrio, Oct 18, 2022)
df7cfb5  Start adding support for different output/matrix/vector types in MatV… (Nyrio, Oct 18, 2022)
8264462  Fix shared mem buffering offset in linewise kernels (Nyrio, Oct 18, 2022)
e4a8b66  Pass custom op to naiveMat (Nyrio, Oct 18, 2022)
938b180  Support different output/matrix/vector(s) types in naiveMatVec (Nyrio, Oct 18, 2022)
c85ee9b  Test matrix-vector op with different matrix / vector types (Nyrio, Oct 18, 2022)
be142c6  Fix linewiseOp VecRows kernels (Nyrio, Oct 19, 2022)
a850d85  Fix linewiseOp VecCols kernels (Nyrio, Oct 20, 2022)
e44bcc4  Merge OutT=MatT because linewiseOp only supports one input/output mat… (Nyrio, Oct 20, 2022)
351fddc  Merge remote-tracking branch 'origin/branch-22.12' into enh-map-along… (Nyrio, Oct 20, 2022)
dcfd962  Replace for_each with linalg::add + fix syntax error (Nyrio, Oct 20, 2022)
4718e5b  Clang-format fix (Nyrio, Oct 20, 2022)
fc55921  Used alignedLen instead of totalLen in max block number calculation (Nyrio, Oct 21, 2022)
5421dda  Add misalignments to matrix-vector-op test (Nyrio, Oct 26, 2022)
ccdc961  Extend matrix-vector op benchmark (Nyrio, Oct 26, 2022)
b5c4d01  Merge remote-tracking branch 'origin/branch-22.12' into enh-map-along… (Nyrio, Oct 28, 2022)
f2f67db  Apply changes to new padded kernel (note: test is still failing but a… (Nyrio, Oct 28, 2022)
1450bae  Put itertools in util namespace (Nyrio, Oct 28, 2022)
7bcbbe2  Remove TPB from public API (it wasn't even forwarded to the actual im… (Nyrio, Oct 28, 2022)
c7942b0  Fix utils -> util (Nyrio, Oct 28, 2022)
d28a3c5  Merge remote-tracking branch 'origin/branch-22.12' into enh-map-along… (Nyrio, Nov 8, 2022)
68fd977  Move product auxiliary function to itertools::detail (Nyrio, Nov 8, 2022)
b2be0c1  Add test case for int8_t matrix with float vectors (Nyrio, Nov 8, 2022)
cpp/bench/linalg/matrix_vector_op.cu (140 changes: 108 additions & 32 deletions)
@@ -20,61 +20,137 @@

namespace raft::bench::linalg {

template <typename IdxT>
struct mat_vec_op_inputs {
int rows, cols;
IdxT rows, cols;
bool rowMajor, bcastAlongRows;
IdxT inAlignOffset, outAlignOffset;
}; // struct mat_vec_op_inputs

template <typename T>
template <typename IdxT>
inline auto operator<<(std::ostream& os, const mat_vec_op_inputs<IdxT>& p) -> std::ostream&
{
os << p.rows << "#" << p.cols << "#" << p.rowMajor << "#" << p.bcastAlongRows << "#"
<< p.inAlignOffset << "#" << p.outAlignOffset;
return os;
}

template <typename OpT, typename T, typename IdxT>
struct mat_vec_op : public fixture {
mat_vec_op(const mat_vec_op_inputs& p)
mat_vec_op(const mat_vec_op_inputs<IdxT>& p)
: params(p),
out(p.rows * p.cols, stream),
in(p.rows * p.cols, stream),
vec(p.bcastAlongRows ? p.cols : p.rows, stream)
out(p.rows * p.cols + params.outAlignOffset, stream),
in(p.rows * p.cols + params.inAlignOffset, stream),
vec1(p.bcastAlongRows ? p.cols : p.rows, stream),
vec2(p.bcastAlongRows ? p.cols : p.rows, stream)
{
}

void run_benchmark(::benchmark::State& state) override
{
std::ostringstream label_stream;
label_stream << params;
state.SetLabel(label_stream.str());

loop_on_state(state, [this]() {
raft::linalg::matrixVectorOp(out.data(),
in.data(),
vec.data(),
params.cols,
params.rows,
params.rowMajor,
params.bcastAlongRows,
raft::Sum<T>(),
stream);
if constexpr (OpT::useTwoVectors) {
raft::linalg::matrixVectorOp(out.data() + params.outAlignOffset,
in.data() + params.inAlignOffset,
vec1.data(),
vec2.data(),
params.cols,
params.rows,
params.rowMajor,
params.bcastAlongRows,
OpT{},
stream);
} else {
raft::linalg::matrixVectorOp(out.data() + params.outAlignOffset,
in.data() + params.inAlignOffset,
vec1.data(),
params.cols,
params.rows,
params.rowMajor,
params.bcastAlongRows,
OpT{},
stream);
}
});
}

private:
mat_vec_op_inputs params;
rmm::device_uvector<T> out, in, vec;
mat_vec_op_inputs<IdxT> params;
rmm::device_uvector<T> out, in, vec1, vec2;
}; // struct MatVecOp

const std::vector<mat_vec_op_inputs> mat_vec_op_input_vecs{
{1024, 128, true, true}, {1024 * 1024, 128, true, true},
{1024, 128 + 2, true, true}, {1024 * 1024, 128 + 2, true, true},
{1024, 128 + 1, true, true}, {1024 * 1024, 128 + 1, true, true},
template <typename IdxT>
std::vector<mat_vec_op_inputs<IdxT>> get_mv_inputs()
{
std::vector<mat_vec_op_inputs<IdxT>> out;

{1024, 128, true, false}, {1024 * 1024, 128, true, false},
{1024, 128 + 2, true, false}, {1024 * 1024, 128 + 2, true, false},
{1024, 128 + 1, true, false}, {1024 * 1024, 128 + 1, true, false},
// Scalability benchmark with round dimensions
std::vector<IdxT> rows = {1000, 100000, 1000000};
std::vector<IdxT> cols = {8, 64, 256, 1024};
for (bool rowMajor : {true, false}) {
for (bool alongRows : {true, false}) {
for (IdxT rows_ : rows) {
for (IdxT cols_ : cols) {
out.push_back({rows_, cols_, rowMajor, alongRows, 0, 0});
}
}
}
}

{1024, 128, false, false}, {1024 * 1024, 128, false, false},
{1024, 128 + 2, false, false}, {1024 * 1024, 128 + 2, false, false},
{1024, 128 + 1, false, false}, {1024 * 1024, 128 + 1, false, false},
// Odd dimensions, misalignment
std::vector<std::tuple<IdxT, IdxT>> rowcols = {
{44739207, 7},
{44739207, 15},
{44739207, 16},
{44739207, 17},
{2611236, 256},
{2611236, 257},
{2611236, 263},
};
for (bool rowMajor : {true, false}) {
for (bool alongRows : {true, false}) {
for (auto rc : rowcols) {
for (IdxT inAlignOffset : {0, 1}) {
for (IdxT outAlignOffset : {0, 1}) {
out.push_back({std::get<0>(rc),
std::get<1>(rc),
rowMajor,
alongRows,
inAlignOffset,
outAlignOffset});
}
}
}
}
}
return out;
}

{1024, 128, false, true}, {1024 * 1024, 128, false, true},
{1024, 128 + 2, false, true}, {1024 * 1024, 128 + 2, false, true},
{1024, 128 + 1, false, true}, {1024 * 1024, 128 + 1, false, true},
const std::vector<mat_vec_op_inputs<int>> mv_input_i32 = get_mv_inputs<int>();
const std::vector<mat_vec_op_inputs<int64_t>> mv_input_i64 = get_mv_inputs<int64_t>();

template <typename T>
struct Add1Vec {
static constexpr bool useTwoVectors = false;
HDI T operator()(T a, T b) const { return a + b; };
};
template <typename T>
struct Add2Vec {
static constexpr bool useTwoVectors = true;
HDI T operator()(T a, T b, T c) const { return a + b + c; };
};

RAFT_BENCH_REGISTER(mat_vec_op<float>, "", mat_vec_op_input_vecs);
RAFT_BENCH_REGISTER(mat_vec_op<double>, "", mat_vec_op_input_vecs);
RAFT_BENCH_REGISTER((mat_vec_op<Add1Vec<float>, float, int>), "", mv_input_i32);
RAFT_BENCH_REGISTER((mat_vec_op<Add1Vec<double>, double, int>), "", mv_input_i32);
RAFT_BENCH_REGISTER((mat_vec_op<Add2Vec<float>, float, int>), "", mv_input_i32);
RAFT_BENCH_REGISTER((mat_vec_op<Add2Vec<double>, double, int>), "", mv_input_i32);
RAFT_BENCH_REGISTER((mat_vec_op<Add1Vec<float>, float, int64_t>), "", mv_input_i64);
RAFT_BENCH_REGISTER((mat_vec_op<Add1Vec<double>, double, int64_t>), "", mv_input_i64);
RAFT_BENCH_REGISTER((mat_vec_op<Add2Vec<float>, float, int64_t>), "", mv_input_i64);
RAFT_BENCH_REGISTER((mat_vec_op<Add2Vec<double>, double, int64_t>), "", mv_input_i64);

} // namespace raft::bench::linalg
cpp/include/raft/linalg/detail/matrix_vector_op.cuh (93 changes: 14 additions & 79 deletions)
@@ -22,80 +22,10 @@ namespace raft {
namespace linalg {
namespace detail {

namespace {
Review comment (Member):
Are any of the changes in this file going to break users downstream (such as cuml)?

Reply (@Nyrio, Contributor Author, Oct 28, 2022):
It won't break cuML because all template types are inferred in calls to matrixVectorOp.
It could in theory break other projects if they provide the template list explicitly, but I'm not aware of any.
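A rough illustration of this point (the snippet below is not part of the PR; the caller, the public header path, and the concrete types are assumptions made for the example):

#include <raft/linalg/matrix_vector_op.cuh>

// Plain element-wise add, standing in for whatever op a downstream caller uses.
struct AddOp {
  __host__ __device__ float operator()(float mat_val, float vec_val) const
  {
    return mat_val + vec_val;
  }
};

// Every template parameter (MatT, Lambda, VecT, IdxType) is deduced from the
// arguments, so widening the signature with a separate vector type leaves a
// call like this untouched.
void downstream_call(float* out, const float* matrix, const float* vec,
                     int n_cols, int n_rows, cudaStream_t stream)
{
  raft::linalg::matrixVectorOp(
    out, matrix, vec, n_cols, n_rows, true, true, AddOp{}, stream);
}

// Only a caller that spelled out the old template list explicitly, e.g.
// matrixVectorOp<float, AddOp, int, 256>(...), would have to be adjusted.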

template <size_t VecBytes>
struct AlignedAccess {
template <typename T>
static inline bool test(const T* matrix, size_t strideBytes)
{
return Pow2<VecBytes>::isAligned(matrix) && Pow2<VecBytes>::isAligned(strideBytes) &&
Pow2<sizeof(T)>::isAligned(VecBytes);
}
};
}; // namespace

template <typename Type, int veclen_, typename Lambda, typename IdxType>
__global__ void matrixVectorOpKernel(Type* out,
const Type* matrix,
const Type* vector,
IdxType D,
IdxType N,
bool rowMajor,
bool bcastAlongRows,
Lambda op)
{
typedef TxN_t<Type, veclen_> VecType;
IdxType len = N * D;
IdxType idx = threadIdx.x;
idx += (IdxType)blockIdx.x * (IdxType)blockDim.x;
idx *= VecType::Ratio;
if (idx >= len) return;
IdxType vIdx;
VecType mat, vec;
///@todo: yikes! use fast-int-div here.
///@todo: shared mem for vector could help with perf
if (rowMajor && bcastAlongRows) {
vIdx = idx % D;
vec.load(vector, vIdx);
} else if (!rowMajor && !bcastAlongRows) {
vIdx = idx % N;
vec.load(vector, vIdx);
} else if (rowMajor && !bcastAlongRows) {
vIdx = idx / D;
vec.fill(vector[vIdx]);
} else {
vIdx = idx / N;
vec.fill(vector[vIdx]);
}
mat.load(matrix, idx);
#pragma unroll
for (int i = 0; i < VecType::Ratio; ++i)
mat.val.data[i] = op(mat.val.data[i], vec.val.data[i]);
mat.store(out, idx);
}

template <typename Type, int veclen_, typename Lambda, typename IdxType, int TPB>
void matrixVectorOpImpl(Type* out,
const Type* matrix,
const Type* vec,
IdxType D,
IdxType N,
bool rowMajor,
bool bcastAlongRows,
Lambda op,
cudaStream_t stream)
{
IdxType len = N * D;
IdxType nblks = raft::ceildiv(veclen_ ? len / veclen_ : veclen_, (IdxType)TPB);
matrixVectorOpKernel<Type, veclen_, Lambda, IdxType>
<<<nblks, TPB, 0, stream>>>(out, matrix, vec, D, N, rowMajor, bcastAlongRows, op);
RAFT_CUDA_TRY(cudaPeekAtLastError());
}

template <typename Type, typename Lambda, typename IdxType = int, int TPB = 256>
void matrixVectorOp(Type* out,
const Type* matrix,
const Type* vec,
template <typename MatT, typename Lambda, typename VecT, typename IdxType = int, int TPB = 256>
void matrixVectorOp(MatT* out,
const MatT* matrix,
const VecT* vec,
IdxType D,
IdxType N,
bool rowMajor,
@@ -109,11 +39,16 @@ void matrixVectorOp(Type* out,
out, matrix, stride, nLines, rowMajor == bcastAlongRows, op, stream, vec);
}

template <typename Type, typename Lambda, typename IdxType = int, int TPB = 256>
void matrixVectorOp(Type* out,
const Type* matrix,
const Type* vec1,
const Type* vec2,
template <typename MatT,
typename Lambda,
typename Vec1T,
typename Vec2T,
typename IdxType = int,
int TPB = 256>
void matrixVectorOp(MatT* out,
const MatT* matrix,
const Vec1T* vec1,
const Vec2T* vec2,
IdxType D,
IdxType N,
bool rowMajor,
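For context, a minimal usage sketch of the widened raw-pointer interface (not taken from the PR; the public header path, the function, and the types are assumptions for the example, loosely echoing the new test case that pairs an int8_t matrix with float vectors):

#include <raft/linalg/matrix_vector_op.cuh>

#include <cstdint>

// Sketch only: multiply element (i, j) of a row-major int8_t matrix by
// scales[j] and write int8_t output. MatT = int8_t, VecT = float and
// IdxType = int are all deduced from the arguments; `scales` has length n_cols.
struct ScaleOp {
  __host__ __device__ std::int8_t operator()(std::int8_t mat_val, float scale) const
  {
    return static_cast<std::int8_t>(mat_val * scale);
  }
};

void scale_columns(std::int8_t* out, const std::int8_t* in, const float* scales,
                   int n_rows, int n_cols, cudaStream_t stream)
{
  raft::linalg::matrixVectorOp(out,
                               in,
                               scales,
                               n_cols,
                               n_rows,
                               /*rowMajor=*/true,
                               /*bcastAlongRows=*/true,
                               ScaleOp{},
                               stream);
}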