Host unit tests #267

Merged: 3 commits (Sep 8, 2022)
18 changes: 8 additions & 10 deletions CMakeLists.txt
@@ -93,15 +93,11 @@ if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.4)
     message(FATAL_ERROR "MatX requires CUDA 11.4 or higher. Please update before using.")
 endif()
 
-# If we're on CUDA 11.4 or lower we need a newer version of libcudacxx
-if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.5)
-    message(STATUS "Need libcuda++ 1.7.0 or higher (included in CTK 11.5+). Finding...")
-    set(LIBCUDACXX_VERSION "1.7.0" CACHE STRING "Version of libcudacxx to use")
-    include(cmake/FindLibcudacxx.cmake)
-    target_include_directories(matx INTERFACE "$<BUILD_INTERFACE:${LIBCUDACXX_INCLUDE_DIR}>")
-else()
-    message(STATUS "Recent libcuda++ found in CUDA toolkit. Skipping finding...")
-endif()
+# We typically need a newer version of libcudacxx than is available in the toolkit, so pull down a specific version here
+message(STATUS "Need libcuda++. Finding...")
+set(LIBCUDACXX_VERSION "1.8.0" CACHE STRING "Version of libcudacxx to use")
+include(cmake/FindLibcudacxx.cmake)
+target_include_directories(matx INTERFACE "$<BUILD_INTERFACE:${LIBCUDACXX_INCLUDE_DIR}>")
 
 # Set flags for compiling tests faster
 set(MATX_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} --threads 0)
@@ -112,7 +108,6 @@ endif()
 # Set preferred compiler warning flags
 set(WARN_FLAGS -Wall
                -Wextra
-               -Werror all-warnings
                -Wcast-align
                -Wunused
                -Wconversion
@@ -129,6 +124,9 @@ if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
                -Wnull-dereference)
 endif()
 
+set(WARN_FLAGS ${WARN_FLAGS} $<$<COMPILE_LANGUAGE:CUDA>:-Werror all-warnings>)
+set(WARN_FLAGS ${WARN_FLAGS} $<$<COMPILE_LANGUAGE:CXX>:-Werror>)
+
 # CUTLASS slows down compile times when used, so leave it as optional for now
 if (MATX_EN_CUTLASS)
     include(cmake/GetCUTLASS.cmake)
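Why the -Werror split is needed: nvcc's -Werror takes a category argument (here all-warnings), while the gcc/clang form is a bare -Werror, so a single flag string cannot serve both compilers. The $<COMPILE_LANGUAGE:CUDA> and $<COMPILE_LANGUAGE:CXX> generator expressions apply each spelling only during the matching compile.
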
2 changes: 1 addition & 1 deletion cmake/FindLibcudacxx.cmake
@@ -43,7 +43,7 @@ function(find_and_configure_libcudacxx version)
     ${PROJECT_NAME}-exports
     CPM_ARGS
     GIT_REPOSITORY https://github.com/NVIDIA/libcudacxx.git
-    GIT_TAG ${version}-ea
+    GIT_TAG ${version}
     GIT_SHALLOW TRUE
     DOWNLOAD_ONLY TRUE
     OPTIONS "LIBCXX_INCLUDE_BENCHMARKS OFF"
1 change: 1 addition & 0 deletions include/matx.h
@@ -31,6 +31,7 @@
 /////////////////////////////////////////////////////////////////////////////////
 
 #pragma once
+#include <cuda_runtime_api.h>
 #include <cuda/std/ccomplex>
 #include "matx/core/defines.h"
 #include "matx/core/error.h"
2 changes: 1 addition & 1 deletion include/matx/core/tensor.h
@@ -752,7 +752,7 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> {
   std::array<index_t, NRANK> tshape;
   std::move(std::begin(shape), std::end(shape), tshape.begin());
 
-  stride_type prod = std::accumulate(std::begin(shape), std::end(shape), 1, std::multiplies<stride_type>());
+  [[maybe_unused]] stride_type prod = std::accumulate(std::begin(shape), std::end(shape), 1, std::multiplies<stride_type>());
   MATX_ASSERT_STR(
       sizeof(T) * prod <= storage_.Bytes(), matxInvalidSize,
       "Total size of new tensor must not be larger than the original");
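A pattern that recurs throughout this PR: a variable read only inside an assertion macro becomes unused when asserts compile away, and the new -Werror flags turn that into a build failure. A minimal sketch of the same pattern using the standard assert macro (the names here are illustrative, not MatX API):

    #include <cassert>
    #include <cstddef>

    void check_fits(std::size_t n_elems, std::size_t capacity_bytes) {
      // `bytes` is only referenced inside assert(); when NDEBUG is defined the
      // assert expands to nothing and the variable would trip -Wunused/-Werror.
      [[maybe_unused]] std::size_t bytes = n_elems * sizeof(float);
      assert(bytes <= capacity_bytes);
    }
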
6 changes: 5 additions & 1 deletion include/matx/core/utils.h
@@ -48,7 +48,7 @@ __MATX_INLINE__ int GetDeviceAttr(cudaDeviceAttr attr) {
   int val;
   int dev;
   cudaGetDevice(&dev);
-  auto err = cudaDeviceGetAttribute(&val, attr, dev);
+  [[maybe_unused]] auto err = cudaDeviceGetAttribute(&val, attr, dev);
   MATX_ASSERT(err == cudaSuccess, matxCudaError);
   return val;
 }
@@ -115,8 +115,12 @@ __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ auto madd( const T1 &x, const T2 &
   const __half2 &Z = *reinterpret_cast<const __half2*>(&z);
 
 #if 1
+#ifdef __CUDACC__
   auto v = __hcmadd(X,Y,Z);
   return T4(v.x, v.y);
+#else
+  return x*y+z;
+#endif
 #else
   // In theory this could be faster but compiler is not folding broadcast/swap into HFMAs
 
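The host fallback x*y+z computes the same complex multiply-add that __hcmadd performs on a __half2 holding a (real, imag) pair; only the device path can use the intrinsic. A standalone sketch of that operation with std::complex, as an illustrative stand-in for the MatX half-complex types:

    #include <complex>

    // Complex multiply-add x*y + z, the operation __hcmadd applies to
    // (real, imag) pairs packed into a __half2.
    std::complex<float> complex_madd(std::complex<float> x,
                                     std::complex<float> y,
                                     std::complex<float> z) {
      float re = x.real() * y.real() - x.imag() * y.imag() + z.real();
      float im = x.real() * y.imag() + x.imag() * y.real() + z.imag();
      return {re, im};
    }
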
54 changes: 9 additions & 45 deletions include/matx/executors/host.h
@@ -54,54 +54,18 @@ class SingleThreadHostExecutor {
    */
   template <typename Op>
   void Exec(Op &op) const noexcept {
-    if constexpr (op.Rank() == 0) {
+    if constexpr (Op::Rank() == 0) {
       op();
     }
-    else if constexpr (op.Rank() == 1) {
-      index_t size0 = op.Size(0);
-      for (index_t idx = 0; idx < size0; idx++) {
-        op(idx);
-      }
-    }
-    else if constexpr (op.Rank() == 2) {
-      index_t size0 = op.Size(0);
-      index_t size1 = op.Size(1);
-
-      for (index_t idx = 0; idx < size0; idx++) {
-        for (index_t idy = 0; idy < size1; idy++) {
-          op(idx, idy);
-        }
-      }
-    }
-    else if constexpr (op.Rank() == 3) {
-      index_t size0 = op.Size(0);
-      index_t size1 = op.Size(1);
-      index_t size2 = op.Size(2);
-
-      for (index_t idx = 0; idx < size0; idx++) {
-        for (index_t idy = 0; idy < size1; idy++) {
-          for (index_t idz = 0; idz < size2; idz++) {
-            op(idx, idy, idz);
-          }
-        }
-      }
-    }
-    else {
-      index_t size0 = op.Size(0);
-      index_t size1 = op.Size(1);
-      index_t size2 = op.Size(2);
-      index_t size3 = op.Size(3);
-
-      for (index_t idx = 0; idx < size0; idx++) {
-        for (index_t idy = 0; idy < size1; idy++) {
-          for (index_t idz = 0; idz < size2; idz++) {
-            for (index_t idw = 0; idw < size3; idw++) {
-              op(idx, idy, idz, idw);
-            }
-          }
-        }
-      }
-    }
+    else {
+      index_t size = TotalSize(op);
+      for (index_t i = 0; i < size; i++) {
+        auto idx = GetIdxFromAbs(op, i);
+        std::apply([&](auto... args) {
+          return op(args...);
+        }, idx);
+      }
+    }
   }
 };
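This is the core simplification of the PR: instead of one hand-written loop nest per rank (previously capped at rank 4), the executor walks a single flattened index space and converts each absolute index back into per-dimension indices. A self-contained sketch of the idea, assuming row-major sizes; TotalSize and GetIdxFromAbs are the MatX helpers, and everything below is an illustrative stand-in:

    #include <array>
    #include <cstdio>
    #include <tuple>

    using index_t = long long;

    // Stand-in for a rank-2 operator: callable with one index per dimension.
    struct Op2D {
      static constexpr int Rank() { return 2; }
      index_t Size(int d) const { return d == 0 ? 3 : 4; }
      void operator()(index_t i, index_t j) const { std::printf("(%lld,%lld)\n", i, j); }
    };

    // Row-major delinearization: peel off the fastest-varying dimension first.
    template <typename Op>
    auto idx_from_abs(const Op &op, index_t abs) {
      std::array<index_t, Op::Rank()> idx{};
      for (int d = Op::Rank() - 1; d >= 0; d--) {
        idx[d] = abs % op.Size(d);
        abs /= op.Size(d);
      }
      return idx;
    }

    template <typename Op>
    void exec_host(const Op &op) {
      index_t total = 1;
      for (int d = 0; d < Op::Rank(); d++) total *= op.Size(d);  // TotalSize
      for (index_t i = 0; i < total; i++) {
        auto idx = idx_from_abs(op, i);
        // std::array models the tuple protocol, so std::apply expands it into
        // op(i0, i1, ..., i{N-1}) for any rank.
        std::apply([&](auto... args) { op(args...); }, idx);
      }
    }

    int main() { exec_host(Op2D{}); }

The per-element divide/modulo costs slightly more than the dedicated loop nests, but it removes the rank-4 ceiling and roughly forty lines of duplication.
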
3 changes: 2 additions & 1 deletion include/matx/kernels/filter.cuh
@@ -35,10 +35,11 @@ typedef enum {
   STATUS_FLAG_FULL_COMPLETE = 2,
 } STATUS_FLAGS;
 
+#ifdef __CUDACC__
 // Chunk ID assignment used for atomic incrementing between blocks
 static __device__ uint32_t cid_assign[MAX_BATCHES] = {0};
 
-#ifdef __CUDACC__
 
 template <uint32_t num_recursive, uint32_t num_non_recursive, typename OutType,
           typename InType, typename FilterType>
 __global__ __launch_bounds__(BLOCK_SIZE_RECURSIVE, 1) void RecursiveFilter(
1 change: 1 addition & 0 deletions include/matx/operators/binary_operators.h
@@ -32,6 +32,7 @@
 
 #pragma once
 
+#include <cuda/std/array>
 
 #include "matx/core/type_utils.h"
 #include "matx/operators/base_operator.h"
4 changes: 2 additions & 2 deletions include/matx/operators/collapse.h
@@ -203,7 +203,7 @@ namespace matx
       auto ind = in[Rank() - 1];
 #pragma unroll
       for(int i = 0; i <= DIM; i++) {
-        index_t d = T1::Rank() - 1 - i;
+        int d = T1::Rank() - 1 - i;
         out[d] = ind % op_.Size(d);
         ind /= op_.Size(d);
       }
@@ -228,7 +228,7 @@
       auto ind = in[Rank() - 1];
 #pragma unroll
       for(int i = 0; i <= DIM; i++) {
-        index_t d = T1::Rank() - 1 - i;
+        int d = T1::Rank() - 1 - i;
         out[d] = ind % op_.Size(d);
         ind /= op_.Size(d);
       }
2 changes: 1 addition & 1 deletion include/matx/operators/concat.h
@@ -71,7 +71,7 @@ namespace matx
 
   // Base case. Cannot be reached
   template <size_t I = 0, typename... Is, std::enable_if_t<I == sizeof...(Ts), bool> = true>
-  __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ auto GetVal(cuda::std::tuple<Is...> tup) const {
+  __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ auto GetVal([[maybe_unused]] cuda::std::tuple<Is...> tup) const {
     return static_cast<first_value_type>(0);
   }
 
2 changes: 1 addition & 1 deletion include/matx/operators/permute.h
@@ -67,7 +67,7 @@ namespace matx
     __MATX_INLINE__ PermuteOp(T op, const int32_t (&dims)[Rank()]) : op_(op) {
 
       for(int32_t i = 0; i < Rank(); i++) {
-        int32_t dim = dims[i];
+        [[maybe_unused]] int32_t dim = dims[i];
         MATX_ASSERT_STR(dim < Rank() && dim >= 0, matxInvalidDim, "PermuteOp: Invalid permute index.");
 
         dims_[i] = dims[i];
5 changes: 3 additions & 2 deletions include/matx/operators/scalar_ops.h
@@ -33,6 +33,7 @@
 #pragma once
 
 #include <type_traits>
+#include <cuda/std/array>
 
 namespace matx {
 namespace detail {
@@ -484,15 +485,15 @@ template <typename T1, typename T2> using PowOp = BinOp<T1, T2, PowF<T1, T2>>;
 template <typename T1, typename T2> struct MaxF {
   static __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ auto op(T1 v1, T2 v2)
   {
-    return max(v1, v2);
+    return cuda::std::max(v1, v2);
   }
 };
 template <typename T1, typename T2> using MaxOp = BinOp<T1, T2, MaxF<T1, T2>>;
 
 template <typename T1, typename T2> struct MinF {
   static __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ auto op(T1 v1, T2 v2)
   {
-    return min(v1, v2);
+    return cuda::std::min(v1, v2);
   }
 };
 template <typename T1, typename T2> using MinOp = BinOp<T1, T2, MinF<T1, T2>>;
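Unqualified max and min previously relied on overloads that are only guaranteed to exist in device compilation, so these operators broke when the headers were built host-only; fully qualifying the call selects an implementation visible in both passes (cuda::std::min and cuda::std::max come from libcu++). A host-only sketch of the same fix using the std equivalents:

    #include <algorithm>
    #include <type_traits>

    // Mirrors MaxF::op: a qualified call removes the dependency on a
    // device-only global max(); common_type handles mixed T1/T2 operands.
    template <typename T1, typename T2>
    auto max_op(T1 v1, T2 v2) {
      using C = std::common_type_t<T1, T2>;
      return std::max<C>(v1, v2);
    }
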
2 changes: 1 addition & 1 deletion include/matx/operators/set.h
@@ -87,7 +87,7 @@ class set : public BaseOp<set<T, Op>> {
                    matxInvalidDim);
     if constexpr (Rank() > 0) {
       for (int i = 0; i < Rank(); i++) {
-        index_t size = detail::get_expanded_size<Rank()>(op_, i);
+        [[maybe_unused]] index_t size = detail::get_expanded_size<Rank()>(op_, i);
         MATX_ASSERT_STR(
             size == 0 || size == Size(i), matxInvalidSize,
             "Size mismatch in source operator to destination tensor view");
4 changes: 2 additions & 2 deletions include/matx/operators/shift.h
@@ -71,12 +71,12 @@ namespace matx
       __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ auto operator()(Is... indices) const
       {
         auto tup = cuda::std::make_tuple(indices...);
-        auto shift = -get_value(shift_, indices...);
+        index_t shift = -get_value(shift_, indices...);
 
 
         shift = (shift + cuda::std::get<DIM>(tup)) % Size(DIM);
 
-        if(shift<0) shift+= Size(DIM);
+        if(shift<0) shift += Size(DIM);
 
         cuda::std::get<DIM>(tup) = shift;
 
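Pinning shift to index_t matters because the lines that follow depend on signed arithmetic: the C++ % operator keeps the sign of the dividend, so a negated shift can produce a negative remainder that must be folded back into [0, Size(DIM)). A small sketch of that wraparound, assuming index_t is a signed 64-bit type as in MatX:

    using index_t = long long;

    // Map a (possibly negative) shift plus position back into [0, size).
    // Example: shift = -1, pos = 0, size = 8 gives (-1 + 0) % 8 == -1,
    // which folds to 7.
    index_t wrap_shift(index_t shift, index_t pos, index_t size) {
      index_t s = (shift + pos) % size;
      if (s < 0) s += size;
      return s;
    }
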
1 change: 0 additions & 1 deletion include/matx/transforms/ambgfun.h
@@ -165,7 +165,6 @@ void InternalAmbgFun(AMFTensor &amf, XTensor &x,
 {
   constexpr int RANK = XTensor::Rank();
   using T1 = typename XTensor::scalar_type;
-  using T2 = typename AMFTensor::scalar_type;
 
   MATX_STATIC_ASSERT(is_cuda_complex_v<T1>, matxInvalidType);
   auto ry = x.View();
7 changes: 4 additions & 3 deletions include/matx/transforms/conv.h
@@ -50,9 +50,6 @@ inline void matxDirectConv1DInternal(OutputType &o, const InType &i,
                                      const FilterType &filter, matxConvCorrMode_t mode,
                                      cudaStream_t stream)
 {
-  using strip_input_t = typename InType::scalar_type;
-  using strip_filter_t = typename FilterType::scalar_type;
-  using shape_type = typename OutputType::shape_type;
   MATX_STATIC_ASSERT(OutputType::Rank() == InType::Rank(), matxInvalidDim);
   MATX_ASSERT_STR(filter.Size(filter.Rank()-1) < CONV1D_ELEMENTS_PER_BLOCK, matxInvalidSize,
                   "Convolutions are limited to filter lengths < 1024");
@@ -63,6 +60,10 @@
                  matxInvalidSize, "Output size for SAME convolution incorrect");
 
 #ifdef __CUDACC__
+  using strip_input_t = typename InType::scalar_type;
+  using strip_filter_t = typename FilterType::scalar_type;
+  using shape_type = typename OutputType::shape_type;
+
   size_t filter_len = filter.Size(filter.Rank()-1);
   size_t signal_len = i.Size(i.Rank()-1);
 
4 changes: 4 additions & 0 deletions include/matx/transforms/filter.h
@@ -151,6 +151,9 @@ class matxFilter_t {
 
   void Exec(OutType &o, const InType &i, cudaStream_t stream)
   {
+#ifndef __CUDACC__
+    MATX_THROW(matxNotSupported, "convolution not supported on host");
+#else
     if (num_recursive > 0) {
       auto grid =
           dim3(static_cast<int>(
@@ -171,6 +174,7 @@
       // use SAME here or give them an option? IIR doesn't have the same concept
       conv1d(o, i, h_nonr_copy, matxConvCorrMode_t::MATX_C_MODE_SAME, stream);
     }
+#endif
   }
 
 private:
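The guard added here (and in transpose.h below) follows one pattern: kernel launches compile only when __CUDACC__ is defined, and a host-only build throws at runtime instead of failing on the <<<>>> launch syntax. A generic sketch of the pattern; the kernel and the exception are illustrative stand-ins for MatX's MATX_THROW(matxNotSupported, ...):

    #include <stdexcept>

    #ifdef __CUDACC__
    __global__ void scale_kernel(float *data, int n, float s) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) data[i] *= s;
    }
    #endif

    void scale(float *data, int n, float s) {
    #ifndef __CUDACC__
      (void)data; (void)n; (void)s;  // silence unused warnings on the host path
      throw std::runtime_error("scale() requires a CUDA compiler");
    #else
      scale_kernel<<<(n + 255) / 256, 256>>>(data, n, s);
    #endif
    }
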
1 change: 0 additions & 1 deletion include/matx/transforms/inverse.h
@@ -350,7 +350,6 @@ template <typename TensorTypeAInv, typename TensorTypeA, MatInverseAlgo_t ALGO =
 void inv(TensorTypeAInv &a_inv, const TensorTypeA &a,
          cudaStream_t stream = 0)
 {
-  using T1 = typename TensorTypeAInv::scalar_type;
   static_assert(TensorTypeAInv::Rank() == TensorTypeA::Rank(), "Input and output ranks must match");
   // Get parameters required by these tensors
   auto params = detail::matxInversePlan_t<TensorTypeAInv, TensorTypeA, ALGO>::GetInverseParams(a_inv, a);
5 changes: 1 addition & 4 deletions include/matx/transforms/solver.h
@@ -62,7 +62,7 @@ class matxDnSolver_t {
   matxError_t SetAdvancedOptions(cusolverDnFunction_t function,
                                  cusolverAlgMode_t algo)
   {
-    cusolverStatus_t ret = cusolverDnSetAdvOptions(dn_params, function, algo);
+    [[maybe_unused]] cusolverStatus_t ret = cusolverDnSetAdvOptions(dn_params, function, algo);
     MATX_ASSERT(ret == CUSOLVER_STATUS_SUCCESS, matxSolverError);
 
     return matxSuccess;
@@ -1312,9 +1312,6 @@ void svd(UTensor &u, STensor &s,
          cudaStream_t stream = 0, const char jobu = 'A', const char jobvt = 'A')
 {
   using T1 = typename ATensor::scalar_type;
-  using T2 = typename UTensor::scalar_type;
-  using T3 = typename STensor::scalar_type;
-  using T4 = typename VTensor::scalar_type;
 
   /* Temporary WAR
      cuSolver doesn't support row-major layouts. Since we want to make the
6 changes: 4 additions & 2 deletions include/matx/transforms/transpose.h
@@ -58,9 +58,9 @@ namespace matx
    *
    */
   template <typename OutputTensor, typename InputTensor>
-  __MATX_INLINE__ void transpose(OutputTensor &out,
+  __MATX_INLINE__ void transpose([[maybe_unused]] OutputTensor &out,
                                  const InputTensor &in,
-                                 const cudaStream_t stream)
+                                 [[maybe_unused]] const cudaStream_t stream)
   {
     constexpr int RANK = OutputTensor::Rank();
     if constexpr (RANK <= 1)
@@ -93,6 +93,8 @@
                  batch_dims);
       transpose_kernel_oop<<<grid, block, shm, stream>>>(out, in);
     }
+#else
+    MATX_THROW(matxNotSupported, "Transpose not supported on host");
 #endif
   };
 } // end namespace matx