Host unit tests #267

Merged: 3 commits (Sep 8, 2022)
18 changes: 8 additions & 10 deletions CMakeLists.txt
@@ -93,15 +93,11 @@ if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.4)
     message(FATAL_ERROR "MatX requires CUDA 11.4 or higher. Please update before using.")
 endif()
 
-# If we're on CUDA 11.4 or lower we need a newer version of libcudacxx
-if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.5)
-    message(STATUS "Need libcuda++ 1.7.0 or higher (included in CTK 11.5+). Finding...")
-    set(LIBCUDACXX_VERSION "1.7.0" CACHE STRING "Version of libcudacxx to use")
-    include(cmake/FindLibcudacxx.cmake)
-    target_include_directories(matx INTERFACE "$<BUILD_INTERFACE:${LIBCUDACXX_INCLUDE_DIR}>")
-else()
-    message(STATUS "Recent libcuda++ found in CUDA toolkit. Skipping finding...")
-endif()
+# We typically need a newer version of libcudacxx than is available in the toolkit, so pull down a specific version here
+message(STATUS "Need libcuda++. Finding...")
+set(LIBCUDACXX_VERSION "1.8.0" CACHE STRING "Version of libcudacxx to use")
+include(cmake/FindLibcudacxx.cmake)
+target_include_directories(matx INTERFACE "$<BUILD_INTERFACE:${LIBCUDACXX_INCLUDE_DIR}>")
 
 # Set flags for compiling tests faster
 set(MATX_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} --threads 0)
@@ -112,7 +108,6 @@ endif()
 # Set preferred compiler warning flags
 set(WARN_FLAGS -Wall
                -Wextra
-               -Werror all-warnings
                -Wcast-align
                -Wunused
                -Wconversion
@@ -129,6 +124,9 @@ if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
                -Wnull-dereference)
 endif()
 
+set(WARN_FLAGS ${WARN_FLAGS} $<$<COMPILE_LANGUAGE:CUDA>:-Werror all-warnings>)
+set(WARN_FLAGS ${WARN_FLAGS} $<$<COMPILE_LANGUAGE:CXX>:-Werror>)
+
 # CUTLASS slows down compile times when used, so leave it as optional for now
 if (MATX_EN_CUTLASS)
     include(cmake/GetCUTLASS.cmake)
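Why the -Werror split is needed: nvcc's -Werror takes a category argument (here all-warnings), while the gcc/clang form is a bare -Werror, so a single flag string cannot serve both compilers. The $<COMPILE_LANGUAGE:CUDA> and $<COMPILE_LANGUAGE:CXX> generator expressions apply each spelling only during the matching compile.
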
2 changes: 1 addition & 1 deletion cmake/FindLibcudacxx.cmake
@@ -43,7 +43,7 @@ function(find_and_configure_libcudacxx version)
     ${PROJECT_NAME}-exports
     CPM_ARGS
     GIT_REPOSITORY https://github.com/NVIDIA/libcudacxx.git
-    GIT_TAG ${version}-ea
+    GIT_TAG ${version}
     GIT_SHALLOW TRUE
     DOWNLOAD_ONLY TRUE
     OPTIONS "LIBCXX_INCLUDE_BENCHMARKS OFF"
1 change: 1 addition & 0 deletions include/matx.h
@@ -31,6 +31,7 @@
 /////////////////////////////////////////////////////////////////////////////////
 
 #pragma once
+#include <cuda_runtime_api.h>
 #include <cuda/std/ccomplex>
 #include "matx/core/defines.h"
 #include "matx/core/error.h"
2 changes: 1 addition & 1 deletion include/matx/core/tensor.h
@@ -752,7 +752,7 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> {
   std::array<index_t, NRANK> tshape;
   std::move(std::begin(shape), std::end(shape), tshape.begin());
 
-  stride_type prod = std::accumulate(std::begin(shape), std::end(shape), 1, std::multiplies<stride_type>());
+  [[maybe_unused]] stride_type prod = std::accumulate(std::begin(shape), std::end(shape), 1, std::multiplies<stride_type>());
   MATX_ASSERT_STR(
       sizeof(T) * prod <= storage_.Bytes(), matxInvalidSize,
       "Total size of new tensor must not be larger than the original");
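A pattern that recurs throughout this PR: a variable read only inside an assertion macro becomes unused when asserts compile away, and the new -Werror flags turn that into a build failure. A minimal sketch of the same pattern using the standard assert macro (the names here are illustrative, not MatX API):

    #include <cassert>
    #include <cstddef>

    void check_fits(std::size_t n_elems, std::size_t capacity_bytes) {
      // `bytes` is only referenced inside assert(); when NDEBUG is defined the
      // assert expands to nothing and the variable would trip -Wunused/-Werror.
      [[maybe_unused]] std::size_t bytes = n_elems * sizeof(float);
      assert(bytes <= capacity_bytes);
    }
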
6 changes: 5 additions & 1 deletion include/matx/core/utils.h
@@ -48,7 +48,7 @@ __MATX_INLINE__ int GetDeviceAttr(cudaDeviceAttr attr) {
   int val;
   int dev;
   cudaGetDevice(&dev);
-  auto err = cudaDeviceGetAttribute(&val, attr, dev);
+  [[maybe_unused]] auto err = cudaDeviceGetAttribute(&val, attr, dev);
   MATX_ASSERT(err == cudaSuccess, matxCudaError);
   return val;
 }
@@ -115,8 +115,12 @@ __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ auto madd( const T1 &x, const T2 &
   const __half2 &Z = *reinterpret_cast<const __half2*>(&z);
 
 #if 1
+#ifdef __CUDACC__
   auto v = __hcmadd(X,Y,Z);
   return T4(v.x, v.y);
+#else
+  return x*y+z;
+#endif
 #else
   // In theory this could be faster but compiler is not folding broadcast/swap into HFMAs
 
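The host fallback x*y+z computes the same complex multiply-add that __hcmadd performs on a __half2 holding a (real, imag) pair; only the device path can use the intrinsic. A standalone sketch of that operation with std::complex, as an illustrative stand-in for the MatX half-complex types:

    #include <complex>

    // Complex multiply-add x*y + z, the operation __hcmadd applies to
    // (real, imag) pairs packed into a __half2.
    std::complex<float> complex_madd(std::complex<float> x,
                                     std::complex<float> y,
                                     std::complex<float> z) {
      float re = x.real() * y.real() - x.imag() * y.imag() + z.real();
      float im = x.real() * y.imag() + x.imag() * y.real() + z.imag();
      return {re, im};
    }
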
54 changes: 9 additions & 45 deletions include/matx/executors/host.h
@@ -54,54 +54,18 @@ class SingleThreadHostExecutor {
    */
   template <typename Op>
   void Exec(Op &op) const noexcept {
-    if constexpr (op.Rank() == 0) {
+    if constexpr (Op::Rank() == 0) {
       op();
     }
-    else if constexpr (op.Rank() == 1) {
-      index_t size0 = op.Size(0);
-      for (index_t idx = 0; idx < size0; idx++) {
-        op(idx);
-      }
-    }
-    else if constexpr (op.Rank() == 2) {
-      index_t size0 = op.Size(0);
-      index_t size1 = op.Size(1);
-
-      for (index_t idx = 0; idx < size0; idx++) {
-        for (index_t idy = 0; idy < size1; idy++) {
-          op(idx, idy);
-        }
-      }
-    }
-    else if constexpr (op.Rank() == 3) {
-      index_t size0 = op.Size(0);
-      index_t size1 = op.Size(1);
-      index_t size2 = op.Size(2);
-
-      for (index_t idx = 0; idx < size0; idx++) {
-        for (index_t idy = 0; idy < size1; idy++) {
-          for (index_t idz = 0; idz < size2; idz++) {
-            op(idx, idy, idz);
-          }
-        }
-      }
-    }
-    else {
-      index_t size0 = op.Size(0);
-      index_t size1 = op.Size(1);
-      index_t size2 = op.Size(2);
-      index_t size3 = op.Size(3);
-
-      for (index_t idx = 0; idx < size0; idx++) {
-        for (index_t idy = 0; idy < size1; idy++) {
-          for (index_t idz = 0; idz < size2; idz++) {
-            for (index_t idw = 0; idw < size3; idw++) {
-              op(idx, idy, idz, idw);
-            }
-          }
-        }
-      }
-    }
+    else {
+      index_t size = TotalSize(op);
+      for (index_t i = 0; i < size; i++) {
+        auto idx = GetIdxFromAbs(op, i);
+        std::apply([&](auto... args) {
+          return op(args...);
+        }, idx);
+      }
+    }
   }
 };
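This is the core simplification of the PR: instead of one hand-written loop nest per rank (previously capped at rank 4), the executor walks a single flattened index space and converts each absolute index back into per-dimension indices. A self-contained sketch of the idea, assuming row-major sizes; TotalSize and GetIdxFromAbs are the MatX helpers, and everything below is an illustrative stand-in:

    #include <array>
    #include <cstdio>
    #include <tuple>

    using index_t = long long;

    // Stand-in for a rank-2 operator: callable with one index per dimension.
    struct Op2D {
      static constexpr int Rank() { return 2; }
      index_t Size(int d) const { return d == 0 ? 3 : 4; }
      void operator()(index_t i, index_t j) const { std::printf("(%lld,%lld)\n", i, j); }
    };

    // Row-major delinearization: peel off the fastest-varying dimension first.
    template <typename Op>
    auto idx_from_abs(const Op &op, index_t abs) {
      std::array<index_t, Op::Rank()> idx{};
      for (int d = Op::Rank() - 1; d >= 0; d--) {
        idx[d] = abs % op.Size(d);
        abs /= op.Size(d);
      }
      return idx;
    }

    template <typename Op>
    void exec_host(const Op &op) {
      index_t total = 1;
      for (int d = 0; d < Op::Rank(); d++) total *= op.Size(d);  // TotalSize
      for (index_t i = 0; i < total; i++) {
        auto idx = idx_from_abs(op, i);
        // std::array models the tuple protocol, so std::apply expands it into
        // op(i0, i1, ..., i{N-1}) for any rank.
        std::apply([&](auto... args) { op(args...); }, idx);
      }
    }

    int main() { exec_host(Op2D{}); }

The per-element divide/modulo costs slightly more than the dedicated loop nests, but it removes the rank-4 ceiling and roughly forty lines of duplication.
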
3 changes: 2 additions & 1 deletion include/matx/kernels/filter.cuh
@@ -35,10 +35,11 @@ typedef enum {
   STATUS_FLAG_FULL_COMPLETE = 2,
 } STATUS_FLAGS;
 
+#ifdef __CUDACC__
 // Chunk ID assignment used for atomic incrementing between blocks
 static __device__ uint32_t cid_assign[MAX_BATCHES] = {0};
 
-#ifdef __CUDACC__
 
 template <uint32_t num_recursive, uint32_t num_non_recursive, typename OutType,
           typename InType, typename FilterType>
 __global__ __launch_bounds__(BLOCK_SIZE_RECURSIVE, 1) void RecursiveFilter(
1 change: 1 addition & 0 deletions include/matx/operators/binary_operators.h
@@ -32,6 +32,7 @@
 
 #pragma once
 
+#include <cuda/std/array>
 
 #include "matx/core/type_utils.h"
 #include "matx/operators/base_operator.h"
4 changes: 2 additions & 2 deletions include/matx/operators/collapse.h
@@ -203,7 +203,7 @@ namespace matx
       auto ind = in[Rank() - 1];
 #pragma unroll
       for(int i = 0; i <= DIM; i++) {
-        index_t d = T1::Rank() - 1 - i;
+        int d = T1::Rank() - 1 - i;
         out[d] = ind % op_.Size(d);
         ind /= op_.Size(d);
       }
@@ -228,7 +228,7 @@
       auto ind = in[Rank() - 1];
 #pragma unroll
       for(int i = 0; i <= DIM; i++) {
-        index_t d = T1::Rank() - 1 - i;
+        int d = T1::Rank() - 1 - i;
         out[d] = ind % op_.Size(d);
         ind /= op_.Size(d);
       }
2 changes: 1 addition & 1 deletion include/matx/operators/concat.h
@@ -71,7 +71,7 @@ namespace matx
 
   // Base case. Cannot be reached
   template <size_t I = 0, typename... Is, std::enable_if_t<I == sizeof...(Ts), bool> = true>
-  __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ auto GetVal(cuda::std::tuple<Is...> tup) const {
+  __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ auto GetVal([[maybe_unused]] cuda::std::tuple<Is...> tup) const {
     return static_cast<first_value_type>(0);
   }
 
2 changes: 1 addition & 1 deletion include/matx/operators/permute.h
@@ -67,7 +67,7 @@ namespace matx
     __MATX_INLINE__ PermuteOp(T op, const int32_t (&dims)[Rank()]) : op_(op) {
 
       for(int32_t i = 0; i < Rank(); i++) {
-        int32_t dim = dims[i];
+        [[maybe_unused]] int32_t dim = dims[i];
         MATX_ASSERT_STR(dim < Rank() && dim >= 0, matxInvalidDim, "PermuteOp: Invalid permute index.");
 
         dims_[i] = dims[i];
5 changes: 3 additions & 2 deletions include/matx/operators/scalar_ops.h
@@ -33,6 +33,7 @@
 #pragma once
 
 #include <type_traits>
+#include <cuda/std/array>
 
 namespace matx {
 namespace detail {
@@ -484,15 +485,15 @@ template <typename T1, typename T2> using PowOp = BinOp<T1, T2, PowF<T1, T2>>;
 template <typename T1, typename T2> struct MaxF {
   static __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ auto op(T1 v1, T2 v2)
   {
-    return max(v1, v2);
+    return cuda::std::max(v1, v2);
   }
 };
 template <typename T1, typename T2> using MaxOp = BinOp<T1, T2, MaxF<T1, T2>>;
 
 template <typename T1, typename T2> struct MinF {
   static __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ auto op(T1 v1, T2 v2)
   {
-    return min(v1, v2);
+    return cuda::std::min(v1, v2);
   }
 };
 template <typename T1, typename T2> using MinOp = BinOp<T1, T2, MinF<T1, T2>>;
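Unqualified max and min previously relied on overloads that are only guaranteed to exist in device compilation, so these operators broke when the headers were built host-only; fully qualifying the call selects an implementation visible in both passes (cuda::std::min and cuda::std::max come from libcu++). A host-only sketch of the same fix using the std equivalents:

    #include <algorithm>
    #include <type_traits>

    // Mirrors MaxF::op: a qualified call removes the dependency on a
    // device-only global max(); common_type handles mixed T1/T2 operands.
    template <typename T1, typename T2>
    auto max_op(T1 v1, T2 v2) {
      using C = std::common_type_t<T1, T2>;
      return std::max<C>(v1, v2);
    }
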
2 changes: 1 addition & 1 deletion include/matx/operators/set.h
@@ -87,7 +87,7 @@ class set : public BaseOp<set<T, Op>> {
                    matxInvalidDim);
     if constexpr (Rank() > 0) {
       for (int i = 0; i < Rank(); i++) {
-        index_t size = detail::get_expanded_size<Rank()>(op_, i);
+        [[maybe_unused]] index_t size = detail::get_expanded_size<Rank()>(op_, i);
         MATX_ASSERT_STR(
             size == 0 || size == Size(i), matxInvalidSize,
             "Size mismatch in source operator to destination tensor view");
4 changes: 2 additions & 2 deletions include/matx/operators/shift.h
@@ -71,12 +71,12 @@ namespace matx
       __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ auto operator()(Is... indices) const
       {
         auto tup = cuda::std::make_tuple(indices...);
-        auto shift = -get_value(shift_, indices...);
+        index_t shift = -get_value(shift_, indices...);
 
 
         shift = (shift + cuda::std::get<DIM>(tup)) % Size(DIM);
 
-        if(shift<0) shift+= Size(DIM);
+        if(shift<0) shift += Size(DIM);
 
         cuda::std::get<DIM>(tup) = shift;
 
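Pinning shift to index_t matters because the lines that follow depend on signed arithmetic: the C++ % operator keeps the sign of the dividend, so a negated shift can produce a negative remainder that must be folded back into [0, Size(DIM)). A small sketch of that wraparound, assuming index_t is a signed 64-bit type as in MatX:

    using index_t = long long;

    // Map a (possibly negative) shift plus position back into [0, size).
    // Example: shift = -1, pos = 0, size = 8 gives (-1 + 0) % 8 == -1,
    // which folds to 7.
    index_t wrap_shift(index_t shift, index_t pos, index_t size) {
      index_t s = (shift + pos) % size;
      if (s < 0) s += size;
      return s;
    }
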
1 change: 0 additions & 1 deletion include/matx/transforms/ambgfun.h
@@ -165,7 +165,6 @@ void InternalAmbgFun(AMFTensor &amf, XTensor &x,
 {
   constexpr int RANK = XTensor::Rank();
   using T1 = typename XTensor::scalar_type;
-  using T2 = typename AMFTensor::scalar_type;
 
   MATX_STATIC_ASSERT(is_cuda_complex_v<T1>, matxInvalidType);
   auto ry = x.View();
7 changes: 4 additions & 3 deletions include/matx/transforms/conv.h
@@ -50,9 +50,6 @@ inline void matxDirectConv1DInternal(OutputType &o, const InType &i,
                                      const FilterType &filter, matxConvCorrMode_t mode,
                                      cudaStream_t stream)
 {
-  using strip_input_t = typename InType::scalar_type;
-  using strip_filter_t = typename FilterType::scalar_type;
-  using shape_type = typename OutputType::shape_type;
   MATX_STATIC_ASSERT(OutputType::Rank() == InType::Rank(), matxInvalidDim);
   MATX_ASSERT_STR(filter.Size(filter.Rank()-1) < CONV1D_ELEMENTS_PER_BLOCK, matxInvalidSize,
                   "Convolutions are limited to filter lengths < 1024");
@@ -63,6 +60,10 @@
                  matxInvalidSize, "Output size for SAME convolution incorrect");
 
 #ifdef __CUDACC__
+  using strip_input_t = typename InType::scalar_type;
+  using strip_filter_t = typename FilterType::scalar_type;
+  using shape_type = typename OutputType::shape_type;
+
   size_t filter_len = filter.Size(filter.Rank()-1);
   size_t signal_len = i.Size(i.Rank()-1);
 
4 changes: 4 additions & 0 deletions include/matx/transforms/filter.h
@@ -151,6 +151,9 @@ class matxFilter_t {
 
   void Exec(OutType &o, const InType &i, cudaStream_t stream)
   {
+#ifndef __CUDACC__
+    MATX_THROW(matxNotSupported, "convolution not supported on host");
+#else
     if (num_recursive > 0) {
       auto grid =
           dim3(static_cast<int>(
@@ -171,6 +174,7 @@
       // use SAME here or give them an option? IIR doesn't have the same concept
       conv1d(o, i, h_nonr_copy, matxConvCorrMode_t::MATX_C_MODE_SAME, stream);
     }
+#endif
   }
 
 private:
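The guard added here (and in transpose.h below) follows one pattern: kernel launches compile only when __CUDACC__ is defined, and a host-only build throws at runtime instead of failing on the <<<>>> launch syntax. A generic sketch of the pattern; the kernel and the exception are illustrative stand-ins for MatX's MATX_THROW(matxNotSupported, ...):

    #include <stdexcept>

    #ifdef __CUDACC__
    __global__ void scale_kernel(float *data, int n, float s) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) data[i] *= s;
    }
    #endif

    void scale(float *data, int n, float s) {
    #ifndef __CUDACC__
      (void)data; (void)n; (void)s;  // silence unused warnings on the host path
      throw std::runtime_error("scale() requires a CUDA compiler");
    #else
      scale_kernel<<<(n + 255) / 256, 256>>>(data, n, s);
    #endif
    }
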
1 change: 0 additions & 1 deletion include/matx/transforms/inverse.h
@@ -350,7 +350,6 @@ template <typename TensorTypeAInv, typename TensorTypeA, MatInverseAlgo_t ALGO =
 void inv(TensorTypeAInv &a_inv, const TensorTypeA &a,
          cudaStream_t stream = 0)
 {
-  using T1 = typename TensorTypeAInv::scalar_type;
   static_assert(TensorTypeAInv::Rank() == TensorTypeA::Rank(), "Input and output ranks must match");
   // Get parameters required by these tensors
   auto params = detail::matxInversePlan_t<TensorTypeAInv, TensorTypeA, ALGO>::GetInverseParams(a_inv, a);
5 changes: 1 addition & 4 deletions include/matx/transforms/solver.h
@@ -62,7 +62,7 @@ class matxDnSolver_t {
   matxError_t SetAdvancedOptions(cusolverDnFunction_t function,
                                  cusolverAlgMode_t algo)
   {
-    cusolverStatus_t ret = cusolverDnSetAdvOptions(dn_params, function, algo);
+    [[maybe_unused]] cusolverStatus_t ret = cusolverDnSetAdvOptions(dn_params, function, algo);
     MATX_ASSERT(ret == CUSOLVER_STATUS_SUCCESS, matxSolverError);
 
     return matxSuccess;
@@ -1312,9 +1312,6 @@ void svd(UTensor &u, STensor &s,
          cudaStream_t stream = 0, const char jobu = 'A', const char jobvt = 'A')
 {
   using T1 = typename ATensor::scalar_type;
-  using T2 = typename UTensor::scalar_type;
-  using T3 = typename STensor::scalar_type;
-  using T4 = typename VTensor::scalar_type;
 
   /* Temporary WAR
      cuSolver doesn't support row-major layouts. Since we want to make the
6 changes: 4 additions & 2 deletions include/matx/transforms/transpose.h
@@ -58,9 +58,9 @@ namespace matx
    *
    */
   template <typename OutputTensor, typename InputTensor>
-  __MATX_INLINE__ void transpose(OutputTensor &out,
+  __MATX_INLINE__ void transpose([[maybe_unused]] OutputTensor &out,
                                  const InputTensor &in,
-                                 const cudaStream_t stream)
+                                 [[maybe_unused]] const cudaStream_t stream)
   {
     constexpr int RANK = OutputTensor::Rank();
     if constexpr (RANK <= 1)
@@ -93,6 +93,8 @@
                  batch_dims);
       transpose_kernel_oop<<<grid, block, shm, stream>>>(out, in);
     }
+#else
+    MATX_THROW(matxNotSupported, "Transpose not supported on host");
 #endif
   };
 } // end namespace matx