[QST]cutlass gemm error #1151

Closed
zwshan opened this issue Oct 18, 2023 · 2 comments
zwshan commented Oct 18, 2023

What is your question?
I can run the following code without any issues.

#include <cutlass/gemm/device/gemm.h>
#include <cuda_runtime.h>
#include <cstdlib>
#include <iostream>


cudaError_t cutlass_dgemm_nn(
  cudaStream_t *stream_cut,
  int M, int N, int K,
  double alpha,
  double *A, int lda,
  double *B, int ldb,
  double beta, 
  double *C, int ldc,
  double *D, int ldd
  ) {
  using ElementOutput = double;
  using ElementAccumulator = double;
  using ColumnMajor = cutlass::layout::ColumnMajor;
  using RowMajor = cutlass::layout::RowMajor;
  using CutlassGemm = cutlass::gemm::device::Gemm<
    double,                                // Data-type of A matrix
    RowMajor,                              // Layout of A matrix
    double,                                // Data-type of B matrix
    RowMajor,                              // Layout of B matrix
    ElementOutput,                         // Data-type of C matrix
    RowMajor,                              // Layout of C matrix
    ElementAccumulator,                    // Data-type of the accumulator
    cutlass::arch::OpClassTensorOp,        // Tag indicating Tensor Cores
    cutlass::arch::Sm80,                   // Tag indicating target GPU architecture
    cutlass::gemm::GemmShape<64, 64, 16>,  // Threadblock tile shape
    cutlass::gemm::GemmShape<32, 32, 16>,  // Warp tile shape
    cutlass::gemm::GemmShape<8, 8, 4>      // Instruction shape (FP64 tensor-op MMA)
  >;
  // Construct the CUTLASS GEMM operator
  CutlassGemm gemm_operator;
 
  // CutlassGemm::Arguments args({M, N, K},   // GEMM problem dimensions
  //                             {A, lda},    // Tensor-ref for source matrix A
  //                             {B, ldb},    // Tensor-ref for source matrix B
  //                             {C, ldc},    // Tensor-ref for source matrix C
  //                             {D, ldd},    // Tensor-ref for destination matrix D (may be different memory than source C)
  //                             {alpha, beta}); // Scalars used in the epilogue

  CutlassGemm::Arguments args({M, N, K},   // GEMM problem dimensions
                              {A, lda},    // Tensor-ref for source matrix A
                              {B, ldb},    // Tensor-ref for source matrix B
                              {C, ldc},    // Tensor-ref for source matrix C
                              {C, ldc},    // Tensor-ref for destination matrix D (here the same memory as C)
                              {alpha, beta}); // Scalars used in the epilogue


  // Launch the GEMM kernel on the caller-provided stream
  cutlass::Status status = gemm_operator(args, /*workspace=*/nullptr, *stream_cut);
  //
  // Return a cudaError_t if the CUTLASS GEMM operator returned an error code.
  //
  if (status != cutlass::Status::kSuccess) {
    return cudaErrorUnknown;
  }
  // Copy the result back to the host and print the first and last elements.
  cudaStreamSynchronize(*stream_cut);  // wait for the GEMM on the user-provided stream
  size_t result_mem_size = sizeof(double) * M * N;  // memory size of matrix C = M * N * sizeof(double)
  double *result = (double*)malloc(result_mem_size);  // allocate host memory for the result matrix
  cudaMemcpy(result, C, result_mem_size, cudaMemcpyDeviceToHost);
  std::cout << result[0] << std::endl;          // first element of the result
  std::cout << result[M * N - 1] << std::endl;  // last element of the result
  free(result);

  // Return success, since no errors were encountered.
  return cudaSuccess;
}

void generate_tensor_2D(double *ptr, int i_M, int i_N){   // fill an i_M x i_N row-major matrix (all ones here)
    for(int i = 0; i < i_M; i++){
        for(int j = 0; j < i_N; j++){
            *(ptr + i*i_N + j ) = 1.0;
        }
    }
}

int main(int argc, const char *arg[]) {
    int M = 3840;           //M
    int N = 4096;           //N
    int K = 4096;           //K
 
    int lda = K;
    int ldb = K;
    int ldc = N;
    int ldd = N;
 
    double alpha = 1.0;      //alpha
    double beta = 1.0;       //beta
 
    double *A;               
    double *B;               
    double *C;               
    double *D;               
 
    size_t A_mem_size = sizeof(double) * M * K; //memory size of matrix A = M * K * sizeof(double)
    size_t B_mem_size = sizeof(double) * K * N; //memory size of matrix B = K * N * sizeof(double)
    size_t C_mem_size = sizeof(double) * M * N; //memory size of matrix C = M * N * sizeof(double)
    size_t D_mem_size = sizeof(double) * M * N; //memory size of matrix D = M * N * sizeof(double)
    
 
    A = (double*)malloc(A_mem_size);  
    B = (double*)malloc(B_mem_size);  
    C = (double*)malloc(C_mem_size);  
    D = (double*)malloc(D_mem_size);  
 
    generate_tensor_2D(A, M, K);     
    generate_tensor_2D(B, K, N);     
    generate_tensor_2D(C, M, N);    
 
    double *d_A;           
    double *d_B;            
    double *d_C;            
    double *d_D;            
 
    cudaMalloc((void**)&d_A, A_mem_size);  
    cudaMalloc((void**)&d_B, B_mem_size);  
    cudaMalloc((void**)&d_C, C_mem_size);  
    cudaMalloc((void**)&d_D, D_mem_size);  

    cudaStream_t stream[2];
    for (int i = 0; i < 2; ++i)
      cudaStreamCreate(&stream[i]);
    
    cudaMemcpy(d_A, A, A_mem_size, cudaMemcpyHostToDevice); 
    cudaMemcpy(d_B, B, B_mem_size, cudaMemcpyHostToDevice); 
    cudaMemcpy(d_C, C, C_mem_size, cudaMemcpyHostToDevice); 

    cutlass_dgemm_nn(&stream[1],
                      M, N, K,
                      alpha,
                      d_A, lda,
                      d_B, ldb,
                      beta, 
                      d_C, ldc,
                      d_D, ldd
                      );

    // Release streams and memory before exiting.
    for (int i = 0; i < 2; ++i)
      cudaStreamDestroy(stream[i]);
    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C); cudaFree(d_D);
    free(A); free(B); free(C); free(D);

    return 0;
}

When I compile the above code into a library, lib.a, it compiles successfully. However, when I run a program that calls lib.a, I encounter the following error (hundreds of identical messages):

void cutlass::arch::Mma<cutlass::gemm::GemmShape<8, 8, 4>, 32, double, cutlass::layout::RowMajor, double, cutlass::layout::ColumnMajor, double, cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAdd>::operator()(cutlass::Array<double, 2, true> &, const cutlass::Array<double, 1, true> &, const cutlass::Array<double, 1, true> &, const cutlass::Array<double, 2, true> &) const not implemented
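
This "not implemented" message appears to be printed at run time by the unspecialized cutlass::arch::Mma base template, which is what gets instantiated when the device code is not compiled for an architecture that provides the SM80 double-precision tensor-op MMA (compute capability 8.0). As a minimal sketch (not part of the original code), a guard like the following in the translation unit would turn the mismatch into a build-time error instead of a run-time fallback:

#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800)
#error "cutlass::arch::Sm80 FP64 tensor-op kernels require compiling for compute capability 8.0 or newer"
#endif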
hwu36 (Collaborator) commented Oct 18, 2023

Maybe you can take a look at how cutlass_library is built: https://github.com/NVIDIA/cutlass/blob/main/media/docs/profiler.md. Turn on KEEP in the CMake configuration to check the full command line.
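
As a hedged sketch of what to look for (the file names and paths below are placeholders, not taken from this issue): both lib.a and the program linking against it need device code generated for SM80, e.g.

    nvcc -std=c++17 -gencode arch=compute_80,code=sm_80 -I/path/to/cutlass/include -c cutlass_dgemm.cu -o cutlass_dgemm.o

If either side is built without an -arch/-gencode flag targeting compute capability 8.0, nvcc falls back to its default architecture, the Sm80 tensor-op specializations are never selected, and the "not implemented" fallback above is what runs.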

zwshan (Author) commented Oct 19, 2023

I have solved my problem, thank you.

zwshan closed this as completed Oct 19, 2023