You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
What is your question?
I can run the following code without any issues.
#include <cuda_runtime.h>

#include <cstdlib>
#include <iostream>

#include <cutlass/gemm/device/gemm.h>
// Computes D = alpha * A * B + beta * C in double precision with CUTLASS on
// the caller-provided stream, then copies D back to the host and prints its
// first and last elements as a sanity check.
//
// All matrix pointers (A, B, C, D) must be DEVICE pointers. Matrices are
// row-major: A is MxK (leading dim lda), B is KxN (ldb), C and D are MxN
// (ldc / ldd). Requires an SM80 (Ampere) GPU: the kernel uses FP64 tensor
// cores (DMMA).
//
// Returns cudaSuccess on success, cudaErrorUnknown if the CUTLASS GEMM
// reports a failure, or the cudaError_t of a failing runtime call.
cudaError_t cutlass_dgemm_nn(
    cudaStream_t *stream_cut,
    int M, int N, int K,
    double alpha,
    double *A, int lda,
    double *B, int ldb,
    double beta,
    double *C, int ldc,
    double *D, int ldd
) {
  using ElementOutput      = double;
  using ElementAccumulator = double;
  using RowMajor           = cutlass::layout::RowMajor;

  // FP64 GEMM targeting Ampere tensor cores. GemmShape<8, 8, 4> is the DMMA
  // instruction shape; the threadblock and warp tiles are sized for it.
  using CutlassGemm = cutlass::gemm::device::Gemm<
      double,                                // element type of A
      RowMajor,                              // layout of A
      double,                                // element type of B
      RowMajor,                              // layout of B
      ElementOutput,                         // element type of C / D
      RowMajor,                              // layout of C / D
      ElementAccumulator,                    // accumulator type
      cutlass::arch::OpClassTensorOp,        // use tensor cores
      cutlass::arch::Sm80,                   // target compute architecture
      cutlass::gemm::GemmShape<64, 64, 16>,  // threadblock tile
      cutlass::gemm::GemmShape<32, 32, 16>,  // warp tile
      cutlass::gemm::GemmShape<8, 8, 4>      // instruction shape (DMMA)
  >;

  CutlassGemm gemm_operator;

  CutlassGemm::Arguments args(
      {M, N, K},       // GEMM problem dimensions
      {A, lda},        // tensor-ref for source matrix A
      {B, ldb},        // tensor-ref for source matrix B
      {C, ldc},        // tensor-ref for source matrix C
      {D, ldd},        // tensor-ref for destination matrix D (may differ from C)
      {alpha, beta});  // scalars used in the epilogue

  // Launch on the caller's stream. The original code invoked
  // gemm_operator(*stream_cut) BEFORE constructing args, which runs the GEMM
  // with uninitialized arguments; the stream belongs in this call instead.
  // No workspace is needed for this configuration, hence nullptr.
  cutlass::Status status = gemm_operator(args, nullptr, *stream_cut);
  if (status != cutlass::Status::kSuccess) {
    return cudaErrorUnknown;
  }

  // The GEMM ran on a non-default stream; synchronize before reading D back.
  cudaError_t err = cudaStreamSynchronize(*stream_cut);
  if (err != cudaSuccess) {
    return err;
  }

  // Copy the destination matrix back to the host and print a quick check
  // (first and last elements).
  size_t result_mem_size = sizeof(double) * M * N;
  double *result = (double *)malloc(result_mem_size);
  if (result == nullptr) {
    return cudaErrorMemoryAllocation;
  }
  err = cudaMemcpy(result, D, result_mem_size, cudaMemcpyDeviceToHost);
  if (err != cudaSuccess) {
    free(result);
    return err;
  }
  std::cout << result[0] << std::endl;
  std::cout << result[M * N - 1] << std::endl;
  free(result);  // the original leaked this buffer
  return cudaSuccess;
}
// Fills a row-major i_M x i_N host matrix with the constant 1.0.
// ptr must point to at least i_M * i_N doubles.
void generate_tensor_2D(double *ptr, int i_M, int i_N) {
  for (int i = 0; i < i_M; i++) {
    for (int j = 0; j < i_N; j++) {
      ptr[i * i_N + j] = 1.0;
    }
  }
}
// Driver: builds M x K, K x N, and M x N host matrices of ones, uploads the
// sources to the device, and runs the CUTLASS DGEMM D = alpha*A*B + beta*C
// on a non-default stream. Releases all resources before exiting (the
// original leaked every host buffer, device buffer, and stream).
int main(int argc, const char *arg[]) {
  // Problem size.
  int M = 3840;
  int N = 4096;
  int K = 4096;

  // Row-major leading dimensions: A is MxK, B is KxN, C/D are MxN.
  // NOTE(review): the original set ldb = K, which is only correct here
  // because K == N; a row-major KxN matrix has leading dimension N.
  int lda = K;
  int ldb = N;
  int ldc = N;
  int ldd = N;

  double alpha = 1.0;
  double beta  = 1.0;

  size_t A_mem_size = sizeof(double) * M * K;  // bytes in A
  size_t B_mem_size = sizeof(double) * K * N;  // bytes in B
  size_t C_mem_size = sizeof(double) * M * N;  // bytes in C
  size_t D_mem_size = sizeof(double) * M * N;  // bytes in D

  // Host buffers. D is allocated for symmetry but not filled: the device-side
  // result is read back inside cutlass_dgemm_nn.
  double *A = (double *)malloc(A_mem_size);
  double *B = (double *)malloc(B_mem_size);
  double *C = (double *)malloc(C_mem_size);
  double *D = (double *)malloc(D_mem_size);

  // Fill the source matrices with ones.
  generate_tensor_2D(A, M, K);
  generate_tensor_2D(B, K, N);
  generate_tensor_2D(C, M, N);

  // Device buffers.
  double *d_A;
  double *d_B;
  double *d_C;
  double *d_D;
  cudaMalloc((void **)&d_A, A_mem_size);
  cudaMalloc((void **)&d_B, B_mem_size);
  cudaMalloc((void **)&d_C, C_mem_size);
  cudaMalloc((void **)&d_D, D_mem_size);

  cudaStream_t stream[2];
  for (int i = 0; i < 2; ++i) {
    cudaStreamCreate(&stream[i]);
  }

  cudaMemcpy(d_A, A, A_mem_size, cudaMemcpyHostToDevice);
  cudaMemcpy(d_B, B, B_mem_size, cudaMemcpyHostToDevice);
  cudaMemcpy(d_C, C, C_mem_size, cudaMemcpyHostToDevice);

  cutlass_dgemm_nn(&stream[1],
                   M, N, K,
                   alpha,
                   d_A, lda,
                   d_B, ldb,
                   beta,
                   d_C, ldc,
                   d_D, ldd);

  // Cleanup: streams, device memory, host memory.
  for (int i = 0; i < 2; ++i) {
    cudaStreamDestroy(stream[i]);
  }
  cudaFree(d_A);
  cudaFree(d_B);
  cudaFree(d_C);
  cudaFree(d_D);
  free(A);
  free(B);
  free(C);
  free(D);
  return 0;
}
When I compile the above code into a static library 'lib.a', the build succeeds. However, when I run a program that links against 'lib.a', I encounter the following errors (there are hundreds of them).
What is your question?
I can run the following code without any issues.
When I compile the above code into a static library 'lib.a', the build succeeds. However, when I run a program that links against 'lib.a', I encounter the following errors (there are hundreds of them).
The text was updated successfully, but these errors were encountered: