fixed a sync problem #15

Merged · 5 commits · Sep 28, 2023
26 changes: 0 additions & 26 deletions .travis.yml

This file was deleted.

13 changes: 0 additions & 13 deletions Artifacts.toml

This file was deleted.

1 change: 1 addition & 0 deletions Project.toml
@@ -6,6 +6,7 @@ version = "1.0.0-DEV"
 [deps]
 ArtifactUtils = "8b73e784-e7d8-4ea5-973d-377fed4e3bce"
 Artifacts = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
Member commented on this line:
Please remove from the Project.toml all dependencies that are not directly used in src.
If you want a test environment, add a Project.toml file to the test folder; see this example: https://github.com/TensorBFS/TensorInference.jl/tree/main/test
TestEnv.jl can help you start a test environment for debugging easily.

Packages like BenchmarkTools should not be included in the package's local environment.

 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
 LLVMLoopInfo = "8b046642-f1f6-4319-8d3c-209ddc03c586"
136 changes: 88 additions & 48 deletions deps/tropicalgemm_kernels.cu
@@ -105,7 +105,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TT)(
     TYPE beta,
     int M,
     int N,
-    int K
+    int K,
+    int DIM_GRID_X,
+    int DIM_GRID_Y
     ) {
 
     // size of thread block
@@ -115,6 +117,8 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TT)(
 
     // thread id
     const int tid = threadIdx.y * bszx + threadIdx.x;
+    int BLOCK_IDX = blockIdx.x % DIM_GRID_X;
+    int BLOCK_IDY = blockIdx.x / DIM_GRID_X;
 
     // shared memory
 
@@ -153,9 +157,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TT)(
 
         #pragma unroll
         for ( int i = 0 ; i < BLOCK_SIZE_M ; i += A_TILE_ROW_STRIDE) {
-            const int row = BLOCK_SIZE_M * blockIdx.y + i + A_TILE_ROW ;
+            const int row = BLOCK_SIZE_M * BLOCK_IDY + i + A_TILE_ROW ;
             const int col = A_TILE_COL + tile_idx;
-            if (tile_idx > K - BLOCK_SIZE_K || blockIdx.y == gridDim.y - 1) {
+            if (tile_idx > K - BLOCK_SIZE_K || BLOCK_IDY == DIM_GRID_Y - 1) {
                 As[OFFSET_row(i + A_TILE_ROW, A_TILE_COL, BLOCK_SIZE_K)] = row < M && col < K ? A[OFFSET_row(
                     row, // row
                     col, // col
@@ -172,8 +176,8 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TT)(
         #pragma unroll
         for ( int i = 0 ; i < BLOCK_SIZE_K; i += B_TILE_ROW_STRIDE) {
             const int row = tile_idx + i + B_TILE_ROW;
-            const int col = B_TILE_COL + BLOCK_SIZE_N * blockIdx.x;
-            if (blockIdx.x == gridDim.x -1 || tile_idx > K - BLOCK_SIZE_K) {
+            const int col = B_TILE_COL + BLOCK_SIZE_N * BLOCK_IDX;
+            if (BLOCK_IDX == DIM_GRID_X -1 || tile_idx > K - BLOCK_SIZE_K) {
                 Bs[OFFSET_row(i + B_TILE_ROW, B_TILE_COL, BLOCK_SIZE_N)] = row < K && col < N ? B[OFFSET_row(
                     row, // row
                     col, // col
@@ -219,9 +223,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TT)(
     for (int thread_y = 0; thread_y < THREAD_SIZE_M; ++thread_y) {
         #pragma unroll
         for (int thread_x = 0; thread_x < THREAD_SIZE_N; ++thread_x) {
-            const int row = BLOCK_SIZE_M * blockIdx.y + THREAD_SIZE_M * threadIdx.y + thread_y;
-            const int col = BLOCK_SIZE_N * blockIdx.x + THREAD_SIZE_N * threadIdx.x + thread_x;
-            if (blockIdx.x == gridDim.x -1 || blockIdx.y == gridDim.y - 1) {
+            const int row = BLOCK_SIZE_M * BLOCK_IDY + THREAD_SIZE_M * threadIdx.y + thread_y;
+            const int col = BLOCK_SIZE_N * BLOCK_IDX + THREAD_SIZE_N * threadIdx.x + thread_x;
+            if (BLOCK_IDX == DIM_GRID_X -1 || BLOCK_IDY == DIM_GRID_Y - 1) {
                 if (row < M && col < N) {
                     C[OFFSET_col(row, col, M)] = OPERATOR_ADD(
                         OPERATOR_MUL(C[OFFSET_col(row, col, M)], beta),
@@ -236,6 +240,7 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TT)(
             }
         }
     }
+    __syncthreads();
 }
 
 template <
@@ -253,7 +258,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TN)(
     TYPE beta,
     int M,
     int N,
-    int K
+    int K,
+    int DIM_GRID_X,
+    int DIM_GRID_Y
     ) {
 
     // size of thread block
@@ -265,6 +272,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TN)(
     const int tid_A = threadIdx.y * bszx + threadIdx.x;
     const int tid_B = threadIdx.y + threadIdx.x * bszy;
 
+    int BLOCK_IDX = blockIdx.x % DIM_GRID_X;
+    int BLOCK_IDY = blockIdx.x / DIM_GRID_X;
+
     // shared memory
 
     __shared__ TYPE As[BLOCK_SIZE_M * BLOCK_SIZE_K]; // avoid bank conflict
@@ -301,9 +311,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TN)(
 
         #pragma unroll
         for ( int i = 0 ; i < BLOCK_SIZE_M ; i += A_TILE_ROW_STRIDE) {
-            const int row = BLOCK_SIZE_M * blockIdx.y + i + A_TILE_ROW ;
+            const int row = BLOCK_SIZE_M * BLOCK_IDY + i + A_TILE_ROW ;
             const int col = A_TILE_COL + tile_idx;
-            if (tile_idx > K - BLOCK_SIZE_K || blockIdx.y == gridDim.y - 1) {
+            if (tile_idx > K - BLOCK_SIZE_K || BLOCK_IDY == DIM_GRID_Y - 1) {
                 As[OFFSET_row(i + A_TILE_ROW, A_TILE_COL, BLOCK_SIZE_K)] = row < M && col < K ? A[OFFSET_row(
                     row, // row
                     col, // col
@@ -320,8 +330,8 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TN)(
         #pragma unroll
         for ( int i = 0 ; i < BLOCK_SIZE_N; i += B_TILE_COL_STRIDE) {
             const int row = tile_idx + B_TILE_ROW;
-            const int col = B_TILE_COL + i + BLOCK_SIZE_N * blockIdx.x;
-            if (blockIdx.x == gridDim.x -1 || tile_idx > K - BLOCK_SIZE_K) {
+            const int col = B_TILE_COL + i + BLOCK_SIZE_N * BLOCK_IDX;
+            if (BLOCK_IDX == DIM_GRID_X -1 || tile_idx > K - BLOCK_SIZE_K) {
                 Bs[OFFSET_row(B_TILE_ROW, i + B_TILE_COL, BLOCK_SIZE_N)] = row < K && col < N ? B[OFFSET_col(row, col, K)] : PADDING;
             } else {
                 Bs[OFFSET_row(B_TILE_ROW, i + B_TILE_COL, BLOCK_SIZE_N)] = B[OFFSET_col(row, col, K)];
@@ -361,9 +371,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TN)(
     for (int thread_y = 0; thread_y < THREAD_SIZE_M; ++thread_y) {
         #pragma unroll
         for (int thread_x = 0; thread_x < THREAD_SIZE_N; ++thread_x) {
-            const int row = BLOCK_SIZE_M * blockIdx.y + THREAD_SIZE_M * threadIdx.y + thread_y;
-            const int col = BLOCK_SIZE_N * blockIdx.x + THREAD_SIZE_N * threadIdx.x + thread_x;
-            if (blockIdx.x == gridDim.x -1 || blockIdx.y == gridDim.y - 1) {
+            const int row = BLOCK_SIZE_M * BLOCK_IDY + THREAD_SIZE_M * threadIdx.y + thread_y;
+            const int col = BLOCK_SIZE_N * BLOCK_IDX + THREAD_SIZE_N * threadIdx.x + thread_x;
+            if (BLOCK_IDX == DIM_GRID_X -1 || BLOCK_IDY == DIM_GRID_Y - 1) {
                 if (row < M && col < N) {
                     C[OFFSET_col(row, col, M)] = OPERATOR_ADD(
                         OPERATOR_MUL(C[OFFSET_col(row, col, M)], beta),
@@ -378,6 +388,7 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TN)(
             }
         }
     }
+    __syncthreads();
 }
 
 template <
@@ -395,7 +406,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NT)(
     TYPE beta,
     int M,
     int N,
-    int K
+    int K,
+    int DIM_GRID_X,
+    int DIM_GRID_Y
     ) {
 
     // size of thread block
@@ -407,6 +420,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NT)(
     const int BLOCK_SIZE_KN = BLOCK_SIZE_K * BLOCK_SIZE_N;
     const int THREAD_SIZE_MN = THREAD_SIZE_M * THREAD_SIZE_N;
 
+    int BLOCK_IDX = blockIdx.x % DIM_GRID_X;
+    int BLOCK_IDY = blockIdx.x / DIM_GRID_X;
+
    // thread id
     const int tid = threadIdx.y * bszm + threadIdx.x;
 
@@ -446,10 +462,10 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NT)(
         // load A from global memory to shared memory
         #pragma unroll
         for ( int i = 0 ; i < BLOCK_SIZE_K ; i += A_TILE_COL_STRIDE) {
-            const int row = BLOCK_SIZE_M * blockIdx.x + A_TILE_ROW ;
+            const int row = BLOCK_SIZE_M * BLOCK_IDX + A_TILE_ROW ;
             const int col = A_TILE_COL + i + tile_idx;
 
-            if (blockIdx.x == gridDim.x -1 || tile_idx >= K - BLOCK_SIZE_K) {
+            if (BLOCK_IDX == DIM_GRID_X -1 || tile_idx >= K - BLOCK_SIZE_K) {
                 As[OFFSET_col(A_TILE_ROW, i + A_TILE_COL, BLOCK_SIZE_M)] = row < M && col < K ? A[OFFSET_col(row, col, M)] : PADDING;
             } else {
                 As[OFFSET_col(A_TILE_ROW, i + A_TILE_COL, BLOCK_SIZE_M)] = A[OFFSET_col(row, col, M)];
@@ -460,9 +476,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NT)(
         #pragma unroll
         for ( int i = 0 ; i < BLOCK_SIZE_K; i += B_TILE_ROW_STRIDE) {
             const int row = tile_idx + i + B_TILE_ROW;
-            const int col = B_TILE_COL + BLOCK_SIZE_N * blockIdx.y;
+            const int col = B_TILE_COL + BLOCK_SIZE_N * BLOCK_IDY;
 
-            if (blockIdx.y == gridDim.y -1 || tile_idx > K - BLOCK_SIZE_K) {
+            if (BLOCK_IDY == DIM_GRID_Y -1 || tile_idx > K - BLOCK_SIZE_K) {
                 Bs[OFFSET_row(i + B_TILE_ROW, B_TILE_COL, BLOCK_SIZE_N)] = row < K && col < N ? B[OFFSET_row(row, col, N)] : PADDING;
             } else {
                 Bs[OFFSET_row(i + B_TILE_ROW, B_TILE_COL, BLOCK_SIZE_N)] = B[OFFSET_row(row, col, N)];
@@ -504,9 +520,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NT)(
     for (int thread_m = 0; thread_m < THREAD_SIZE_M; ++thread_m) {
         #pragma unroll
         for (int thread_n = 0; thread_n < THREAD_SIZE_N; ++thread_n) {
-            const int col = BLOCK_SIZE_N * blockIdx.y + THREAD_SIZE_N * threadIdx.y + thread_n;
-            const int row = BLOCK_SIZE_M * blockIdx.x + THREAD_SIZE_M * threadIdx.x + thread_m;
-            if (blockIdx.x == gridDim.x -1 || blockIdx.y == gridDim.y - 1) {
+            const int col = BLOCK_SIZE_N * BLOCK_IDY + THREAD_SIZE_N * threadIdx.y + thread_n;
+            const int row = BLOCK_SIZE_M * BLOCK_IDX + THREAD_SIZE_M * threadIdx.x + thread_m;
+            if (BLOCK_IDX == DIM_GRID_X -1 || BLOCK_IDY == DIM_GRID_Y - 1) {
                 if (row < M && col < N) {
                     C[OFFSET_col(row, col, M)] = OPERATOR_ADD(
                         OPERATOR_MUL(accum[OFFSET_col(thread_m, thread_n, THREAD_SIZE_M)], alpha),
@@ -521,6 +537,7 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NT)(
             }
         }
     }
+    __syncthreads();
 }
 
 template <
@@ -538,7 +555,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NN)(
     TYPE beta,
     int M,
     int N,
-    int K
+    int K,
+    int DIM_GRID_X,
+    int DIM_GRID_Y
     ) {
 
     // size of thread block
@@ -550,6 +569,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NN)(
     const int BLOCK_SIZE_KN = BLOCK_SIZE_K * BLOCK_SIZE_N;
     const int THREAD_SIZE_MN = THREAD_SIZE_M * THREAD_SIZE_N;
 
+    int BLOCK_IDX = blockIdx.x % DIM_GRID_X;
+    int BLOCK_IDY = blockIdx.x / DIM_GRID_X;
+
     // thread id
     const int tid = threadIdx.y * bszm + threadIdx.x;
 
@@ -592,10 +614,10 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NN)(
         // load A from global memory to shared memory
         #pragma unroll
         for ( int i = 0 ; i < BLOCK_SIZE_K ; i += A_TILE_COL_STRIDE) {
-            const int row = BLOCK_SIZE_M * blockIdx.x + A_TILE_ROW ;
+            const int row = BLOCK_SIZE_M * BLOCK_IDX + A_TILE_ROW ;
             const int col = A_TILE_COL + i + tile_idx;
 
-            if (blockIdx.x == gridDim.x -1 || tile_idx >= K - BLOCK_SIZE_K) {
+            if (BLOCK_IDX == DIM_GRID_X -1 || tile_idx >= K - BLOCK_SIZE_K) {
                 As[OFFSET_col(A_TILE_ROW, i + A_TILE_COL, BLOCK_SIZE_M)] = row < M && col < K ? A[OFFSET_col(row, col, M)] : PADDING;
             } else {
                 As[OFFSET_col(A_TILE_ROW, i + A_TILE_COL, BLOCK_SIZE_M)] = A[OFFSET_col(row, col, M)];
@@ -606,9 +628,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NN)(
         #pragma unroll
         for ( int i = 0 ; i < BLOCK_SIZE_N; i += B_TILE_COL_STRIDE) {
             const int row = tile_idx + B_TILE_ROW;
-            const int col = BLOCK_SIZE_N * blockIdx.y + i + B_TILE_COL;
+            const int col = BLOCK_SIZE_N * BLOCK_IDY + i + B_TILE_COL;
 
-            if (tile_idx >= K - BLOCK_SIZE_K || blockIdx.y == gridDim.y - 1) {
+            if (tile_idx >= K - BLOCK_SIZE_K || BLOCK_IDY == DIM_GRID_Y - 1) {
                 Bs[OFFSET_col(B_TILE_ROW, i + B_TILE_COL, BLOCK_SIZE_K)] = row < K && col < N ? B[OFFSET_col(row, col, K)] : PADDING;
             } else {
                 Bs[OFFSET_col(B_TILE_ROW, i + B_TILE_COL, BLOCK_SIZE_K)] = B[OFFSET_col(row, col, K)];
@@ -650,10 +672,10 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NN)(
     for (int thread_m = 0; thread_m < THREAD_SIZE_M; ++thread_m) {
         #pragma unroll
         for (int thread_n = 0; thread_n < THREAD_SIZE_N; ++thread_n) {
-            const int col = BLOCK_SIZE_N * blockIdx.y + THREAD_SIZE_N * threadIdx.y + thread_n;
-            const int row = BLOCK_SIZE_M * blockIdx.x + THREAD_SIZE_M * threadIdx.x + thread_m;
+            const int col = BLOCK_SIZE_N * BLOCK_IDY + THREAD_SIZE_N * threadIdx.y + thread_n;
+            const int row = BLOCK_SIZE_M * BLOCK_IDX + THREAD_SIZE_M * threadIdx.x + thread_m;
 
-            if (blockIdx.x == gridDim.x -1 || blockIdx.y == gridDim.y - 1) {
+            if (BLOCK_IDX == DIM_GRID_X -1 || BLOCK_IDY == DIM_GRID_Y - 1) {
                 if (row < M && col < N) {
                     C[OFFSET_col(row, col, M)] = OPERATOR_ADD(
                         OPERATOR_MUL(accum[OFFSET_col(thread_m, thread_n, THREAD_SIZE_M)], alpha),
@@ -668,6 +690,7 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NN)(
             }
         }
     }
+    __syncthreads();
 }
 
 extern "C"{
@@ -682,53 +705,70 @@ void CONCATENATE(TYPENAME, FUNCNAME)(const int m, const int n, const int k, TYPE
     const int BLOCK_SIZE_N = 64;
     const int THREAD_SIZE_M = 4;
     const int THREAD_SIZE_N = 4;
 
 
     if (TA == T && TB == T) {
         dim3 dimBlock(BLOCK_SIZE_N / THREAD_SIZE_N, BLOCK_SIZE_M / THREAD_SIZE_M);
-        dim3 dimGrid(n / BLOCK_SIZE_N, m / BLOCK_SIZE_M);
 
+        int DIM_GRID_X = n / BLOCK_SIZE_N;
+        int DIM_GRID_Y = m / BLOCK_SIZE_M;
         if (n % BLOCK_SIZE_N != 0)
-            dimGrid.x++;
+            DIM_GRID_X++;
         if (m % BLOCK_SIZE_M != 0)
-            dimGrid.y++;
+            DIM_GRID_Y++;
+
+        dim3 dimGrid(DIM_GRID_X * DIM_GRID_Y);
 
         CONCATENATETHREE(TYPENAME, FUNCNAME, TT)<BLOCK_SIZE_M, BLOCK_SIZE_K, BLOCK_SIZE_N, THREAD_SIZE_M, THREAD_SIZE_N>
-            <<< dimGrid, dimBlock >>>(d_A, d_B, d_C, alpha, beta, m, n, k);
+            <<< dimGrid, dimBlock >>>(d_A, d_B, d_C, alpha, beta, m, n, k, DIM_GRID_X, DIM_GRID_Y);
     }
 
     if (TA == T && TB == N) {
         dim3 dimBlock(BLOCK_SIZE_N / THREAD_SIZE_N, BLOCK_SIZE_M / THREAD_SIZE_M);
-        dim3 dimGrid(n / BLOCK_SIZE_N, m / BLOCK_SIZE_M);
 
+        int DIM_GRID_X = n / BLOCK_SIZE_N;
+        int DIM_GRID_Y = m / BLOCK_SIZE_M;
         if (n % BLOCK_SIZE_N != 0)
-            dimGrid.x++;
+            DIM_GRID_X++;
         if (m % BLOCK_SIZE_M != 0)
-            dimGrid.y++;
+            DIM_GRID_Y++;
+
+        dim3 dimGrid(DIM_GRID_X * DIM_GRID_Y);
 
         CONCATENATETHREE(TYPENAME, FUNCNAME, TN)<BLOCK_SIZE_M, BLOCK_SIZE_K, BLOCK_SIZE_N, THREAD_SIZE_M, THREAD_SIZE_N>
-            <<< dimGrid, dimBlock >>>(d_A, d_B, d_C, alpha, beta, m, n, k);
+            <<< dimGrid, dimBlock >>>(d_A, d_B, d_C, alpha, beta, m, n, k, DIM_GRID_X, DIM_GRID_Y);
     }
 
     if (TA == N && TB == T) {
         dim3 dimBlock(BLOCK_SIZE_M / THREAD_SIZE_M, BLOCK_SIZE_N / THREAD_SIZE_N);
-        dim3 dimGrid(m / BLOCK_SIZE_M, n / BLOCK_SIZE_N);
 
+        int DIM_GRID_X = m / BLOCK_SIZE_M;
+        int DIM_GRID_Y = n / BLOCK_SIZE_N;
         if (m % BLOCK_SIZE_M != 0)
-            dimGrid.x++;
+            DIM_GRID_X++;
         if (n % BLOCK_SIZE_N != 0)
-            dimGrid.y++;
+            DIM_GRID_Y++;
+
+        dim3 dimGrid(DIM_GRID_X * DIM_GRID_Y);
 
         CONCATENATETHREE(TYPENAME, FUNCNAME, NT)<BLOCK_SIZE_M, BLOCK_SIZE_K, BLOCK_SIZE_N, THREAD_SIZE_M, THREAD_SIZE_N>
-            <<< dimGrid, dimBlock >>>(d_A, d_B, d_C, alpha, beta, m, n, k);
+            <<< dimGrid, dimBlock >>>(d_A, d_B, d_C, alpha, beta, m, n, k, DIM_GRID_X, DIM_GRID_Y);
    }
 
     if (TA == N && TB == N) {
         dim3 dimBlock(BLOCK_SIZE_M / THREAD_SIZE_M, BLOCK_SIZE_N / THREAD_SIZE_N);
-        dim3 dimGrid(m / BLOCK_SIZE_M, n / BLOCK_SIZE_N);
 
+        int DIM_GRID_X = m / BLOCK_SIZE_M;
+        int DIM_GRID_Y = n / BLOCK_SIZE_N;
         if (m % BLOCK_SIZE_M != 0)
-            dimGrid.x++;
+            DIM_GRID_X++;
         if (n % BLOCK_SIZE_N != 0)
-            dimGrid.y++;
+            DIM_GRID_Y++;
+
+        dim3 dimGrid(DIM_GRID_X * DIM_GRID_Y);
 
         CONCATENATETHREE(TYPENAME, FUNCNAME, NN)<BLOCK_SIZE_M, BLOCK_SIZE_K, BLOCK_SIZE_N, THREAD_SIZE_M, THREAD_SIZE_N>
-            <<< dimGrid, dimBlock >>>(d_A, d_B, d_C, alpha, beta, m, n, k);
+            <<< dimGrid, dimBlock >>>(d_A, d_B, d_C, alpha, beta, m, n, k, DIM_GRID_X, DIM_GRID_Y);
     }
 
 }
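A note on the device-side change above: each kernel now takes the logical grid extents DIM_GRID_X and DIM_GRID_Y as parameters and recovers its two-dimensional tile coordinates from the single linear blockIdx.x, instead of reading blockIdx.y and gridDim directly. The sketch below is not part of this PR (the kernel name and the toy grid sizes are made up for illustration); it just isolates the modulo/division remapping so it can be compiled and inspected on its own:

#include <cstdio>
#include <cuda_runtime.h>

// Recover 2D tile coordinates from a flattened 1D grid, in the same way the
// GEMM kernels above use their DIM_GRID_X / DIM_GRID_Y parameters.
__global__ void show_tile_coords(int DIM_GRID_X)
{
    int BLOCK_IDX = blockIdx.x % DIM_GRID_X; // column in the logical grid
    int BLOCK_IDY = blockIdx.x / DIM_GRID_X; // row in the logical grid
    if (threadIdx.x == 0)
        printf("linear block %d -> (x = %d, y = %d)\n",
               (int)blockIdx.x, BLOCK_IDX, BLOCK_IDY);
}

int main()
{
    const int DIM_GRID_X = 3, DIM_GRID_Y = 2;
    // One linear grid dimension covers all DIM_GRID_X * DIM_GRID_Y tiles,
    // mirroring dim3 dimGrid(DIM_GRID_X * DIM_GRID_Y) in the launcher above.
    show_tile_coords<<<DIM_GRID_X * DIM_GRID_Y, 32>>>(DIM_GRID_X);
    cudaDeviceSynchronize();
    return 0;
}

Because the boundary checks now compare BLOCK_IDX and BLOCK_IDY against the DIM_GRID_X and DIM_GRID_Y parameters rather than against gridDim, the padding branches select the same edge tiles as before under the flattened launch.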
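On the host side, every branch derives the logical grid extents with the divide-then-increment pattern shown above. An equivalent formulation is ceiling division; this is only a sketch of the same arithmetic, not a change the PR makes:

// ceil_div(size, block) rounds size / block up for positive ints;
// equivalent to dividing and then incrementing when size % block != 0.
inline int ceil_div(int size, int block)
{
    return (size + block - 1) / block;
}

// e.g. in the TA == T && TB == T branch of the launcher above:
//   int DIM_GRID_X = ceil_div(n, BLOCK_SIZE_N);
//   int DIM_GRID_Y = ceil_div(m, BLOCK_SIZE_M);
//   dim3 dimGrid(DIM_GRID_X * DIM_GRID_Y);

One practical property of a flattened launch, whether or not it motivated this PR: gridDim.x admits up to 2^31 - 1 blocks on current GPUs, whereas gridDim.y is capped at 65535, so a single linear dimension also removes that ceiling for very large matrices.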