From 71d6c1e71dd8abd22c329bedb443c26fd6c8e6ca Mon Sep 17 00:00:00 2001 From: xzgao Date: Wed, 27 Sep 2023 12:40:57 +0000 Subject: [PATCH 1/5] fixed a sync problem --- .travis.yml | 26 -------------------------- Artifacts.toml | 13 ------------- deps/tropicalgemm_kernels.cu | 4 ++++ 3 files changed, 4 insertions(+), 39 deletions(-) delete mode 100644 .travis.yml delete mode 100644 Artifacts.toml diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 9626410..0000000 --- a/.travis.yml +++ /dev/null @@ -1,26 +0,0 @@ -# Documentation: http://docs.travis-ci.com/user/languages/julia -language: julia -notifications: - email: false -julia: - - 1.0 - - 1.9 - - nightly -os: - - linux -arch: - - x64 -cache: - directories: - - ~/.julia/artifacts -jobs: - fast_finish: true - allow_failures: - - julia: nightly -after_success: - - | - julia -e ' - using Pkg - Pkg.add("Coverage") - using Coverage - Codecov.submit(process_folder())' diff --git a/Artifacts.toml b/Artifacts.toml deleted file mode 100644 index 795958d..0000000 --- a/Artifacts.toml +++ /dev/null @@ -1,13 +0,0 @@ -[CUDA_lib] -git-tree-sha1 = "2918fba865582556e219191a7f393c47c2e822e0" - - [[CUDA_lib.download]] - sha256 = "751bf9d1f2921d4176ffb8ed1ddbd59bb60d6a517e6784bb71d61b62357c0007" - url = "https://gist.github.com/ArrogantGao/c38791f143d36d4b2481ac7e4aa4ecce/raw/2918fba865582556e219191a7f393c47c2e822e0.tar.gz" - -[CuTropicalGemm_lib] -git-tree-sha1 = "4ba15e5eb224d3a635e827eb0a34e304bcbb8cc0" - - [[CuTropicalGemm_lib.download]] - sha256 = "9c38e1268465fb8dbeae547b7af411a1eebd1890015447e0d3f996a462f808f2" - url = "https://gist.github.com/ArrogantGao/163ac76a4de8c2c0e3694b4cae1068ae/raw/4ba15e5eb224d3a635e827eb0a34e304bcbb8cc0.tar.gz" diff --git a/deps/tropicalgemm_kernels.cu b/deps/tropicalgemm_kernels.cu index 4a23c01..cf4bbe5 100644 --- a/deps/tropicalgemm_kernels.cu +++ b/deps/tropicalgemm_kernels.cu @@ -236,6 +236,7 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TT)( } } } + __syncthreads(); } template < @@ -378,6 +379,7 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TN)( } } } + __syncthreads(); } template < @@ -521,6 +523,7 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NT)( } } } + __syncthreads(); } template < @@ -668,6 +671,7 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NN)( } } } + __syncthreads(); } extern "C"{ From e7d3ebc9a4978daaae3337f352f3e22d3da49a7e Mon Sep 17 00:00:00 2001 From: xzgao Date: Thu, 28 Sep 2023 10:40:55 +0000 Subject: [PATCH 2/5] fix block overflow --- deps/tropicalgemm_kernels.cu | 132 ++++++++++++++++++++++------------- 1 file changed, 84 insertions(+), 48 deletions(-) diff --git a/deps/tropicalgemm_kernels.cu b/deps/tropicalgemm_kernels.cu index cf4bbe5..8d3ac9b 100644 --- a/deps/tropicalgemm_kernels.cu +++ b/deps/tropicalgemm_kernels.cu @@ -105,7 +105,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TT)( TYPE beta, int M, int N, - int K + int K, + int DIM_GRID_X, + int DIM_GRID_Y ) { // size of thread block @@ -115,6 +117,8 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TT)( // thread id const int tid = threadIdx.y * bszx + threadIdx.x; + int BLOCK_IDX = blockIdx.x % DIM_GRID_X; + int BLOCK_IDY = blockIdx.x / DIM_GRID_X; // shared memory @@ -153,9 +157,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TT)( #pragma unroll for ( int i = 0 ; i < BLOCK_SIZE_M ; i += A_TILE_ROW_STRIDE) { - const int row = BLOCK_SIZE_M * blockIdx.y + i + A_TILE_ROW ; + const int row = BLOCK_SIZE_M * BLOCK_IDY + i + A_TILE_ROW ; const int col = A_TILE_COL + tile_idx; - if (tile_idx > K - BLOCK_SIZE_K || blockIdx.y == gridDim.y - 1) { + if (tile_idx > K - BLOCK_SIZE_K || BLOCK_IDY == DIM_GRID_Y - 1) { As[OFFSET_row(i + A_TILE_ROW, A_TILE_COL, BLOCK_SIZE_K)] = row < M && col < K ? A[OFFSET_row( row, // row col, // col @@ -172,8 +176,8 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TT)( #pragma unroll for ( int i = 0 ; i < BLOCK_SIZE_K; i += B_TILE_ROW_STRIDE) { const int row = tile_idx + i + B_TILE_ROW; - const int col = B_TILE_COL + BLOCK_SIZE_N * blockIdx.x; - if (blockIdx.x == gridDim.x -1 || tile_idx > K - BLOCK_SIZE_K) { + const int col = B_TILE_COL + BLOCK_SIZE_N * BLOCK_IDX; + if (BLOCK_IDX == DIM_GRID_X -1 || tile_idx > K - BLOCK_SIZE_K) { Bs[OFFSET_row(i + B_TILE_ROW, B_TILE_COL, BLOCK_SIZE_N)] = row < K && col < N ? B[OFFSET_row( row, // row col, // col @@ -219,9 +223,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TT)( for (int thread_y = 0; thread_y < THREAD_SIZE_M; ++thread_y) { #pragma unroll for (int thread_x = 0; thread_x < THREAD_SIZE_N; ++thread_x) { - const int row = BLOCK_SIZE_M * blockIdx.y + THREAD_SIZE_M * threadIdx.y + thread_y; - const int col = BLOCK_SIZE_N * blockIdx.x + THREAD_SIZE_N * threadIdx.x + thread_x; - if (blockIdx.x == gridDim.x -1 || blockIdx.y == gridDim.y - 1) { + const int row = BLOCK_SIZE_M * BLOCK_IDY + THREAD_SIZE_M * threadIdx.y + thread_y; + const int col = BLOCK_SIZE_N * BLOCK_IDX + THREAD_SIZE_N * threadIdx.x + thread_x; + if (BLOCK_IDX == DIM_GRID_X -1 || BLOCK_IDY == DIM_GRID_Y - 1) { if (row < M && col < N) { C[OFFSET_col(row, col, M)] = OPERATOR_ADD( OPERATOR_MUL(C[OFFSET_col(row, col, M)], beta), @@ -254,7 +258,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TN)( TYPE beta, int M, int N, - int K + int K, + int DIM_GRID_X, + int DIM_GRID_Y ) { // size of thread block @@ -266,6 +272,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TN)( const int tid_A = threadIdx.y * bszx + threadIdx.x; const int tid_B = threadIdx.y + threadIdx.x * bszy; + int BLOCK_IDX = blockIdx.x % DIM_GRID_X; + int BLOCK_IDY = blockIdx.x / DIM_GRID_X; + // shared memory __shared__ TYPE As[BLOCK_SIZE_M * BLOCK_SIZE_K]; // avoid bank conflict @@ -302,9 +311,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TN)( #pragma unroll for ( int i = 0 ; i < BLOCK_SIZE_M ; i += A_TILE_ROW_STRIDE) { - const int row = BLOCK_SIZE_M * blockIdx.y + i + A_TILE_ROW ; + const int row = BLOCK_SIZE_M * BLOCK_IDY + i + A_TILE_ROW ; const int col = A_TILE_COL + tile_idx; - if (tile_idx > K - BLOCK_SIZE_K || blockIdx.y == gridDim.y - 1) { + if (tile_idx > K - BLOCK_SIZE_K || BLOCK_IDY == DIM_GRID_Y - 1) { As[OFFSET_row(i + A_TILE_ROW, A_TILE_COL, BLOCK_SIZE_K)] = row < M && col < K ? A[OFFSET_row( row, // row col, // col @@ -321,8 +330,8 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TN)( #pragma unroll for ( int i = 0 ; i < BLOCK_SIZE_N; i += B_TILE_COL_STRIDE) { const int row = tile_idx + B_TILE_ROW; - const int col = B_TILE_COL + i + BLOCK_SIZE_N * blockIdx.x; - if (blockIdx.x == gridDim.x -1 || tile_idx > K - BLOCK_SIZE_K) { + const int col = B_TILE_COL + i + BLOCK_SIZE_N * BLOCK_IDX; + if (BLOCK_IDX == DIM_GRID_X -1 || tile_idx > K - BLOCK_SIZE_K) { Bs[OFFSET_row(B_TILE_ROW, i + B_TILE_COL, BLOCK_SIZE_N)] = row < K && col < N ? B[OFFSET_col(row, col, K)] : PADDING; } else { Bs[OFFSET_row(B_TILE_ROW, i + B_TILE_COL, BLOCK_SIZE_N)] = B[OFFSET_col(row, col, K)]; @@ -362,9 +371,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TN)( for (int thread_y = 0; thread_y < THREAD_SIZE_M; ++thread_y) { #pragma unroll for (int thread_x = 0; thread_x < THREAD_SIZE_N; ++thread_x) { - const int row = BLOCK_SIZE_M * blockIdx.y + THREAD_SIZE_M * threadIdx.y + thread_y; - const int col = BLOCK_SIZE_N * blockIdx.x + THREAD_SIZE_N * threadIdx.x + thread_x; - if (blockIdx.x == gridDim.x -1 || blockIdx.y == gridDim.y - 1) { + const int row = BLOCK_SIZE_M * BLOCK_IDY + THREAD_SIZE_M * threadIdx.y + thread_y; + const int col = BLOCK_SIZE_N * BLOCK_IDX + THREAD_SIZE_N * threadIdx.x + thread_x; + if (BLOCK_IDX == DIM_GRID_X -1 || BLOCK_IDY == DIM_GRID_Y - 1) { if (row < M && col < N) { C[OFFSET_col(row, col, M)] = OPERATOR_ADD( OPERATOR_MUL(C[OFFSET_col(row, col, M)], beta), @@ -397,7 +406,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NT)( TYPE beta, int M, int N, - int K + int K, + int DIM_GRID_X, + int DIM_GRID_Y ) { // size of thread block @@ -409,6 +420,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NT)( const int BLOCK_SIZE_KN = BLOCK_SIZE_K * BLOCK_SIZE_N; const int THREAD_SIZE_MN = THREAD_SIZE_M * THREAD_SIZE_N; + int BLOCK_IDX = blockIdx.x % DIM_GRID_X; + int BLOCK_IDY = blockIdx.x / DIM_GRID_X; + // thread id const int tid = threadIdx.y * bszm + threadIdx.x; @@ -448,10 +462,10 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NT)( // load A from global memory to shared memory #pragma unroll for ( int i = 0 ; i < BLOCK_SIZE_K ; i += A_TILE_COL_STRIDE) { - const int row = BLOCK_SIZE_M * blockIdx.x + A_TILE_ROW ; + const int row = BLOCK_SIZE_M * BLOCK_IDX + A_TILE_ROW ; const int col = A_TILE_COL + i + tile_idx; - if (blockIdx.x == gridDim.x -1 || tile_idx >= K - BLOCK_SIZE_K) { + if (BLOCK_IDX == DIM_GRID_X -1 || tile_idx >= K - BLOCK_SIZE_K) { As[OFFSET_col(A_TILE_ROW, i + A_TILE_COL, BLOCK_SIZE_M)] = row < M && col < K ? A[OFFSET_col(row, col, M)] : PADDING; } else { As[OFFSET_col(A_TILE_ROW, i + A_TILE_COL, BLOCK_SIZE_M)] = A[OFFSET_col(row, col, M)]; @@ -462,9 +476,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NT)( #pragma unroll for ( int i = 0 ; i < BLOCK_SIZE_K; i += B_TILE_ROW_STRIDE) { const int row = tile_idx + i + B_TILE_ROW; - const int col = B_TILE_COL + BLOCK_SIZE_N * blockIdx.y; + const int col = B_TILE_COL + BLOCK_SIZE_N * BLOCK_IDY; - if (blockIdx.y == gridDim.y -1 || tile_idx > K - BLOCK_SIZE_K) { + if (BLOCK_IDY == DIM_GRID_Y -1 || tile_idx > K - BLOCK_SIZE_K) { Bs[OFFSET_row(i + B_TILE_ROW, B_TILE_COL, BLOCK_SIZE_N)] = row < K && col < N ? B[OFFSET_row(row, col, N)] : PADDING; } else { Bs[OFFSET_row(i + B_TILE_ROW, B_TILE_COL, BLOCK_SIZE_N)] = B[OFFSET_row(row, col, N)]; @@ -506,9 +520,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NT)( for (int thread_m = 0; thread_m < THREAD_SIZE_M; ++thread_m) { #pragma unroll for (int thread_n = 0; thread_n < THREAD_SIZE_N; ++thread_n) { - const int col = BLOCK_SIZE_N * blockIdx.y + THREAD_SIZE_N * threadIdx.y + thread_n; - const int row = BLOCK_SIZE_M * blockIdx.x + THREAD_SIZE_M * threadIdx.x + thread_m; - if (blockIdx.x == gridDim.x -1 || blockIdx.y == gridDim.y - 1) { + const int col = BLOCK_SIZE_N * BLOCK_IDY + THREAD_SIZE_N * threadIdx.y + thread_n; + const int row = BLOCK_SIZE_M * BLOCK_IDX + THREAD_SIZE_M * threadIdx.x + thread_m; + if (BLOCK_IDX == DIM_GRID_X -1 || BLOCK_IDY == DIM_GRID_Y - 1) { if (row < M && col < N) { C[OFFSET_col(row, col, M)] = OPERATOR_ADD( OPERATOR_MUL(accum[OFFSET_col(thread_m, thread_n, THREAD_SIZE_M)], alpha), @@ -541,7 +555,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NN)( TYPE beta, int M, int N, - int K + int K, + int DIM_GRID_X, + int DIM_GRID_Y ) { // size of thread block @@ -553,6 +569,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NN)( const int BLOCK_SIZE_KN = BLOCK_SIZE_K * BLOCK_SIZE_N; const int THREAD_SIZE_MN = THREAD_SIZE_M * THREAD_SIZE_N; + int BLOCK_IDX = blockIdx.x % DIM_GRID_X; + int BLOCK_IDY = blockIdx.x / DIM_GRID_X; + // thread id const int tid = threadIdx.y * bszm + threadIdx.x; @@ -595,10 +614,10 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NN)( // load A from global memory to shared memory #pragma unroll for ( int i = 0 ; i < BLOCK_SIZE_K ; i += A_TILE_COL_STRIDE) { - const int row = BLOCK_SIZE_M * blockIdx.x + A_TILE_ROW ; + const int row = BLOCK_SIZE_M * BLOCK_IDX + A_TILE_ROW ; const int col = A_TILE_COL + i + tile_idx; - if (blockIdx.x == gridDim.x -1 || tile_idx >= K - BLOCK_SIZE_K) { + if (BLOCK_IDX == DIM_GRID_X -1 || tile_idx >= K - BLOCK_SIZE_K) { As[OFFSET_col(A_TILE_ROW, i + A_TILE_COL, BLOCK_SIZE_M)] = row < M && col < K ? A[OFFSET_col(row, col, M)] : PADDING; } else { As[OFFSET_col(A_TILE_ROW, i + A_TILE_COL, BLOCK_SIZE_M)] = A[OFFSET_col(row, col, M)]; @@ -609,9 +628,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NN)( #pragma unroll for ( int i = 0 ; i < BLOCK_SIZE_N; i += B_TILE_COL_STRIDE) { const int row = tile_idx + B_TILE_ROW; - const int col = BLOCK_SIZE_N * blockIdx.y + i + B_TILE_COL; + const int col = BLOCK_SIZE_N * BLOCK_IDY + i + B_TILE_COL; - if (tile_idx >= K - BLOCK_SIZE_K || blockIdx.y == gridDim.y - 1) { + if (tile_idx >= K - BLOCK_SIZE_K || BLOCK_IDY == DIM_GRID_Y - 1) { Bs[OFFSET_col(B_TILE_ROW, i + B_TILE_COL, BLOCK_SIZE_K)] = row < K && col < N ? B[OFFSET_col(row, col, K)] : PADDING; } else { Bs[OFFSET_col(B_TILE_ROW, i + B_TILE_COL, BLOCK_SIZE_K)] = B[OFFSET_col(row, col, K)]; @@ -653,10 +672,10 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NN)( for (int thread_m = 0; thread_m < THREAD_SIZE_M; ++thread_m) { #pragma unroll for (int thread_n = 0; thread_n < THREAD_SIZE_N; ++thread_n) { - const int col = BLOCK_SIZE_N * blockIdx.y + THREAD_SIZE_N * threadIdx.y + thread_n; - const int row = BLOCK_SIZE_M * blockIdx.x + THREAD_SIZE_M * threadIdx.x + thread_m; + const int col = BLOCK_SIZE_N * BLOCK_IDY + THREAD_SIZE_N * threadIdx.y + thread_n; + const int row = BLOCK_SIZE_M * BLOCK_IDX + THREAD_SIZE_M * threadIdx.x + thread_m; - if (blockIdx.x == gridDim.x -1 || blockIdx.y == gridDim.y - 1) { + if (BLOCK_IDX == DIM_GRID_X -1 || BLOCK_IDY == DIM_GRID_Y - 1) { if (row < M && col < N) { C[OFFSET_col(row, col, M)] = OPERATOR_ADD( OPERATOR_MUL(accum[OFFSET_col(thread_m, thread_n, THREAD_SIZE_M)], alpha), @@ -686,53 +705,70 @@ void CONCATENATE(TYPENAME, FUNCNAME)(const int m, const int n, const int k, TYPE const int BLOCK_SIZE_N = 64; const int THREAD_SIZE_M = 4; const int THREAD_SIZE_N = 4; + if (TA == T && TB == T) { dim3 dimBlock(BLOCK_SIZE_N / THREAD_SIZE_N, BLOCK_SIZE_M / THREAD_SIZE_M); - dim3 dimGrid(n / BLOCK_SIZE_N, m / BLOCK_SIZE_M); + + int DIM_GRID_X = n / BLOCK_SIZE_N; + int DIM_GRID_Y = m / BLOCK_SIZE_M; if (n % BLOCK_SIZE_N != 0) - dimGrid.x++; + DIM_GRID_X++; if (m % BLOCK_SIZE_M != 0) - dimGrid.y++; + DIM_GRID_Y++; + + dim3 dimGrid(DIM_GRID_X * DIM_GRID_Y); CONCATENATETHREE(TYPENAME, FUNCNAME, TT) - <<< dimGrid, dimBlock >>>(d_A, d_B, d_C, alpha, beta, m, n, k); + <<< dimGrid, dimBlock >>>(d_A, d_B, d_C, alpha, beta, m, n, k, DIM_GRID_X, DIM_GRID_Y); } if (TA == T && TB == N) { dim3 dimBlock(BLOCK_SIZE_N / THREAD_SIZE_N, BLOCK_SIZE_M / THREAD_SIZE_M); - dim3 dimGrid(n / BLOCK_SIZE_N, m / BLOCK_SIZE_M); + + int DIM_GRID_X = n / BLOCK_SIZE_N; + int DIM_GRID_Y = m / BLOCK_SIZE_M; if (n % BLOCK_SIZE_N != 0) - dimGrid.x++; + DIM_GRID_X++; if (m % BLOCK_SIZE_M != 0) - dimGrid.y++; + DIM_GRID_Y++; + + dim3 dimGrid(DIM_GRID_X * DIM_GRID_Y); CONCATENATETHREE(TYPENAME, FUNCNAME, TN) - <<< dimGrid, dimBlock >>>(d_A, d_B, d_C, alpha, beta, m, n, k); + <<< dimGrid, dimBlock >>>(d_A, d_B, d_C, alpha, beta, m, n, k, DIM_GRID_X, DIM_GRID_Y); } if (TA == N && TB == T) { dim3 dimBlock(BLOCK_SIZE_M / THREAD_SIZE_M, BLOCK_SIZE_N / THREAD_SIZE_N); - dim3 dimGrid(m / BLOCK_SIZE_M, n / BLOCK_SIZE_N); + + int DIM_GRID_X = m / BLOCK_SIZE_M; + int DIM_GRID_Y = n / BLOCK_SIZE_N; if (m % BLOCK_SIZE_M != 0) - dimGrid.x++; + DIM_GRID_X++; if (n % BLOCK_SIZE_N != 0) - dimGrid.y++; + DIM_GRID_Y++; + + dim3 dimGrid(DIM_GRID_X * DIM_GRID_Y); CONCATENATETHREE(TYPENAME, FUNCNAME, NT) - <<< dimGrid, dimBlock >>>(d_A, d_B, d_C, alpha, beta, m, n, k); + <<< dimGrid, dimBlock >>>(d_A, d_B, d_C, alpha, beta, m, n, k, DIM_GRID_X, DIM_GRID_Y); } if (TA == N && TB == N) { dim3 dimBlock(BLOCK_SIZE_M / THREAD_SIZE_M, BLOCK_SIZE_N / THREAD_SIZE_N); - dim3 dimGrid(m / BLOCK_SIZE_M, n / BLOCK_SIZE_N); + + int DIM_GRID_X = m / BLOCK_SIZE_M; + int DIM_GRID_Y = n / BLOCK_SIZE_N; if (m % BLOCK_SIZE_M != 0) - dimGrid.x++; + DIM_GRID_X++; if (n % BLOCK_SIZE_N != 0) - dimGrid.y++; + DIM_GRID_Y++; + + dim3 dimGrid(DIM_GRID_X * DIM_GRID_Y); CONCATENATETHREE(TYPENAME, FUNCNAME, NN) - <<< dimGrid, dimBlock >>>(d_A, d_B, d_C, alpha, beta, m, n, k); + <<< dimGrid, dimBlock >>>(d_A, d_B, d_C, alpha, beta, m, n, k, DIM_GRID_X, DIM_GRID_Y); } } From 25bf9880778bd48451b9bddb0373a57c815b2e3c Mon Sep 17 00:00:00 2001 From: xzgao Date: Thu, 28 Sep 2023 10:42:34 +0000 Subject: [PATCH 3/5] fixed block overflow --- Project.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/Project.toml b/Project.toml index 17380cd..4565663 100644 --- a/Project.toml +++ b/Project.toml @@ -6,6 +6,7 @@ version = "1.0.0-DEV" [deps] ArtifactUtils = "8b73e784-e7d8-4ea5-973d-377fed4e3bce" Artifacts = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" +BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" LLVM = "929cbde3-209d-540e-8aea-75f648917ca0" LLVMLoopInfo = "8b046642-f1f6-4319-8d3c-209ddc03c586" From 4fbad812377bdfeeb03fe2a482b2d86fce6387ac Mon Sep 17 00:00:00 2001 From: xzgao Date: Thu, 28 Sep 2023 14:10:00 +0000 Subject: [PATCH 4/5] cleaned the Project.toml --- Project.toml | 10 +++------- README.md | 3 +-- test/Project.toml | 5 +++++ 3 files changed, 9 insertions(+), 9 deletions(-) create mode 100644 test/Project.toml diff --git a/Project.toml b/Project.toml index 4565663..2950470 100644 --- a/Project.toml +++ b/Project.toml @@ -4,18 +4,14 @@ authors = ["Xuanzhao Gao and contributors"] version = "1.0.0-DEV" [deps] -ArtifactUtils = "8b73e784-e7d8-4ea5-973d-377fed4e3bce" -Artifacts = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" -BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" -LLVM = "929cbde3-209d-540e-8aea-75f648917ca0" -LLVMLoopInfo = "8b046642-f1f6-4319-8d3c-209ddc03c586" -Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -Revise = "295af30f-e4ad-537b-8983-00126c2a3abe" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" TropicalNumbers = "b3a74e9c-7526-4576-a4eb-79c0d4c32334" [compat] +CUDA = "5" +TropicalNumbers = "0.6.2" julia = "1" [extras] diff --git a/README.md b/README.md index 6f08c95..8a71135 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,6 @@ # CuTropicalGEMM -[![Build Status](https://github.com/ArrogantGao/CuTropicalGEMM.jl/actions/workflows/CI.yml/badge.svg?branch=main)](https://github.com/ArrogantGao/CuTropicalGEMM.jl/actions/workflows/CI.yml?query=branch%3Amain) -[![Build Status](https://travis-ci.com/ArrogantGao/CuTropicalGEMM.jl.svg?branch=main)](https://travis-ci.com/ArrogantGao/CuTropicalGEMM.jl) +[![Build status](https://badge.buildkite.com/06c24dc7b1a9d7c38897acd21575ffd678ee03de190c0b8d81.svg)](https://buildkite.com/julialang/cutropicalgemm-dot-jl) [![Coverage](https://codecov.io/gh/ArrogantGao/CuTropicalGEMM.jl/branch/main/graph/badge.svg)](https://codecov.io/gh/ArrogantGao/CuTropicalGEMM.jl) CuTropicalGEMM is a fast Tropical matrix multiplication on Nvidia GPU. Supported matrix element types include diff --git a/test/Project.toml b/test/Project.toml new file mode 100644 index 0000000..416b83f --- /dev/null +++ b/test/Project.toml @@ -0,0 +1,5 @@ +[deps] +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +TropicalNumbers = "b3a74e9c-7526-4576-a4eb-79c0d4c32334" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" \ No newline at end of file From 1a762c9d37635ef7eb154c1bb6cb1a28b3cd0b77 Mon Sep 17 00:00:00 2001 From: xzgao Date: Thu, 28 Sep 2023 17:08:16 +0000 Subject: [PATCH 5/5] readme revised --- README.md | 85 +++++++++++++++++++++++++++++++++---------------------- 1 file changed, 51 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index 8a71135..006e6a8 100644 --- a/README.md +++ b/README.md @@ -3,64 +3,72 @@ [![Build status](https://badge.buildkite.com/06c24dc7b1a9d7c38897acd21575ffd678ee03de190c0b8d81.svg)](https://buildkite.com/julialang/cutropicalgemm-dot-jl) [![Coverage](https://codecov.io/gh/ArrogantGao/CuTropicalGEMM.jl/branch/main/graph/badge.svg)](https://codecov.io/gh/ArrogantGao/CuTropicalGEMM.jl) -CuTropicalGEMM is a fast Tropical matrix multiplication on Nvidia GPU. Supported matrix element types include +

+CuTropicalGEMM is an open source   + + + Julia + +  package for fast generic matrix mulplication (GEMM) of tropical numbers on Nvidia GPU base on CUDA. +It greatly speed up the tropical GEMM, which is widely used in tensor network contractions. +

+ +## Features + +CuTropicalGEMM support GEMM for various matrix element types: * and-or algebra: `TropicalAndOr` * max-plus algebra: `Tropical{Float32/Float64}` -* min-plus algebra numbers: `TropicalMinPlus{Float32/Float64}` -* max-times algebra numbers: `TropicalMaxMul{Float32/Float64/Int32/Int64}` +* min-plus algebra: `TropicalMinPlus{Float32/Float64}` +* max-times algebra: `TropicalMaxMul{Float32/Float64/Int32/Int64}` -Please check [`TropicalNumbers.jl`](https://github.com/TensorBFS/TropicalNumbers.jl) for the definitions of these types. +Please check [`TropicalNumbers.jl`](https://github.com/TensorBFS/TropicalNumbers.jl) for the definitions of these types and semiring algebras. -## Get started +## Getting Started Open a Julia REPL and type `]` to enter the `pkg>` mode, and then install related packages with ```julia pkg> add CuTropicalGEMM, BenchmarkTools, TropicalNumbers, CUDA -pkg> build ``` -Loading `CuTropicalGEMM` module into the workspace affects the `*` and `LinearAlgebra.mul!` on CuTropical matrices immediately. The following is a minimum working example + +Loading `CuTropicalGEMM` module into the workspace affects the `*` and `LinearAlgebra.mul!` on CuTropical matrices immediately. +The following is a minimum working example: ```julia julia> using TropicalNumbers, CUDA, BenchmarkTools, LinearAlgebra julia> a = Tropical.(CUDA.randn(4096, 4096)); -julia> @benchmark CUDA.@sync $a * $a -BenchmarkTools.Trial: 44 samples with 1 evaluation. - Range (min … max): 108.365 ms … 123.031 ms ┊ GC (min … max): 0.00% … 0.00% - Time (median): 116.051 ms ┊ GC (median): 0.00% - Time (mean ± σ): 116.289 ms ± 4.390 ms ┊ GC (mean ± σ): 0.00% ± 0.00% - - ▁▁ ▁▁ ▁ ▄ ▁ ▁█ - ▆▁▁▁▁▁▆▆▆▆▆██▁▆▁▁▆▁▆██▁▁▆▁▆▆▆▁▁▆▆▁▁▁▁▆▆▁▁▁▁█▆▆▁▁▁█▆█▆▆██▁▁▁▁▆ ▁ - 108 ms Histogram: frequency by time 123 ms < - - Memory estimate: 5.03 KiB, allocs estimate: 95. +julia> @btime CUDA.@sync $a * $a; + 116.272 ms (60 allocations: 2.69 KiB) julia> using CuTropicalGEMM julia> @benchmark CUDA.@sync $a * $a -BenchmarkTools.Trial: 440 samples with 1 evaluation. - Range (min … max): 8.920 μs … 24.497 ms ┊ GC (min … max): 0.00% … 0.00% - Time (median): 10.733 ms ┊ GC (median): 0.00% - Time (mean ± σ): 11.363 ms ± 11.347 ms ┊ GC (mean ± σ): 0.00% ± 0.00% +BenchmarkTools.Trial: 93 samples with 4 evaluations. + Range (min … max): 6.653 μs … 158.961 ms ┊ GC (min … max): 0.00% … 0.00% + Time (median): 13.535 ms ┊ GC (median): 0.00% + Time (mean ± σ): 13.499 ms ± 15.867 ms ┊ GC (mean ± σ): 0.00% ± 0.00% - █ ▅▇▂ - █▅▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▄▁▄▄███ ▆ - 8.92 μs Histogram: log(frequency) by time 23.3 ms < + █ + ▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ ▁ + 6.65 μs Histogram: frequency by time 13.5 ms < - Memory estimate: 160 bytes, allocs estimate: 4. + Memory estimate: 256 bytes, allocs estimate: 7. +``` - julia> o = Tropical.(CUDA.zeros(4096, 4096)); +You can also use the function `LinearAlgebra.mul!(o, a, b)`, which allows you to manually allocate memory for the result: -julia> @benchmark CUDA.@sync LinearAlgebra.mul!($o, $a, $a) -BenchmarkTools.Trial: 70 samples with 7 evaluations. - Range (min … max): 4.232 μs … 11.490 ms ┊ GC (min … max): 0.00% … 0.00% - Time (median): 11.459 ms ┊ GC (median): 0.00% - Time (mean ± σ): 10.349 ms ± 3.337 ms ┊ GC (mean ± σ): 0.00% ± 0.00% +```julia +julia> o = Tropical.(CUDA.zeros(4096, 4096)); + +julia> @benchmark CUDA.@sync mul!($o, $a, $a) +BenchmarkTools.Trial: 61 samples with 7 evaluations. + Range (min … max): 4.584 μs … 13.540 ms ┊ GC (min … max): 0.00% … 0.00% + Time (median): 13.536 ms ┊ GC (median): 0.00% + Time (mean ± σ): 11.892 ms ± 4.375 ms ┊ GC (mean ± σ): 0.00% ± 0.00% █ - ▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ ▁ - 4.23 μs Histogram: frequency by time 11.5 ms < + ▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ ▁ + 4.58 μs Histogram: frequency by time 13.5 ms < Memory estimate: 0 bytes, allocs estimate: 0. ``` @@ -74,6 +82,15 @@ The benchmark and plotting scripts could be found in the benchmarks folder. For matrix size large enough ($n > 3000$), the computation power is about $13$ TFlops for FP32 operations and $3$ TFlops for FP64 operations. +## Questions and Contributions + +Please open an [issue](https://github.com/TensorBFS/CuTropicalGEMM.jl/issues) +if you encounter any problems, or have any feature requests. + +## Acknowalgement + +We would like to thank Tim Besard for his invaluable guidance and support during the development of the package, his expertise in GPU utilization have been immensely helpful. We would also like to thank Tyler Thomas for his assistance in understanding the usage of `BinaryBuilder.jl`. + ## References 1. This package originates from the following issue: https://github.com/JuliaSIMD/LoopVectorization.jl/issues/201