diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 9626410..0000000 --- a/.travis.yml +++ /dev/null @@ -1,26 +0,0 @@ -# Documentation: http://docs.travis-ci.com/user/languages/julia -language: julia -notifications: - email: false -julia: - - 1.0 - - 1.9 - - nightly -os: - - linux -arch: - - x64 -cache: - directories: - - ~/.julia/artifacts -jobs: - fast_finish: true - allow_failures: - - julia: nightly -after_success: - - | - julia -e ' - using Pkg - Pkg.add("Coverage") - using Coverage - Codecov.submit(process_folder())' diff --git a/Artifacts.toml b/Artifacts.toml deleted file mode 100644 index 795958d..0000000 --- a/Artifacts.toml +++ /dev/null @@ -1,13 +0,0 @@ -[CUDA_lib] -git-tree-sha1 = "2918fba865582556e219191a7f393c47c2e822e0" - - [[CUDA_lib.download]] - sha256 = "751bf9d1f2921d4176ffb8ed1ddbd59bb60d6a517e6784bb71d61b62357c0007" - url = "https://gist.github.com/ArrogantGao/c38791f143d36d4b2481ac7e4aa4ecce/raw/2918fba865582556e219191a7f393c47c2e822e0.tar.gz" - -[CuTropicalGemm_lib] -git-tree-sha1 = "4ba15e5eb224d3a635e827eb0a34e304bcbb8cc0" - - [[CuTropicalGemm_lib.download]] - sha256 = "9c38e1268465fb8dbeae547b7af411a1eebd1890015447e0d3f996a462f808f2" - url = "https://gist.github.com/ArrogantGao/163ac76a4de8c2c0e3694b4cae1068ae/raw/4ba15e5eb224d3a635e827eb0a34e304bcbb8cc0.tar.gz" diff --git a/Project.toml b/Project.toml index 17380cd..2950470 100644 --- a/Project.toml +++ b/Project.toml @@ -4,17 +4,14 @@ authors = ["Xuanzhao Gao and contributors"] version = "1.0.0-DEV" [deps] -ArtifactUtils = "8b73e784-e7d8-4ea5-973d-377fed4e3bce" -Artifacts = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" -LLVM = "929cbde3-209d-540e-8aea-75f648917ca0" -LLVMLoopInfo = "8b046642-f1f6-4319-8d3c-209ddc03c586" -Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -Revise = "295af30f-e4ad-537b-8983-00126c2a3abe" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" TropicalNumbers = "b3a74e9c-7526-4576-a4eb-79c0d4c32334" [compat] +CUDA = "5" +TropicalNumbers = "0.6.2" julia = "1" [extras] diff --git a/README.md b/README.md index 6f08c95..006e6a8 100644 --- a/README.md +++ b/README.md @@ -1,67 +1,74 @@ # CuTropicalGEMM -[![Build Status](https://github.com/ArrogantGao/CuTropicalGEMM.jl/actions/workflows/CI.yml/badge.svg?branch=main)](https://github.com/ArrogantGao/CuTropicalGEMM.jl/actions/workflows/CI.yml?query=branch%3Amain) -[![Build Status](https://travis-ci.com/ArrogantGao/CuTropicalGEMM.jl.svg?branch=main)](https://travis-ci.com/ArrogantGao/CuTropicalGEMM.jl) +[![Build status](https://badge.buildkite.com/06c24dc7b1a9d7c38897acd21575ffd678ee03de190c0b8d81.svg)](https://buildkite.com/julialang/cutropicalgemm-dot-jl) [![Coverage](https://codecov.io/gh/ArrogantGao/CuTropicalGEMM.jl/branch/main/graph/badge.svg)](https://codecov.io/gh/ArrogantGao/CuTropicalGEMM.jl) -CuTropicalGEMM is a fast Tropical matrix multiplication on Nvidia GPU. Supported matrix element types include +

+CuTropicalGEMM is an open source Julia package for fast generic matrix multiplication (GEMM) of tropical numbers on Nvidia GPUs, based on CUDA. +It greatly speeds up tropical GEMM, which is widely used in tensor network contractions. +
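For readers new to tropical algebra, the short sketch below (plain CPU Julia, using only the `TropicalNumbers` package linked in the README) illustrates the semiring that the GPU kernels implement: tropical multiplication is ordinary addition and tropical addition is `max`, so a tropical matrix product is a max-plus reduction. The matrix values here are made up for illustration.

```julia
using TropicalNumbers

# Tropical (max-plus) scalars: `*` is ordinary `+`, and `+` is `max`.
a, b = Tropical(1.0), Tropical(2.0)
@show a * b   # Tropical(3.0), i.e. 1.0 + 2.0
@show a + b   # Tropical(2.0), i.e. max(1.0, 2.0)

# A small max-plus matrix product on the CPU. CuTropicalGEMM provides the
# same semantics for CUDA arrays, backed by a dedicated GPU kernel.
A = Tropical.([1.0 2.0; 3.0 4.0])
B = Tropical.([0.5 1.5; 2.5 3.5])
@show A * B   # entry (i, j) is max_k (A[i, k] + B[k, j])
```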

+ +## Features + +CuTropicalGEMM supports GEMM for various matrix element types: * and-or algebra: `TropicalAndOr` * max-plus algebra: `Tropical{Float32/Float64}` -* min-plus algebra numbers: `TropicalMinPlus{Float32/Float64}` -* max-times algebra numbers: `TropicalMaxMul{Float32/Float64/Int32/Int64}` +* min-plus algebra: `TropicalMinPlus{Float32/Float64}` +* max-times algebra: `TropicalMaxMul{Float32/Float64/Int32/Int64}` -Please check [`TropicalNumbers.jl`](https://github.com/TensorBFS/TropicalNumbers.jl) for the definitions of these types. +Please check [`TropicalNumbers.jl`](https://github.com/TensorBFS/TropicalNumbers.jl) for the definitions of these types and the corresponding semiring algebras. -## Get started +## Getting Started Open a Julia REPL and type `]` to enter the `pkg>` mode, and then install related packages with ```julia pkg> add CuTropicalGEMM, BenchmarkTools, TropicalNumbers, CUDA -pkg> build ``` -Loading `CuTropicalGEMM` module into the workspace affects the `*` and `LinearAlgebra.mul!` on CuTropical matrices immediately. The following is a minimum working example + +Loading the `CuTropicalGEMM` module into the workspace immediately overloads `*` and `LinearAlgebra.mul!` for tropical CUDA matrices. +The following is a minimal working example: ```julia julia> using TropicalNumbers, CUDA, BenchmarkTools, LinearAlgebra julia> a = Tropical.(CUDA.randn(4096, 4096)); -julia> @benchmark CUDA.@sync $a * $a -BenchmarkTools.Trial: 44 samples with 1 evaluation. - Range (min … max): 108.365 ms … 123.031 ms ┊ GC (min … max): 0.00% … 0.00% - Time (median): 116.051 ms ┊ GC (median): 0.00% - Time (mean ± σ): 116.289 ms ± 4.390 ms ┊ GC (mean ± σ): 0.00% ± 0.00% - ▁▁ ▁▁ ▁ ▄ ▁ ▁█ - ▆▁▁▁▁▁▆▆▆▆▆██▁▆▁▁▆▁▆██▁▁▆▁▆▆▆▁▁▆▆▁▁▁▁▆▆▁▁▁▁█▆▆▁▁▁█▆█▆▆██▁▁▁▁▆ ▁ - 108 ms Histogram: frequency by time 123 ms < - - Memory estimate: 5.03 KiB, allocs estimate: 95. +julia> @btime CUDA.@sync $a * $a; + 116.272 ms (60 allocations: 2.69 KiB) julia> using CuTropicalGEMM julia> @benchmark CUDA.@sync $a * $a -BenchmarkTools.Trial: 440 samples with 1 evaluation. - Range (min … max): 8.920 μs … 24.497 ms ┊ GC (min … max): 0.00% … 0.00% - Time (median): 10.733 ms ┊ GC (median): 0.00% - Time (mean ± σ): 11.363 ms ± 11.347 ms ┊ GC (mean ± σ): 0.00% ± 0.00% +BenchmarkTools.Trial: 93 samples with 4 evaluations. + Range (min … max): 6.653 μs … 158.961 ms ┊ GC (min … max): 0.00% … 0.00% + Time (median): 13.535 ms ┊ GC (median): 0.00% + Time (mean ± σ): 13.499 ms ± 15.867 ms ┊ GC (mean ± σ): 0.00% ± 0.00% - █ ▅▇▂ - █▅▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▄▁▄▄███ ▆ - 8.92 μs Histogram: log(frequency) by time 23.3 ms < + █ + ▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ ▁ + 6.65 μs Histogram: frequency by time 13.5 ms < - Memory estimate: 160 bytes, allocs estimate: 4. + Memory estimate: 256 bytes, allocs estimate: 7. +``` - julia> o = Tropical.(CUDA.zeros(4096, 4096)); +You can also use `LinearAlgebra.mul!(o, a, b)`, which lets you preallocate the output matrix: -julia> @benchmark CUDA.@sync LinearAlgebra.mul!($o, $a, $a) -BenchmarkTools.Trial: 70 samples with 7 evaluations. - Range (min … max): 4.232 μs … 11.490 ms ┊ GC (min … max): 0.00% … 0.00% - Time (median): 11.459 ms ┊ GC (median): 0.00% - Time (mean ± σ): 10.349 ms ± 3.337 ms ┊ GC (mean ± σ): 0.00% ± 0.00% +```julia +julia> o = Tropical.(CUDA.zeros(4096, 4096)); + +julia> @benchmark CUDA.@sync mul!($o, $a, $a) +BenchmarkTools.Trial: 61 samples with 7 evaluations. 
+ Range (min … max): 4.584 μs … 13.540 ms ┊ GC (min … max): 0.00% … 0.00% + Time (median): 13.536 ms ┊ GC (median): 0.00% + Time (mean ± σ): 11.892 ms ± 4.375 ms ┊ GC (mean ± σ): 0.00% ± 0.00% █ - ▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ ▁ - 4.23 μs Histogram: frequency by time 11.5 ms < + ▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ ▁ + 4.58 μs Histogram: frequency by time 13.5 ms < Memory estimate: 0 bytes, allocs estimate: 0. ``` @@ -75,6 +82,15 @@ The benchmark and plotting scripts could be found in the benchmarks folder. For matrix size large enough ($n > 3000$), the computation power is about $13$ TFlops for FP32 operations and $3$ TFlops for FP64 operations. +## Questions and Contributions + +Please open an [issue](https://github.com/TensorBFS/CuTropicalGEMM.jl/issues) +if you encounter any problems or have any feature requests. + +## Acknowledgement + +We would like to thank Tim Besard for his invaluable guidance and support during the development of the package; his expertise in GPU utilization has been immensely helpful. We would also like to thank Tyler Thomas for his assistance in understanding the usage of `BinaryBuilder.jl`. + ## References 1. This package originates from the following issue: https://github.com/JuliaSIMD/LoopVectorization.jl/issues/201 diff --git a/deps/tropicalgemm_kernels.cu b/deps/tropicalgemm_kernels.cu index 4a23c01..8d3ac9b 100644 --- a/deps/tropicalgemm_kernels.cu +++ b/deps/tropicalgemm_kernels.cu @@ -105,7 +105,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TT)( TYPE beta, int M, int N, - int K + int K, + int DIM_GRID_X, + int DIM_GRID_Y ) { // size of thread block @@ -115,6 +117,8 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TT)( // thread id const int tid = threadIdx.y * bszx + threadIdx.x; + int BLOCK_IDX = blockIdx.x % DIM_GRID_X; + int BLOCK_IDY = blockIdx.x / DIM_GRID_X; // shared memory @@ -153,9 +157,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TT)( #pragma unroll for ( int i = 0 ; i < BLOCK_SIZE_M ; i += A_TILE_ROW_STRIDE) { - const int row = BLOCK_SIZE_M * blockIdx.y + i + A_TILE_ROW ; + const int row = BLOCK_SIZE_M * BLOCK_IDY + i + A_TILE_ROW ; const int col = A_TILE_COL + tile_idx; - if (tile_idx > K - BLOCK_SIZE_K || blockIdx.y == gridDim.y - 1) { + if (tile_idx > K - BLOCK_SIZE_K || BLOCK_IDY == DIM_GRID_Y - 1) { As[OFFSET_row(i + A_TILE_ROW, A_TILE_COL, BLOCK_SIZE_K)] = row < M && col < K ? A[OFFSET_row( row, // row col, // col @@ -172,8 +176,8 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TT)( #pragma unroll for ( int i = 0 ; i < BLOCK_SIZE_K; i += B_TILE_ROW_STRIDE) { const int row = tile_idx + i + B_TILE_ROW; - const int col = B_TILE_COL + BLOCK_SIZE_N * blockIdx.x; - if (blockIdx.x == gridDim.x -1 || tile_idx > K - BLOCK_SIZE_K) { + const int col = B_TILE_COL + BLOCK_SIZE_N * BLOCK_IDX; + if (BLOCK_IDX == DIM_GRID_X -1 || tile_idx > K - BLOCK_SIZE_K) { Bs[OFFSET_row(i + B_TILE_ROW, B_TILE_COL, BLOCK_SIZE_N)] = row < K && col < N ? 
B[OFFSET_row( row, // row col, // col @@ -219,9 +223,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TT)( for (int thread_y = 0; thread_y < THREAD_SIZE_M; ++thread_y) { #pragma unroll for (int thread_x = 0; thread_x < THREAD_SIZE_N; ++thread_x) { - const int row = BLOCK_SIZE_M * blockIdx.y + THREAD_SIZE_M * threadIdx.y + thread_y; - const int col = BLOCK_SIZE_N * blockIdx.x + THREAD_SIZE_N * threadIdx.x + thread_x; - if (blockIdx.x == gridDim.x -1 || blockIdx.y == gridDim.y - 1) { + const int row = BLOCK_SIZE_M * BLOCK_IDY + THREAD_SIZE_M * threadIdx.y + thread_y; + const int col = BLOCK_SIZE_N * BLOCK_IDX + THREAD_SIZE_N * threadIdx.x + thread_x; + if (BLOCK_IDX == DIM_GRID_X -1 || BLOCK_IDY == DIM_GRID_Y - 1) { if (row < M && col < N) { C[OFFSET_col(row, col, M)] = OPERATOR_ADD( OPERATOR_MUL(C[OFFSET_col(row, col, M)], beta), @@ -236,6 +240,7 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TT)( } } } + __syncthreads(); } template < @@ -253,7 +258,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TN)( TYPE beta, int M, int N, - int K + int K, + int DIM_GRID_X, + int DIM_GRID_Y ) { // size of thread block @@ -265,6 +272,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TN)( const int tid_A = threadIdx.y * bszx + threadIdx.x; const int tid_B = threadIdx.y + threadIdx.x * bszy; + int BLOCK_IDX = blockIdx.x % DIM_GRID_X; + int BLOCK_IDY = blockIdx.x / DIM_GRID_X; + // shared memory __shared__ TYPE As[BLOCK_SIZE_M * BLOCK_SIZE_K]; // avoid bank conflict @@ -301,9 +311,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TN)( #pragma unroll for ( int i = 0 ; i < BLOCK_SIZE_M ; i += A_TILE_ROW_STRIDE) { - const int row = BLOCK_SIZE_M * blockIdx.y + i + A_TILE_ROW ; + const int row = BLOCK_SIZE_M * BLOCK_IDY + i + A_TILE_ROW ; const int col = A_TILE_COL + tile_idx; - if (tile_idx > K - BLOCK_SIZE_K || blockIdx.y == gridDim.y - 1) { + if (tile_idx > K - BLOCK_SIZE_K || BLOCK_IDY == DIM_GRID_Y - 1) { As[OFFSET_row(i + A_TILE_ROW, A_TILE_COL, BLOCK_SIZE_K)] = row < M && col < K ? A[OFFSET_row( row, // row col, // col @@ -320,8 +330,8 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TN)( #pragma unroll for ( int i = 0 ; i < BLOCK_SIZE_N; i += B_TILE_COL_STRIDE) { const int row = tile_idx + B_TILE_ROW; - const int col = B_TILE_COL + i + BLOCK_SIZE_N * blockIdx.x; - if (blockIdx.x == gridDim.x -1 || tile_idx > K - BLOCK_SIZE_K) { + const int col = B_TILE_COL + i + BLOCK_SIZE_N * BLOCK_IDX; + if (BLOCK_IDX == DIM_GRID_X -1 || tile_idx > K - BLOCK_SIZE_K) { Bs[OFFSET_row(B_TILE_ROW, i + B_TILE_COL, BLOCK_SIZE_N)] = row < K && col < N ? 
B[OFFSET_col(row, col, K)] : PADDING; } else { Bs[OFFSET_row(B_TILE_ROW, i + B_TILE_COL, BLOCK_SIZE_N)] = B[OFFSET_col(row, col, K)]; @@ -361,9 +371,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TN)( for (int thread_y = 0; thread_y < THREAD_SIZE_M; ++thread_y) { #pragma unroll for (int thread_x = 0; thread_x < THREAD_SIZE_N; ++thread_x) { - const int row = BLOCK_SIZE_M * blockIdx.y + THREAD_SIZE_M * threadIdx.y + thread_y; - const int col = BLOCK_SIZE_N * blockIdx.x + THREAD_SIZE_N * threadIdx.x + thread_x; - if (blockIdx.x == gridDim.x -1 || blockIdx.y == gridDim.y - 1) { + const int row = BLOCK_SIZE_M * BLOCK_IDY + THREAD_SIZE_M * threadIdx.y + thread_y; + const int col = BLOCK_SIZE_N * BLOCK_IDX + THREAD_SIZE_N * threadIdx.x + thread_x; + if (BLOCK_IDX == DIM_GRID_X -1 || BLOCK_IDY == DIM_GRID_Y - 1) { if (row < M && col < N) { C[OFFSET_col(row, col, M)] = OPERATOR_ADD( OPERATOR_MUL(C[OFFSET_col(row, col, M)], beta), @@ -378,6 +388,7 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TN)( } } } + __syncthreads(); } template < @@ -395,7 +406,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NT)( TYPE beta, int M, int N, - int K + int K, + int DIM_GRID_X, + int DIM_GRID_Y ) { // size of thread block @@ -407,6 +420,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NT)( const int BLOCK_SIZE_KN = BLOCK_SIZE_K * BLOCK_SIZE_N; const int THREAD_SIZE_MN = THREAD_SIZE_M * THREAD_SIZE_N; + int BLOCK_IDX = blockIdx.x % DIM_GRID_X; + int BLOCK_IDY = blockIdx.x / DIM_GRID_X; + // thread id const int tid = threadIdx.y * bszm + threadIdx.x; @@ -446,10 +462,10 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NT)( // load A from global memory to shared memory #pragma unroll for ( int i = 0 ; i < BLOCK_SIZE_K ; i += A_TILE_COL_STRIDE) { - const int row = BLOCK_SIZE_M * blockIdx.x + A_TILE_ROW ; + const int row = BLOCK_SIZE_M * BLOCK_IDX + A_TILE_ROW ; const int col = A_TILE_COL + i + tile_idx; - if (blockIdx.x == gridDim.x -1 || tile_idx >= K - BLOCK_SIZE_K) { + if (BLOCK_IDX == DIM_GRID_X -1 || tile_idx >= K - BLOCK_SIZE_K) { As[OFFSET_col(A_TILE_ROW, i + A_TILE_COL, BLOCK_SIZE_M)] = row < M && col < K ? A[OFFSET_col(row, col, M)] : PADDING; } else { As[OFFSET_col(A_TILE_ROW, i + A_TILE_COL, BLOCK_SIZE_M)] = A[OFFSET_col(row, col, M)]; @@ -460,9 +476,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NT)( #pragma unroll for ( int i = 0 ; i < BLOCK_SIZE_K; i += B_TILE_ROW_STRIDE) { const int row = tile_idx + i + B_TILE_ROW; - const int col = B_TILE_COL + BLOCK_SIZE_N * blockIdx.y; + const int col = B_TILE_COL + BLOCK_SIZE_N * BLOCK_IDY; - if (blockIdx.y == gridDim.y -1 || tile_idx > K - BLOCK_SIZE_K) { + if (BLOCK_IDY == DIM_GRID_Y -1 || tile_idx > K - BLOCK_SIZE_K) { Bs[OFFSET_row(i + B_TILE_ROW, B_TILE_COL, BLOCK_SIZE_N)] = row < K && col < N ? 
B[OFFSET_row(row, col, N)] : PADDING; } else { Bs[OFFSET_row(i + B_TILE_ROW, B_TILE_COL, BLOCK_SIZE_N)] = B[OFFSET_row(row, col, N)]; @@ -504,9 +520,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NT)( for (int thread_m = 0; thread_m < THREAD_SIZE_M; ++thread_m) { #pragma unroll for (int thread_n = 0; thread_n < THREAD_SIZE_N; ++thread_n) { - const int col = BLOCK_SIZE_N * blockIdx.y + THREAD_SIZE_N * threadIdx.y + thread_n; - const int row = BLOCK_SIZE_M * blockIdx.x + THREAD_SIZE_M * threadIdx.x + thread_m; - if (blockIdx.x == gridDim.x -1 || blockIdx.y == gridDim.y - 1) { + const int col = BLOCK_SIZE_N * BLOCK_IDY + THREAD_SIZE_N * threadIdx.y + thread_n; + const int row = BLOCK_SIZE_M * BLOCK_IDX + THREAD_SIZE_M * threadIdx.x + thread_m; + if (BLOCK_IDX == DIM_GRID_X -1 || BLOCK_IDY == DIM_GRID_Y - 1) { if (row < M && col < N) { C[OFFSET_col(row, col, M)] = OPERATOR_ADD( OPERATOR_MUL(accum[OFFSET_col(thread_m, thread_n, THREAD_SIZE_M)], alpha), @@ -521,6 +537,7 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NT)( } } } + __syncthreads(); } template < @@ -538,7 +555,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NN)( TYPE beta, int M, int N, - int K + int K, + int DIM_GRID_X, + int DIM_GRID_Y ) { // size of thread block @@ -550,6 +569,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NN)( const int BLOCK_SIZE_KN = BLOCK_SIZE_K * BLOCK_SIZE_N; const int THREAD_SIZE_MN = THREAD_SIZE_M * THREAD_SIZE_N; + int BLOCK_IDX = blockIdx.x % DIM_GRID_X; + int BLOCK_IDY = blockIdx.x / DIM_GRID_X; + // thread id const int tid = threadIdx.y * bszm + threadIdx.x; @@ -592,10 +614,10 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NN)( // load A from global memory to shared memory #pragma unroll for ( int i = 0 ; i < BLOCK_SIZE_K ; i += A_TILE_COL_STRIDE) { - const int row = BLOCK_SIZE_M * blockIdx.x + A_TILE_ROW ; + const int row = BLOCK_SIZE_M * BLOCK_IDX + A_TILE_ROW ; const int col = A_TILE_COL + i + tile_idx; - if (blockIdx.x == gridDim.x -1 || tile_idx >= K - BLOCK_SIZE_K) { + if (BLOCK_IDX == DIM_GRID_X -1 || tile_idx >= K - BLOCK_SIZE_K) { As[OFFSET_col(A_TILE_ROW, i + A_TILE_COL, BLOCK_SIZE_M)] = row < M && col < K ? A[OFFSET_col(row, col, M)] : PADDING; } else { As[OFFSET_col(A_TILE_ROW, i + A_TILE_COL, BLOCK_SIZE_M)] = A[OFFSET_col(row, col, M)]; @@ -606,9 +628,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NN)( #pragma unroll for ( int i = 0 ; i < BLOCK_SIZE_N; i += B_TILE_COL_STRIDE) { const int row = tile_idx + B_TILE_ROW; - const int col = BLOCK_SIZE_N * blockIdx.y + i + B_TILE_COL; + const int col = BLOCK_SIZE_N * BLOCK_IDY + i + B_TILE_COL; - if (tile_idx >= K - BLOCK_SIZE_K || blockIdx.y == gridDim.y - 1) { + if (tile_idx >= K - BLOCK_SIZE_K || BLOCK_IDY == DIM_GRID_Y - 1) { Bs[OFFSET_col(B_TILE_ROW, i + B_TILE_COL, BLOCK_SIZE_K)] = row < K && col < N ? 
B[OFFSET_col(row, col, K)] : PADDING; } else { Bs[OFFSET_col(B_TILE_ROW, i + B_TILE_COL, BLOCK_SIZE_K)] = B[OFFSET_col(row, col, K)]; @@ -650,10 +672,10 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NN)( for (int thread_m = 0; thread_m < THREAD_SIZE_M; ++thread_m) { #pragma unroll for (int thread_n = 0; thread_n < THREAD_SIZE_N; ++thread_n) { - const int col = BLOCK_SIZE_N * blockIdx.y + THREAD_SIZE_N * threadIdx.y + thread_n; - const int row = BLOCK_SIZE_M * blockIdx.x + THREAD_SIZE_M * threadIdx.x + thread_m; + const int col = BLOCK_SIZE_N * BLOCK_IDY + THREAD_SIZE_N * threadIdx.y + thread_n; + const int row = BLOCK_SIZE_M * BLOCK_IDX + THREAD_SIZE_M * threadIdx.x + thread_m; - if (blockIdx.x == gridDim.x -1 || blockIdx.y == gridDim.y - 1) { + if (BLOCK_IDX == DIM_GRID_X -1 || BLOCK_IDY == DIM_GRID_Y - 1) { if (row < M && col < N) { C[OFFSET_col(row, col, M)] = OPERATOR_ADD( OPERATOR_MUL(accum[OFFSET_col(thread_m, thread_n, THREAD_SIZE_M)], alpha), @@ -668,6 +690,7 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NN)( } } } + __syncthreads(); } extern "C"{ @@ -682,53 +705,70 @@ void CONCATENATE(TYPENAME, FUNCNAME)(const int m, const int n, const int k, TYPE const int BLOCK_SIZE_N = 64; const int THREAD_SIZE_M = 4; const int THREAD_SIZE_N = 4; + if (TA == T && TB == T) { dim3 dimBlock(BLOCK_SIZE_N / THREAD_SIZE_N, BLOCK_SIZE_M / THREAD_SIZE_M); - dim3 dimGrid(n / BLOCK_SIZE_N, m / BLOCK_SIZE_M); + + int DIM_GRID_X = n / BLOCK_SIZE_N; + int DIM_GRID_Y = m / BLOCK_SIZE_M; if (n % BLOCK_SIZE_N != 0) - dimGrid.x++; + DIM_GRID_X++; if (m % BLOCK_SIZE_M != 0) - dimGrid.y++; + DIM_GRID_Y++; + + dim3 dimGrid(DIM_GRID_X * DIM_GRID_Y); CONCATENATETHREE(TYPENAME, FUNCNAME, TT) - <<< dimGrid, dimBlock >>>(d_A, d_B, d_C, alpha, beta, m, n, k); + <<< dimGrid, dimBlock >>>(d_A, d_B, d_C, alpha, beta, m, n, k, DIM_GRID_X, DIM_GRID_Y); } if (TA == T && TB == N) { dim3 dimBlock(BLOCK_SIZE_N / THREAD_SIZE_N, BLOCK_SIZE_M / THREAD_SIZE_M); - dim3 dimGrid(n / BLOCK_SIZE_N, m / BLOCK_SIZE_M); + + int DIM_GRID_X = n / BLOCK_SIZE_N; + int DIM_GRID_Y = m / BLOCK_SIZE_M; if (n % BLOCK_SIZE_N != 0) - dimGrid.x++; + DIM_GRID_X++; if (m % BLOCK_SIZE_M != 0) - dimGrid.y++; + DIM_GRID_Y++; + + dim3 dimGrid(DIM_GRID_X * DIM_GRID_Y); CONCATENATETHREE(TYPENAME, FUNCNAME, TN) - <<< dimGrid, dimBlock >>>(d_A, d_B, d_C, alpha, beta, m, n, k); + <<< dimGrid, dimBlock >>>(d_A, d_B, d_C, alpha, beta, m, n, k, DIM_GRID_X, DIM_GRID_Y); } if (TA == N && TB == T) { dim3 dimBlock(BLOCK_SIZE_M / THREAD_SIZE_M, BLOCK_SIZE_N / THREAD_SIZE_N); - dim3 dimGrid(m / BLOCK_SIZE_M, n / BLOCK_SIZE_N); + + int DIM_GRID_X = m / BLOCK_SIZE_M; + int DIM_GRID_Y = n / BLOCK_SIZE_N; if (m % BLOCK_SIZE_M != 0) - dimGrid.x++; + DIM_GRID_X++; if (n % BLOCK_SIZE_N != 0) - dimGrid.y++; + DIM_GRID_Y++; + + dim3 dimGrid(DIM_GRID_X * DIM_GRID_Y); CONCATENATETHREE(TYPENAME, FUNCNAME, NT) - <<< dimGrid, dimBlock >>>(d_A, d_B, d_C, alpha, beta, m, n, k); + <<< dimGrid, dimBlock >>>(d_A, d_B, d_C, alpha, beta, m, n, k, DIM_GRID_X, DIM_GRID_Y); } if (TA == N && TB == N) { dim3 dimBlock(BLOCK_SIZE_M / THREAD_SIZE_M, BLOCK_SIZE_N / THREAD_SIZE_N); - dim3 dimGrid(m / BLOCK_SIZE_M, n / BLOCK_SIZE_N); + + int DIM_GRID_X = m / BLOCK_SIZE_M; + int DIM_GRID_Y = n / BLOCK_SIZE_N; if (m % BLOCK_SIZE_M != 0) - dimGrid.x++; + DIM_GRID_X++; if (n % BLOCK_SIZE_N != 0) - dimGrid.y++; + DIM_GRID_Y++; + + dim3 dimGrid(DIM_GRID_X * DIM_GRID_Y); CONCATENATETHREE(TYPENAME, FUNCNAME, NN) - <<< dimGrid, dimBlock >>>(d_A, d_B, d_C, alpha, beta, m, n, k); + <<< dimGrid, 
dimBlock >>>(d_A, d_B, d_C, alpha, beta, m, n, k, DIM_GRID_X, DIM_GRID_Y); } } diff --git a/test/Project.toml b/test/Project.toml new file mode 100644 index 0000000..416b83f --- /dev/null +++ b/test/Project.toml @@ -0,0 +1,5 @@ +[deps] +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +TropicalNumbers = "b3a74e9c-7526-4576-a4eb-79c0d4c32334" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" \ No newline at end of file
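A note on the kernel changes above: the host launchers now build a one-dimensional grid of `DIM_GRID_X * DIM_GRID_Y` blocks (with each dimension rounded up when the matrix size is not a multiple of the tile size) and pass both extents to the kernels, which recover their two-dimensional tile coordinates as `blockIdx.x % DIM_GRID_X` and `blockIdx.x / DIM_GRID_X`. The sketch below is illustrative Julia, not part of the package; it mirrors that index arithmetic with made-up matrix and tile sizes and checks that every output tile is visited exactly once.

```julia
# Mirror of the index mapping added to the kernels:
#   BLOCK_IDX = blockIdx.x % DIM_GRID_X
#   BLOCK_IDY = blockIdx.x / DIM_GRID_X   (integer division)
# The sizes below are hypothetical; the launchers use fixed tile sizes
# (e.g. BLOCK_SIZE_N = 64) and round the grid dimensions up.
M, N = 300, 200
BLOCK_SIZE_M, BLOCK_SIZE_N = 64, 64

DIM_GRID_Y = cld(M, BLOCK_SIZE_M)   # row tiles, rounded up
DIM_GRID_X = cld(N, BLOCK_SIZE_N)   # column tiles, rounded up

visited = zeros(Int, DIM_GRID_X, DIM_GRID_Y)
for block_id in 0:(DIM_GRID_X * DIM_GRID_Y - 1)   # 0-based, like blockIdx.x
    block_idx = block_id % DIM_GRID_X   # tile index along x
    block_idy = block_id ÷ DIM_GRID_X   # tile index along y
    visited[block_idx + 1, block_idy + 1] += 1   # shift to 1-based indexing
end
@assert all(==(1), visited)   # each output tile is covered exactly once
```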