diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 9626410..0000000 --- a/.travis.yml +++ /dev/null @@ -1,26 +0,0 @@ -# Documentation: http://docs.travis-ci.com/user/languages/julia -language: julia -notifications: - email: false -julia: - - 1.0 - - 1.9 - - nightly -os: - - linux -arch: - - x64 -cache: - directories: - - ~/.julia/artifacts -jobs: - fast_finish: true - allow_failures: - - julia: nightly -after_success: - - | - julia -e ' - using Pkg - Pkg.add("Coverage") - using Coverage - Codecov.submit(process_folder())' diff --git a/Artifacts.toml b/Artifacts.toml deleted file mode 100644 index 795958d..0000000 --- a/Artifacts.toml +++ /dev/null @@ -1,13 +0,0 @@ -[CUDA_lib] -git-tree-sha1 = "2918fba865582556e219191a7f393c47c2e822e0" - - [[CUDA_lib.download]] - sha256 = "751bf9d1f2921d4176ffb8ed1ddbd59bb60d6a517e6784bb71d61b62357c0007" - url = "https://gist.github.com/ArrogantGao/c38791f143d36d4b2481ac7e4aa4ecce/raw/2918fba865582556e219191a7f393c47c2e822e0.tar.gz" - -[CuTropicalGemm_lib] -git-tree-sha1 = "4ba15e5eb224d3a635e827eb0a34e304bcbb8cc0" - - [[CuTropicalGemm_lib.download]] - sha256 = "9c38e1268465fb8dbeae547b7af411a1eebd1890015447e0d3f996a462f808f2" - url = "https://gist.github.com/ArrogantGao/163ac76a4de8c2c0e3694b4cae1068ae/raw/4ba15e5eb224d3a635e827eb0a34e304bcbb8cc0.tar.gz" diff --git a/Project.toml b/Project.toml index 17380cd..2950470 100644 --- a/Project.toml +++ b/Project.toml @@ -4,17 +4,14 @@ authors = ["Xuanzhao Gao and contributors"] version = "1.0.0-DEV" [deps] -ArtifactUtils = "8b73e784-e7d8-4ea5-973d-377fed4e3bce" -Artifacts = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" -LLVM = "929cbde3-209d-540e-8aea-75f648917ca0" -LLVMLoopInfo = "8b046642-f1f6-4319-8d3c-209ddc03c586" -Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -Revise = "295af30f-e4ad-537b-8983-00126c2a3abe" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" TropicalNumbers = "b3a74e9c-7526-4576-a4eb-79c0d4c32334" [compat] +CUDA = "5" +TropicalNumbers = "0.6.2" julia = "1" [extras] diff --git a/README.md b/README.md index 6f08c95..006e6a8 100644 --- a/README.md +++ b/README.md @@ -1,67 +1,74 @@ # CuTropicalGEMM -[![Build Status](https://github.com/ArrogantGao/CuTropicalGEMM.jl/actions/workflows/CI.yml/badge.svg?branch=main)](https://github.com/ArrogantGao/CuTropicalGEMM.jl/actions/workflows/CI.yml?query=branch%3Amain) -[![Build Status](https://travis-ci.com/ArrogantGao/CuTropicalGEMM.jl.svg?branch=main)](https://travis-ci.com/ArrogantGao/CuTropicalGEMM.jl) +[![Build status](https://badge.buildkite.com/06c24dc7b1a9d7c38897acd21575ffd678ee03de190c0b8d81.svg)](https://buildkite.com/julialang/cutropicalgemm-dot-jl) [![Coverage](https://codecov.io/gh/ArrogantGao/CuTropicalGEMM.jl/branch/main/graph/badge.svg)](https://codecov.io/gh/ArrogantGao/CuTropicalGEMM.jl) -CuTropicalGEMM is a fast Tropical matrix multiplication on Nvidia GPU. Supported matrix element types include +

+CuTropicalGEMM is an open source Julia package for fast generic matrix multiplication (GEMM) of tropical numbers on Nvidia GPUs, based on CUDA. +It greatly speeds up tropical GEMM, which is widely used in tensor network contractions. +
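For readers new to tropical algebra, the short sketch below (plain CPU Julia, using only the `TropicalNumbers` package linked in the README) illustrates the semiring that the GPU kernels implement: tropical multiplication is ordinary addition and tropical addition is `max`, so a tropical matrix product is a max-plus reduction. The matrix values here are made up for illustration.

```julia
using TropicalNumbers

# Tropical (max-plus) scalars: `*` is ordinary `+`, and `+` is `max`.
a, b = Tropical(1.0), Tropical(2.0)
@show a * b   # Tropical(3.0), i.e. 1.0 + 2.0
@show a + b   # Tropical(2.0), i.e. max(1.0, 2.0)

# A small max-plus matrix product on the CPU. CuTropicalGEMM provides the
# same semantics for CUDA arrays, backed by a dedicated GPU kernel.
A = Tropical.([1.0 2.0; 3.0 4.0])
B = Tropical.([0.5 1.5; 2.5 3.5])
@show A * B   # entry (i, j) is max_k (A[i, k] + B[k, j])
```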

+ +## Features + +CuTropicalGEMM supports GEMM for various matrix element types: * and-or algebra: `TropicalAndOr` * max-plus algebra: `Tropical{Float32/Float64}` -* min-plus algebra numbers: `TropicalMinPlus{Float32/Float64}` -* max-times algebra numbers: `TropicalMaxMul{Float32/Float64/Int32/Int64}` +* min-plus algebra: `TropicalMinPlus{Float32/Float64}` +* max-times algebra: `TropicalMaxMul{Float32/Float64/Int32/Int64}` -Please check [`TropicalNumbers.jl`](https://github.com/TensorBFS/TropicalNumbers.jl) for the definitions of these types. +Please check [`TropicalNumbers.jl`](https://github.com/TensorBFS/TropicalNumbers.jl) for the definitions of these types and the corresponding semiring algebras. -## Get started +## Getting Started Open a Julia REPL and type `]` to enter the `pkg>` mode, and then install related packages with ```julia pkg> add CuTropicalGEMM, BenchmarkTools, TropicalNumbers, CUDA -pkg> build ``` -Loading `CuTropicalGEMM` module into the workspace affects the `*` and `LinearAlgebra.mul!` on CuTropical matrices immediately. The following is a minimum working example + +Loading the `CuTropicalGEMM` module into the workspace immediately overloads `*` and `LinearAlgebra.mul!` for tropical CUDA matrices. +The following is a minimal working example: ```julia julia> using TropicalNumbers, CUDA, BenchmarkTools, LinearAlgebra julia> a = Tropical.(CUDA.randn(4096, 4096)); -julia> @benchmark CUDA.@sync $a * $a -BenchmarkTools.Trial: 44 samples with 1 evaluation. - Range (min … max): 108.365 ms … 123.031 ms ┊ GC (min … max): 0.00% … 0.00% - Time (median): 116.051 ms ┊ GC (median): 0.00% - Time (mean ± σ): 116.289 ms ± 4.390 ms ┊ GC (mean ± σ): 0.00% ± 0.00% - ▁▁ ▁▁ ▁ ▄ ▁ ▁█ - ▆▁▁▁▁▁▆▆▆▆▆██▁▆▁▁▆▁▆██▁▁▆▁▆▆▆▁▁▆▆▁▁▁▁▆▆▁▁▁▁█▆▆▁▁▁█▆█▆▆██▁▁▁▁▆ ▁ - 108 ms Histogram: frequency by time 123 ms < - - Memory estimate: 5.03 KiB, allocs estimate: 95. +julia> @btime CUDA.@sync $a * $a; + 116.272 ms (60 allocations: 2.69 KiB) julia> using CuTropicalGEMM julia> @benchmark CUDA.@sync $a * $a -BenchmarkTools.Trial: 440 samples with 1 evaluation. - Range (min … max): 8.920 μs … 24.497 ms ┊ GC (min … max): 0.00% … 0.00% - Time (median): 10.733 ms ┊ GC (median): 0.00% - Time (mean ± σ): 11.363 ms ± 11.347 ms ┊ GC (mean ± σ): 0.00% ± 0.00% +BenchmarkTools.Trial: 93 samples with 4 evaluations. + Range (min … max): 6.653 μs … 158.961 ms ┊ GC (min … max): 0.00% … 0.00% + Time (median): 13.535 ms ┊ GC (median): 0.00% + Time (mean ± σ): 13.499 ms ± 15.867 ms ┊ GC (mean ± σ): 0.00% ± 0.00% - █ ▅▇▂ - █▅▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▄▁▄▄███ ▆ - 8.92 μs Histogram: log(frequency) by time 23.3 ms < + █ + ▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ ▁ + 6.65 μs Histogram: frequency by time 13.5 ms < - Memory estimate: 160 bytes, allocs estimate: 4. + Memory estimate: 256 bytes, allocs estimate: 7. +``` - julia> o = Tropical.(CUDA.zeros(4096, 4096)); +You can also use `LinearAlgebra.mul!(o, a, b)`, which lets you preallocate the output matrix: -julia> @benchmark CUDA.@sync LinearAlgebra.mul!($o, $a, $a) -BenchmarkTools.Trial: 70 samples with 7 evaluations. - Range (min … max): 4.232 μs … 11.490 ms ┊ GC (min … max): 0.00% … 0.00% - Time (median): 11.459 ms ┊ GC (median): 0.00% - Time (mean ± σ): 10.349 ms ± 3.337 ms ┊ GC (mean ± σ): 0.00% ± 0.00% +```julia +julia> o = Tropical.(CUDA.zeros(4096, 4096)); + +julia> @benchmark CUDA.@sync mul!($o, $a, $a) +BenchmarkTools.Trial: 61 samples with 7 evaluations. 
+ Range (min … max): 4.584 μs … 13.540 ms ┊ GC (min … max): 0.00% … 0.00% + Time (median): 13.536 ms ┊ GC (median): 0.00% + Time (mean ± σ): 11.892 ms ± 4.375 ms ┊ GC (mean ± σ): 0.00% ± 0.00% █ - ▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ ▁ - 4.23 μs Histogram: frequency by time 11.5 ms < + ▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ ▁ + 4.58 μs Histogram: frequency by time 13.5 ms < Memory estimate: 0 bytes, allocs estimate: 0. ``` @@ -75,6 +82,15 @@ The benchmark and plotting scripts could be found in the benchmarks folder. For matrix size large enough ($n > 3000$), the computation power is about $13$ TFlops for FP32 operations and $3$ TFlops for FP64 operations. +## Questions and Contributions + +Please open an [issue](https://github.com/TensorBFS/CuTropicalGEMM.jl/issues) +if you encounter any problems or have any feature requests. + +## Acknowledgement + +We would like to thank Tim Besard for his invaluable guidance and support during the development of the package; his expertise in GPU utilization has been immensely helpful. We would also like to thank Tyler Thomas for his assistance in understanding the usage of `BinaryBuilder.jl`. + ## References 1. This package originates from the following issue: https://github.com/JuliaSIMD/LoopVectorization.jl/issues/201 diff --git a/deps/tropicalgemm_kernels.cu b/deps/tropicalgemm_kernels.cu index 4a23c01..8d3ac9b 100644 --- a/deps/tropicalgemm_kernels.cu +++ b/deps/tropicalgemm_kernels.cu @@ -105,7 +105,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TT)( TYPE beta, int M, int N, - int K + int K, + int DIM_GRID_X, + int DIM_GRID_Y ) { // size of thread block @@ -115,6 +117,8 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TT)( // thread id const int tid = threadIdx.y * bszx + threadIdx.x; + int BLOCK_IDX = blockIdx.x % DIM_GRID_X; + int BLOCK_IDY = blockIdx.x / DIM_GRID_X; // shared memory @@ -153,9 +157,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TT)( #pragma unroll for ( int i = 0 ; i < BLOCK_SIZE_M ; i += A_TILE_ROW_STRIDE) { - const int row = BLOCK_SIZE_M * blockIdx.y + i + A_TILE_ROW ; + const int row = BLOCK_SIZE_M * BLOCK_IDY + i + A_TILE_ROW ; const int col = A_TILE_COL + tile_idx; - if (tile_idx > K - BLOCK_SIZE_K || blockIdx.y == gridDim.y - 1) { + if (tile_idx > K - BLOCK_SIZE_K || BLOCK_IDY == DIM_GRID_Y - 1) { As[OFFSET_row(i + A_TILE_ROW, A_TILE_COL, BLOCK_SIZE_K)] = row < M && col < K ? A[OFFSET_row( row, // row col, // col @@ -172,8 +176,8 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TT)( #pragma unroll for ( int i = 0 ; i < BLOCK_SIZE_K; i += B_TILE_ROW_STRIDE) { const int row = tile_idx + i + B_TILE_ROW; - const int col = B_TILE_COL + BLOCK_SIZE_N * blockIdx.x; - if (blockIdx.x == gridDim.x -1 || tile_idx > K - BLOCK_SIZE_K) { + const int col = B_TILE_COL + BLOCK_SIZE_N * BLOCK_IDX; + if (BLOCK_IDX == DIM_GRID_X -1 || tile_idx > K - BLOCK_SIZE_K) { Bs[OFFSET_row(i + B_TILE_ROW, B_TILE_COL, BLOCK_SIZE_N)] = row < K && col < N ? 
B[OFFSET_row( row, // row col, // col @@ -219,9 +223,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TT)( for (int thread_y = 0; thread_y < THREAD_SIZE_M; ++thread_y) { #pragma unroll for (int thread_x = 0; thread_x < THREAD_SIZE_N; ++thread_x) { - const int row = BLOCK_SIZE_M * blockIdx.y + THREAD_SIZE_M * threadIdx.y + thread_y; - const int col = BLOCK_SIZE_N * blockIdx.x + THREAD_SIZE_N * threadIdx.x + thread_x; - if (blockIdx.x == gridDim.x -1 || blockIdx.y == gridDim.y - 1) { + const int row = BLOCK_SIZE_M * BLOCK_IDY + THREAD_SIZE_M * threadIdx.y + thread_y; + const int col = BLOCK_SIZE_N * BLOCK_IDX + THREAD_SIZE_N * threadIdx.x + thread_x; + if (BLOCK_IDX == DIM_GRID_X -1 || BLOCK_IDY == DIM_GRID_Y - 1) { if (row < M && col < N) { C[OFFSET_col(row, col, M)] = OPERATOR_ADD( OPERATOR_MUL(C[OFFSET_col(row, col, M)], beta), @@ -236,6 +240,7 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TT)( } } } + __syncthreads(); } template < @@ -253,7 +258,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TN)( TYPE beta, int M, int N, - int K + int K, + int DIM_GRID_X, + int DIM_GRID_Y ) { // size of thread block @@ -265,6 +272,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TN)( const int tid_A = threadIdx.y * bszx + threadIdx.x; const int tid_B = threadIdx.y + threadIdx.x * bszy; + int BLOCK_IDX = blockIdx.x % DIM_GRID_X; + int BLOCK_IDY = blockIdx.x / DIM_GRID_X; + // shared memory __shared__ TYPE As[BLOCK_SIZE_M * BLOCK_SIZE_K]; // avoid bank conflict @@ -301,9 +311,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TN)( #pragma unroll for ( int i = 0 ; i < BLOCK_SIZE_M ; i += A_TILE_ROW_STRIDE) { - const int row = BLOCK_SIZE_M * blockIdx.y + i + A_TILE_ROW ; + const int row = BLOCK_SIZE_M * BLOCK_IDY + i + A_TILE_ROW ; const int col = A_TILE_COL + tile_idx; - if (tile_idx > K - BLOCK_SIZE_K || blockIdx.y == gridDim.y - 1) { + if (tile_idx > K - BLOCK_SIZE_K || BLOCK_IDY == DIM_GRID_Y - 1) { As[OFFSET_row(i + A_TILE_ROW, A_TILE_COL, BLOCK_SIZE_K)] = row < M && col < K ? A[OFFSET_row( row, // row col, // col @@ -320,8 +330,8 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TN)( #pragma unroll for ( int i = 0 ; i < BLOCK_SIZE_N; i += B_TILE_COL_STRIDE) { const int row = tile_idx + B_TILE_ROW; - const int col = B_TILE_COL + i + BLOCK_SIZE_N * blockIdx.x; - if (blockIdx.x == gridDim.x -1 || tile_idx > K - BLOCK_SIZE_K) { + const int col = B_TILE_COL + i + BLOCK_SIZE_N * BLOCK_IDX; + if (BLOCK_IDX == DIM_GRID_X -1 || tile_idx > K - BLOCK_SIZE_K) { Bs[OFFSET_row(B_TILE_ROW, i + B_TILE_COL, BLOCK_SIZE_N)] = row < K && col < N ? 
B[OFFSET_col(row, col, K)] : PADDING; } else { Bs[OFFSET_row(B_TILE_ROW, i + B_TILE_COL, BLOCK_SIZE_N)] = B[OFFSET_col(row, col, K)]; @@ -361,9 +371,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TN)( for (int thread_y = 0; thread_y < THREAD_SIZE_M; ++thread_y) { #pragma unroll for (int thread_x = 0; thread_x < THREAD_SIZE_N; ++thread_x) { - const int row = BLOCK_SIZE_M * blockIdx.y + THREAD_SIZE_M * threadIdx.y + thread_y; - const int col = BLOCK_SIZE_N * blockIdx.x + THREAD_SIZE_N * threadIdx.x + thread_x; - if (blockIdx.x == gridDim.x -1 || blockIdx.y == gridDim.y - 1) { + const int row = BLOCK_SIZE_M * BLOCK_IDY + THREAD_SIZE_M * threadIdx.y + thread_y; + const int col = BLOCK_SIZE_N * BLOCK_IDX + THREAD_SIZE_N * threadIdx.x + thread_x; + if (BLOCK_IDX == DIM_GRID_X -1 || BLOCK_IDY == DIM_GRID_Y - 1) { if (row < M && col < N) { C[OFFSET_col(row, col, M)] = OPERATOR_ADD( OPERATOR_MUL(C[OFFSET_col(row, col, M)], beta), @@ -378,6 +388,7 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, TN)( } } } + __syncthreads(); } template < @@ -395,7 +406,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NT)( TYPE beta, int M, int N, - int K + int K, + int DIM_GRID_X, + int DIM_GRID_Y ) { // size of thread block @@ -407,6 +420,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NT)( const int BLOCK_SIZE_KN = BLOCK_SIZE_K * BLOCK_SIZE_N; const int THREAD_SIZE_MN = THREAD_SIZE_M * THREAD_SIZE_N; + int BLOCK_IDX = blockIdx.x % DIM_GRID_X; + int BLOCK_IDY = blockIdx.x / DIM_GRID_X; + // thread id const int tid = threadIdx.y * bszm + threadIdx.x; @@ -446,10 +462,10 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NT)( // load A from global memory to shared memory #pragma unroll for ( int i = 0 ; i < BLOCK_SIZE_K ; i += A_TILE_COL_STRIDE) { - const int row = BLOCK_SIZE_M * blockIdx.x + A_TILE_ROW ; + const int row = BLOCK_SIZE_M * BLOCK_IDX + A_TILE_ROW ; const int col = A_TILE_COL + i + tile_idx; - if (blockIdx.x == gridDim.x -1 || tile_idx >= K - BLOCK_SIZE_K) { + if (BLOCK_IDX == DIM_GRID_X -1 || tile_idx >= K - BLOCK_SIZE_K) { As[OFFSET_col(A_TILE_ROW, i + A_TILE_COL, BLOCK_SIZE_M)] = row < M && col < K ? A[OFFSET_col(row, col, M)] : PADDING; } else { As[OFFSET_col(A_TILE_ROW, i + A_TILE_COL, BLOCK_SIZE_M)] = A[OFFSET_col(row, col, M)]; @@ -460,9 +476,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NT)( #pragma unroll for ( int i = 0 ; i < BLOCK_SIZE_K; i += B_TILE_ROW_STRIDE) { const int row = tile_idx + i + B_TILE_ROW; - const int col = B_TILE_COL + BLOCK_SIZE_N * blockIdx.y; + const int col = B_TILE_COL + BLOCK_SIZE_N * BLOCK_IDY; - if (blockIdx.y == gridDim.y -1 || tile_idx > K - BLOCK_SIZE_K) { + if (BLOCK_IDY == DIM_GRID_Y -1 || tile_idx > K - BLOCK_SIZE_K) { Bs[OFFSET_row(i + B_TILE_ROW, B_TILE_COL, BLOCK_SIZE_N)] = row < K && col < N ? 
B[OFFSET_row(row, col, N)] : PADDING; } else { Bs[OFFSET_row(i + B_TILE_ROW, B_TILE_COL, BLOCK_SIZE_N)] = B[OFFSET_row(row, col, N)]; @@ -504,9 +520,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NT)( for (int thread_m = 0; thread_m < THREAD_SIZE_M; ++thread_m) { #pragma unroll for (int thread_n = 0; thread_n < THREAD_SIZE_N; ++thread_n) { - const int col = BLOCK_SIZE_N * blockIdx.y + THREAD_SIZE_N * threadIdx.y + thread_n; - const int row = BLOCK_SIZE_M * blockIdx.x + THREAD_SIZE_M * threadIdx.x + thread_m; - if (blockIdx.x == gridDim.x -1 || blockIdx.y == gridDim.y - 1) { + const int col = BLOCK_SIZE_N * BLOCK_IDY + THREAD_SIZE_N * threadIdx.y + thread_n; + const int row = BLOCK_SIZE_M * BLOCK_IDX + THREAD_SIZE_M * threadIdx.x + thread_m; + if (BLOCK_IDX == DIM_GRID_X -1 || BLOCK_IDY == DIM_GRID_Y - 1) { if (row < M && col < N) { C[OFFSET_col(row, col, M)] = OPERATOR_ADD( OPERATOR_MUL(accum[OFFSET_col(thread_m, thread_n, THREAD_SIZE_M)], alpha), @@ -521,6 +537,7 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NT)( } } } + __syncthreads(); } template < @@ -538,7 +555,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NN)( TYPE beta, int M, int N, - int K + int K, + int DIM_GRID_X, + int DIM_GRID_Y ) { // size of thread block @@ -550,6 +569,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NN)( const int BLOCK_SIZE_KN = BLOCK_SIZE_K * BLOCK_SIZE_N; const int THREAD_SIZE_MN = THREAD_SIZE_M * THREAD_SIZE_N; + int BLOCK_IDX = blockIdx.x % DIM_GRID_X; + int BLOCK_IDY = blockIdx.x / DIM_GRID_X; + // thread id const int tid = threadIdx.y * bszm + threadIdx.x; @@ -592,10 +614,10 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NN)( // load A from global memory to shared memory #pragma unroll for ( int i = 0 ; i < BLOCK_SIZE_K ; i += A_TILE_COL_STRIDE) { - const int row = BLOCK_SIZE_M * blockIdx.x + A_TILE_ROW ; + const int row = BLOCK_SIZE_M * BLOCK_IDX + A_TILE_ROW ; const int col = A_TILE_COL + i + tile_idx; - if (blockIdx.x == gridDim.x -1 || tile_idx >= K - BLOCK_SIZE_K) { + if (BLOCK_IDX == DIM_GRID_X -1 || tile_idx >= K - BLOCK_SIZE_K) { As[OFFSET_col(A_TILE_ROW, i + A_TILE_COL, BLOCK_SIZE_M)] = row < M && col < K ? A[OFFSET_col(row, col, M)] : PADDING; } else { As[OFFSET_col(A_TILE_ROW, i + A_TILE_COL, BLOCK_SIZE_M)] = A[OFFSET_col(row, col, M)]; @@ -606,9 +628,9 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NN)( #pragma unroll for ( int i = 0 ; i < BLOCK_SIZE_N; i += B_TILE_COL_STRIDE) { const int row = tile_idx + B_TILE_ROW; - const int col = BLOCK_SIZE_N * blockIdx.y + i + B_TILE_COL; + const int col = BLOCK_SIZE_N * BLOCK_IDY + i + B_TILE_COL; - if (tile_idx >= K - BLOCK_SIZE_K || blockIdx.y == gridDim.y - 1) { + if (tile_idx >= K - BLOCK_SIZE_K || BLOCK_IDY == DIM_GRID_Y - 1) { Bs[OFFSET_col(B_TILE_ROW, i + B_TILE_COL, BLOCK_SIZE_K)] = row < K && col < N ? 
B[OFFSET_col(row, col, K)] : PADDING; } else { Bs[OFFSET_col(B_TILE_ROW, i + B_TILE_COL, BLOCK_SIZE_K)] = B[OFFSET_col(row, col, K)]; @@ -650,10 +672,10 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NN)( for (int thread_m = 0; thread_m < THREAD_SIZE_M; ++thread_m) { #pragma unroll for (int thread_n = 0; thread_n < THREAD_SIZE_N; ++thread_n) { - const int col = BLOCK_SIZE_N * blockIdx.y + THREAD_SIZE_N * threadIdx.y + thread_n; - const int row = BLOCK_SIZE_M * blockIdx.x + THREAD_SIZE_M * threadIdx.x + thread_m; + const int col = BLOCK_SIZE_N * BLOCK_IDY + THREAD_SIZE_N * threadIdx.y + thread_n; + const int row = BLOCK_SIZE_M * BLOCK_IDX + THREAD_SIZE_M * threadIdx.x + thread_m; - if (blockIdx.x == gridDim.x -1 || blockIdx.y == gridDim.y - 1) { + if (BLOCK_IDX == DIM_GRID_X -1 || BLOCK_IDY == DIM_GRID_Y - 1) { if (row < M && col < N) { C[OFFSET_col(row, col, M)] = OPERATOR_ADD( OPERATOR_MUL(accum[OFFSET_col(thread_m, thread_n, THREAD_SIZE_M)], alpha), @@ -668,6 +690,7 @@ __global__ void CONCATENATETHREE(TYPENAME, FUNCNAME, NN)( } } } + __syncthreads(); } extern "C"{ @@ -682,53 +705,70 @@ void CONCATENATE(TYPENAME, FUNCNAME)(const int m, const int n, const int k, TYPE const int BLOCK_SIZE_N = 64; const int THREAD_SIZE_M = 4; const int THREAD_SIZE_N = 4; + if (TA == T && TB == T) { dim3 dimBlock(BLOCK_SIZE_N / THREAD_SIZE_N, BLOCK_SIZE_M / THREAD_SIZE_M); - dim3 dimGrid(n / BLOCK_SIZE_N, m / BLOCK_SIZE_M); + + int DIM_GRID_X = n / BLOCK_SIZE_N; + int DIM_GRID_Y = m / BLOCK_SIZE_M; if (n % BLOCK_SIZE_N != 0) - dimGrid.x++; + DIM_GRID_X++; if (m % BLOCK_SIZE_M != 0) - dimGrid.y++; + DIM_GRID_Y++; + + dim3 dimGrid(DIM_GRID_X * DIM_GRID_Y); CONCATENATETHREE(TYPENAME, FUNCNAME, TT) - <<< dimGrid, dimBlock >>>(d_A, d_B, d_C, alpha, beta, m, n, k); + <<< dimGrid, dimBlock >>>(d_A, d_B, d_C, alpha, beta, m, n, k, DIM_GRID_X, DIM_GRID_Y); } if (TA == T && TB == N) { dim3 dimBlock(BLOCK_SIZE_N / THREAD_SIZE_N, BLOCK_SIZE_M / THREAD_SIZE_M); - dim3 dimGrid(n / BLOCK_SIZE_N, m / BLOCK_SIZE_M); + + int DIM_GRID_X = n / BLOCK_SIZE_N; + int DIM_GRID_Y = m / BLOCK_SIZE_M; if (n % BLOCK_SIZE_N != 0) - dimGrid.x++; + DIM_GRID_X++; if (m % BLOCK_SIZE_M != 0) - dimGrid.y++; + DIM_GRID_Y++; + + dim3 dimGrid(DIM_GRID_X * DIM_GRID_Y); CONCATENATETHREE(TYPENAME, FUNCNAME, TN) - <<< dimGrid, dimBlock >>>(d_A, d_B, d_C, alpha, beta, m, n, k); + <<< dimGrid, dimBlock >>>(d_A, d_B, d_C, alpha, beta, m, n, k, DIM_GRID_X, DIM_GRID_Y); } if (TA == N && TB == T) { dim3 dimBlock(BLOCK_SIZE_M / THREAD_SIZE_M, BLOCK_SIZE_N / THREAD_SIZE_N); - dim3 dimGrid(m / BLOCK_SIZE_M, n / BLOCK_SIZE_N); + + int DIM_GRID_X = m / BLOCK_SIZE_M; + int DIM_GRID_Y = n / BLOCK_SIZE_N; if (m % BLOCK_SIZE_M != 0) - dimGrid.x++; + DIM_GRID_X++; if (n % BLOCK_SIZE_N != 0) - dimGrid.y++; + DIM_GRID_Y++; + + dim3 dimGrid(DIM_GRID_X * DIM_GRID_Y); CONCATENATETHREE(TYPENAME, FUNCNAME, NT) - <<< dimGrid, dimBlock >>>(d_A, d_B, d_C, alpha, beta, m, n, k); + <<< dimGrid, dimBlock >>>(d_A, d_B, d_C, alpha, beta, m, n, k, DIM_GRID_X, DIM_GRID_Y); } if (TA == N && TB == N) { dim3 dimBlock(BLOCK_SIZE_M / THREAD_SIZE_M, BLOCK_SIZE_N / THREAD_SIZE_N); - dim3 dimGrid(m / BLOCK_SIZE_M, n / BLOCK_SIZE_N); + + int DIM_GRID_X = m / BLOCK_SIZE_M; + int DIM_GRID_Y = n / BLOCK_SIZE_N; if (m % BLOCK_SIZE_M != 0) - dimGrid.x++; + DIM_GRID_X++; if (n % BLOCK_SIZE_N != 0) - dimGrid.y++; + DIM_GRID_Y++; + + dim3 dimGrid(DIM_GRID_X * DIM_GRID_Y); CONCATENATETHREE(TYPENAME, FUNCNAME, NN) - <<< dimGrid, dimBlock >>>(d_A, d_B, d_C, alpha, beta, m, n, k); + <<< dimGrid, 
dimBlock >>>(d_A, d_B, d_C, alpha, beta, m, n, k, DIM_GRID_X, DIM_GRID_Y); } } diff --git a/test/Project.toml b/test/Project.toml new file mode 100644 index 0000000..416b83f --- /dev/null +++ b/test/Project.toml @@ -0,0 +1,5 @@ +[deps] +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +TropicalNumbers = "b3a74e9c-7526-4576-a4eb-79c0d4c32334" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" \ No newline at end of file
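A note on the kernel changes above: the host launchers now build a one-dimensional grid of `DIM_GRID_X * DIM_GRID_Y` blocks (with each dimension rounded up when the matrix size is not a multiple of the tile size) and pass both extents to the kernels, which recover their two-dimensional tile coordinates as `blockIdx.x % DIM_GRID_X` and `blockIdx.x / DIM_GRID_X`. The sketch below is illustrative Julia, not part of the package; it mirrors that index arithmetic with made-up matrix and tile sizes and checks that every output tile is visited exactly once.

```julia
# Mirror of the index mapping added to the kernels:
#   BLOCK_IDX = blockIdx.x % DIM_GRID_X
#   BLOCK_IDY = blockIdx.x / DIM_GRID_X   (integer division)
# The sizes below are hypothetical; the launchers use fixed tile sizes
# (e.g. BLOCK_SIZE_N = 64) and round the grid dimensions up.
M, N = 300, 200
BLOCK_SIZE_M, BLOCK_SIZE_N = 64, 64

DIM_GRID_Y = cld(M, BLOCK_SIZE_M)   # row tiles, rounded up
DIM_GRID_X = cld(N, BLOCK_SIZE_N)   # column tiles, rounded up

visited = zeros(Int, DIM_GRID_X, DIM_GRID_Y)
for block_id in 0:(DIM_GRID_X * DIM_GRID_Y - 1)   # 0-based, like blockIdx.x
    block_idx = block_id % DIM_GRID_X   # tile index along x
    block_idy = block_id ÷ DIM_GRID_X   # tile index along y
    visited[block_idx + 1, block_idy + 1] += 1   # shift to 1-based indexing
end
@assert all(==(1), visited)   # each output tile is covered exactly once
```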