diff --git a/scripts/build_spock_offload_cce.sh b/scripts/build_spock_offload_cce.sh
new file mode 100644
index 000000000..c8f6c5321
--- /dev/null
+++ b/scripts/build_spock_offload_cce.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+# Make sure all the paths are correct
+
+rm -r build
+rm -r install
+
+MY_PATH=$(pwd)
+
+export CC=${CC:=cc}
+export FC=${FC:=ftn}
+export CXX=${CXX:=CC}
+export BLAS_VENDOR=${BLAS_VENDOR:=Auto}
+export BML_OPENMP=${BML_OPENMP:=yes}
+export INSTALL_DIR=${INSTALL_DIR:="${MY_PATH}/install"}
+export BML_MAGMA=${BML_MAGMA:=yes}
+export MAGMA_ROOT=${MAGMA_HOME}
+export BML_TESTING=${BML_TESTING:=yes}
+export CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:=Release}
+export CMAKE_Fortran_FLAGS=${CMAKE_Fortran_FLAGS:="-ef -DCRAY_SDK"}
+#export EXTRA_CFLAGS=${EXTRA_CFLAGS:="-Ofast -DUSE_OMP_OFFLOAD -DCRAY_SDK"}
+export CMAKE_C_FLAGS=${CMAKE_C_FLAGS:="-Ofast -DUSE_OMP_OFFLOAD -DCRAY_SDK"}
+export EXTRA_LINK_FLAGS=${EXTRA_LINK_FLAGS:="-L${LIBSCI_BASE_DIR}/cray/9.0/x86_64/lib"}
+
+./build.sh configure
+
+pushd build
+make -j8
+make install
+popd
diff --git a/scripts/build_tulip_offload_cce.sh b/scripts/build_tulip_offload_cce.sh
new file mode 100755
index 000000000..a06f06d50
--- /dev/null
+++ b/scripts/build_tulip_offload_cce.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+# Make sure all the paths are correct
+
+rm -r build
+rm -r install
+
+MY_PATH=$(pwd)
+
+export CC=${CC:=cc}
+export FC=${FC:=ftn}
+export CXX=${CXX:=CC}
+export BLAS_VENDOR=${BLAS_VENDOR:=Auto}
+export BML_OPENMP=${BML_OPENMP:=yes}
+export INSTALL_DIR=${INSTALL_DIR:="${MY_PATH}/install"}
+export BML_MAGMA=${BML_MAGMA:=yes}
+export MAGMA_ROOT=${MAGMA_HOME}
+export BML_TESTING=${BML_TESTING:=yes}
+export CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:=Release}
+export CMAKE_Fortran_FLAGS=${CMAKE_Fortran_FLAGS:="-ef -DCRAY_SDK"}
+#export EXTRA_CFLAGS=${EXTRA_CFLAGS:="-Ofast -DUSE_OMP_OFFLOAD -DCRAY_SDK"}
+export CMAKE_C_FLAGS=${CMAKE_C_FLAGS:="-Ofast -DUSE_OMP_OFFLOAD -DCRAY_SDK"}
+export EXTRA_LINK_FLAGS=${EXTRA_LINK_FLAGS:="-L${BLASDIR} -l${BLASLIB}"}
+
+./build.sh configure
+
+pushd build
+make -j8
+make install
+popd
diff --git a/scripts/setenv_spock_offload.sh b/scripts/setenv_spock_offload.sh
new file mode 100644
index 000000000..66c076a51
--- /dev/null
+++ b/scripts/setenv_spock_offload.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+module load craype-accel-amd-gfx908
+#module unload cray-mvapich2 cray-libsci
+#module use /home/groups/coegroup/share/coe/modulefiles
+module load rocm/4.1.0
+#module swap cce cce/11.0.4
+#module load ompi/4.1.0/cce/11.0.4/rocm/4.1.1
+#module load blas
+#module load rocm/4.2.0
+module load cmake
+export LD_LIBRARY_PATH="$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH"
+
diff --git a/scripts/setenv_tulip_offload.sh b/scripts/setenv_tulip_offload.sh
new file mode 100644
index 000000000..8405ad6d6
--- /dev/null
+++ b/scripts/setenv_tulip_offload.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+module load craype-accel-amd-gfx908
+module unload cray-mvapich2 cray-libsci
+module use /home/groups/coegroup/share/coe/modulefiles
+module load rocm/4.1.1
+module swap cce cce/11.0.4
+module load ompi/4.1.0/cce/11.0.4/rocm/4.1.1
+module load blas
+#module load rocm/4.2.0
+module load cmake
+export LD_LIBRARY_PATH="$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH"
+
diff --git a/src/C-interface/ellpack/bml_add_ellpack_typed.c b/src/C-interface/ellpack/bml_add_ellpack_typed.c
index 8766f4ee8..41ecf5c4c 100644
--- a/src/C-interface/ellpack/bml_add_ellpack_typed.c
+++ b/src/C-interface/ellpack/bml_add_ellpack_typed.c
@@ -57,7 +57,7 @@ void TYPED_FUNC(
     int rowMin = A_localRowMin[myRank];
     int rowMax = A_localRowMax[myRank];
 
-#if !(defined(__IBMC__) || defined(__ibmxl__) || (defined(USE_OMP_OFFLOAD) && defined(INTEL_SDK)))
+#if !(defined(__IBMC__) || defined(__ibmxl__) || (defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))))
     int ix[N], jx[N];
     REAL_T x[N];
 
@@ -66,7 +66,7 @@ void TYPED_FUNC(
     memset(x, 0.0, N * sizeof(REAL_T));
 #endif
 
-#if defined(USE_OMP_OFFLOAD) && defined(INTEL_SDK)
+#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))
     int num_chunks = MIN(OFFLOAD_NUM_CHUNKS, rowMax - rowMin + 1);
 
     int all_ix[N * num_chunks], all_jx[N * num_chunks];
@@ -81,7 +81,7 @@ void TYPED_FUNC(
 #endif
 
 #if defined (USE_OMP_OFFLOAD)
-#if defined(INTEL_SDK)
+#if defined(INTEL_SDK) || defined(CRAY_SDK)
 #pragma omp teams distribute parallel for \
     shared(rowMin, rowMax)                \
     shared(A_index, A_value, A_nnz)       \
@@ -116,7 +116,7 @@ void TYPED_FUNC(
     firstprivate(ix, jx, x)
 #endif
 #endif
-#if defined(USE_OMP_OFFLOAD) && defined(INTEL_SDK)
+#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))
     for (int i = rowMin + chunk; i < rowMax; i = i + num_chunks)
 #else
     for (int i = rowMin; i < rowMax; i++)
@@ -175,7 +175,7 @@ void TYPED_FUNC(
         }
         A_nnz[i] = ll;
     }
-#if defined(USE_OMP_OFFLOAD) && defined(INTEL_SDK)
+#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))
 }
 #endif
 }
@@ -223,7 +223,7 @@ double TYPED_FUNC(
     int rowMin = A_localRowMin[myRank];
     int rowMax = A_localRowMax[myRank];
 
-#if !(defined(__IBMC__) || defined(__ibmxl__))
+#if !(defined(__IBMC__) || defined(__ibmxl__) || (defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))))
     int ix[N], jx[N];
     REAL_T x[N];
     REAL_T y[N];
@@ -234,13 +234,47 @@ double TYPED_FUNC(
     memset(y, 0.0, N * sizeof(REAL_T));
 #endif
 
+#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))
+    int num_chunks = MIN(OFFLOAD_NUM_CHUNKS, rowMax - rowMin + 1);
+
+    int all_ix[N * num_chunks], all_jx[N * num_chunks];
+    REAL_T all_x[N * num_chunks], all_y[N * num_chunks];
+
+    memset(all_ix, 0, N * num_chunks * sizeof(int));
+    memset(all_jx, 0, N * num_chunks * sizeof(int));
+    memset(all_x, 0.0, N * num_chunks * sizeof(REAL_T));
+    memset(all_y, 0.0, N * num_chunks * sizeof(REAL_T));
+
+#pragma omp target map(to:all_ix[0:N*num_chunks],all_jx[0:N*num_chunks],all_x[0:N*num_chunks],all_y[0:N*num_chunks]) map(tofrom:trnorm)
+
+#endif
+
 #if defined (USE_OMP_OFFLOAD)
-#pragma omp target teams distribute parallel for \
+#if defined(INTEL_SDK) || defined(CRAY_SDK)
+#pragma omp teams distribute parallel for \
+    shared(rowMin, rowMax)                \
+    shared(A_index, A_value, A_nnz)       \
+    shared(B_index, B_value, B_nnz)       \
+    reduction(+:trnorm)
+    for (int chunk = 0; chunk < num_chunks; chunk++)
+    {
+        int *ix, *jx;
+        REAL_T *x, *y;
+
+        ix = &all_ix[chunk * N];
+        jx = &all_jx[chunk * N];
+        x = &all_x[chunk * N];
+        y = &all_y[chunk * N];
+
+#else
+
+#pragma omp target teams distribute parallel for	\
     shared(rowMin, rowMax)                \
     shared(A_index, A_value, A_nnz)       \
     shared(B_index, B_value, B_nnz)       \
     firstprivate(ix, jx, x, y)            \
     reduction(+:trnorm)
+#endif
 #else
 #if defined(__IBMC__) || defined(__ibmxl__)
 #pragma omp parallel for                  \
@@ -257,7 +291,11 @@ double TYPED_FUNC(
     reduction(+:trnorm)
 #endif
 #endif
+#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))
+    for (int i = rowMin + chunk; i < rowMax; i = i + num_chunks)
+#else
     for (int i = rowMin; i < rowMax; i++)
+#endif
     {
 
 #if defined(__IBMC__) || defined(__ibmxl__)
@@ -321,8 +359,11 @@ double TYPED_FUNC(
         }
         A_nnz[i] = ll;
     }
+#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))
+}
+#endif
 
-    return trnorm;
+return trnorm;
 }
 
 /** Matrix addition.
@@ -348,7 +389,7 @@ void TYPED_FUNC(
     int *A_index = A->index;
     REAL_T *A_value = (REAL_T *) A->value;
 
-#if !(defined(__IBMC__) || defined(__ibmxl__))
+#if !(defined(__IBMC__) || defined(__ibmxl__) || (defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))))
     int jx[A_M];
     REAL_T x[A_M];
 
@@ -356,11 +397,38 @@ void TYPED_FUNC(
     memset(x, 0.0, A_M * sizeof(REAL_T));
 #endif
 
+#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))
+    int num_chunks = MIN(OFFLOAD_NUM_CHUNKS, N);
+
+    int all_jx[N * num_chunks];
+    REAL_T all_x[N * num_chunks];
+
+    memset(all_jx, 0, N * num_chunks * sizeof(int));
+    memset(all_x, 0.0, N * num_chunks * sizeof(REAL_T));
+
+#pragma omp target map(to:all_jx[0:N*num_chunks],all_x[0:N*num_chunks])
+
+#endif
+
 #if defined (USE_OMP_OFFLOAD)
+#if defined(INTEL_SDK) || defined(CRAY_SDK)
+#pragma omp teams distribute parallel for \
+    shared(N, A_M)                \
+    shared(A_index, A_value, A_nnz)
+    for (int chunk = 0; chunk < num_chunks; chunk++)
+    {
+        int *jx;
+        REAL_T *x;
+
+        jx = &all_jx[chunk * N];
+        x = &all_x[chunk * N];
+
+#else
 #pragma omp target teams distribute parallel for \
   shared(N, A_M)           \
     shared(A_index, A_value, A_nnz)       \
     firstprivate(jx, x)
+#endif
 #else
 #if defined(__IBMC__) || defined(__ibmxl__)
 #pragma omp parallel for \
@@ -373,7 +441,11 @@ void TYPED_FUNC(
     firstprivate(jx, x)
 #endif
 #endif
+#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))
+    for (int i = chunk; i < N; i = i + num_chunks)
+#else
     for (int i = 0; i < N; i++)
+#endif
     {
 
 #if defined(__IBMC__) || defined(__ibmxl__)
@@ -423,6 +495,9 @@ void TYPED_FUNC(
         }
         A_nnz[i] = ll;
     }
+#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))
+}
+#endif
 }
 
 /** Matrix addition.
diff --git a/src/C-interface/ellpack/bml_allocate_ellpack.c b/src/C-interface/ellpack/bml_allocate_ellpack.c
index abb80e3aa..b5f2c112b 100644
--- a/src/C-interface/ellpack/bml_allocate_ellpack.c
+++ b/src/C-interface/ellpack/bml_allocate_ellpack.c
@@ -17,24 +17,24 @@ void
 bml_deallocate_ellpack(
     bml_matrix_ellpack_t * A)
 {
-#ifdef USE_OMP_OFFLOAD
-    int N = A->N;
-    int M = A->M;
-
-    int *A_nnz = A->nnz;
-    int *A_index = A->index;
-    // JAMAL: need to make a typed deallocator
-    double *A_value = A->value;
-
-#pragma omp target exit data map(delete: A_nnz[:N], A_index[:N*M], A_value[:N*M])
-#endif
-
-    bml_deallocate_domain(A->domain);
-    bml_deallocate_domain(A->domain2);
-    bml_free_memory(A->value);
-    bml_free_memory(A->index);
-    bml_free_memory(A->nnz);
-    bml_free_memory(A);
+    switch (A->matrix_precision)
+    {
+        case single_real:
+            bml_deallocate_ellpack_single_real(A);
+            break;
+        case double_real:
+            bml_deallocate_ellpack_double_real(A);
+            break;
+        case single_complex:
+            bml_deallocate_ellpack_single_complex(A);
+            break;
+        case double_complex:
+            bml_deallocate_ellpack_double_complex(A);
+            break;
+        default:
+            LOG_ERROR("unknown precision\n");
+            break;
+    }
 }
 
 /** Clear a matrix.
diff --git a/src/C-interface/ellpack/bml_allocate_ellpack.h b/src/C-interface/ellpack/bml_allocate_ellpack.h
index 5b7f80df9..9068e8bc7 100644
--- a/src/C-interface/ellpack/bml_allocate_ellpack.h
+++ b/src/C-interface/ellpack/bml_allocate_ellpack.h
@@ -6,6 +6,18 @@
 void bml_deallocate_ellpack(
     bml_matrix_ellpack_t * A);
 
+void bml_deallocate_ellpack_single_real(
+    bml_matrix_ellpack_t * A);
+
+void bml_deallocate_ellpack_double_real(
+    bml_matrix_ellpack_t * A);
+
+void bml_deallocate_ellpack_single_complex(
+    bml_matrix_ellpack_t * A);
+
+void bml_deallocate_ellpack_double_complex(
+    bml_matrix_ellpack_t * A);
+
 void bml_clear_ellpack(
     bml_matrix_ellpack_t * A);
 
diff --git a/src/C-interface/ellpack/bml_allocate_ellpack_typed.c b/src/C-interface/ellpack/bml_allocate_ellpack_typed.c
index 5d0cc67a6..c7ffa8f7f 100644
--- a/src/C-interface/ellpack/bml_allocate_ellpack_typed.c
+++ b/src/C-interface/ellpack/bml_allocate_ellpack_typed.c
@@ -15,6 +15,35 @@
 #include <omp.h>
 #endif
 
+/** Deallocate a matrix.
+ *
+ * \ingroup allocate_group
+ *
+ * \param A The matrix.
+ */
+void TYPED_FUNC(
+    bml_deallocate_ellpack) (
+    bml_matrix_ellpack_t * A)
+{
+#ifdef USE_OMP_OFFLOAD
+    int N = A->N;
+    int M = A->M;
+
+    int *A_nnz = A->nnz;
+    int *A_index = A->index;
+    REAL_T *A_value = A->value;
+
+#pragma omp target exit data map(delete: A_nnz[:N], A_index[:N*M], A_value[:N*M])
+#endif
+
+    bml_deallocate_domain(A->domain);
+    bml_deallocate_domain(A->domain2);
+    bml_free_memory(A->value);
+    bml_free_memory(A->index);
+    bml_free_memory(A->nnz);
+    bml_free_memory(A);
+}
+
 /** Clear a matrix.
  *
  * Numbers of non-zeroes, indeces, and values are set to zero.
diff --git a/src/C-interface/ellpack/bml_multiply_ellpack_typed.c b/src/C-interface/ellpack/bml_multiply_ellpack_typed.c
index b264410a0..9fc4873a0 100644
--- a/src/C-interface/ellpack/bml_multiply_ellpack_typed.c
+++ b/src/C-interface/ellpack/bml_multiply_ellpack_typed.c
@@ -128,7 +128,7 @@ void *TYPED_FUNC(
     int rowMin = X_localRowMin[myRank];
     int rowMax = X_localRowMax[myRank];
 
-#if !(defined(__IBMC__) || defined(__ibmxl__) || (defined(USE_OMP_OFFLOAD) && defined(INTEL_SDK)))
+#if !(defined(__IBMC__) || defined(__ibmxl__) || (defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))))
     int ix[X_N], jx[X_N];
     REAL_T x[X_N];
 
@@ -137,7 +137,7 @@ void *TYPED_FUNC(
     memset(x, 0.0, X_N * sizeof(REAL_T));
 #endif
 
-#if defined(USE_OMP_OFFLOAD) && defined(INTEL_SDK)
+#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))
     int num_chunks = MIN(OFFLOAD_NUM_CHUNKS, rowMax - rowMin + 1);
 
     int all_ix[X_N * num_chunks], all_jx[X_N * num_chunks];
@@ -152,7 +152,7 @@ void *TYPED_FUNC(
 #endif
 
 #if defined (USE_OMP_OFFLOAD)
-#if defined(INTEL_SDK)
+#if defined(INTEL_SDK) || defined(CRAY_SDK)
 #pragma omp teams distribute parallel for	\
     shared(X_N, X_M, X_index, X_nnz, X_value)  \
     shared(X2_N, X2_M, X2_index, X2_nnz, X2_value)     \
@@ -195,7 +195,7 @@ void *TYPED_FUNC(
 #endif
 #endif
 
-#if defined(USE_OMP_OFFLOAD) && defined(INTEL_SDK)
+#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))
     for (int i = rowMin + chunk; i < rowMax; i = i + num_chunks)
 #else
     for (int i = rowMin; i < rowMax; i++)
@@ -277,7 +277,7 @@ void *TYPED_FUNC(
         X2_nnz[i] = ll;
     }
 
-#if defined(USE_OMP_OFFLOAD) && defined(INTEL_SDK)
+#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))
 }
 #endif
 
@@ -330,7 +330,7 @@ void TYPED_FUNC(
     int rowMin = A_localRowMin[myRank];
     int rowMax = A_localRowMax[myRank];
 
-#if !(defined(__IBMC__) || defined(__ibmxl__) || (defined(USE_OMP_OFFLOAD) && defined(INTEL_SDK)))
+#if !(defined(__IBMC__) || defined(__ibmxl__) || (defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))))
     int ix[C->N], jx[C->N];
     REAL_T x[C->N];
 
@@ -339,7 +339,7 @@ void TYPED_FUNC(
     memset(x, 0.0, C->N * sizeof(REAL_T));
 #endif
 
-#if defined(USE_OMP_OFFLOAD) && defined(INTEL_SDK)
+#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))
     int num_chunks = MIN(OFFLOAD_NUM_CHUNKS, rowMax - rowMin + 1);
 
     int all_ix[C_N * num_chunks], all_jx[C_N * num_chunks];
@@ -354,7 +354,7 @@ void TYPED_FUNC(
 #endif
 
 #if defined (USE_OMP_OFFLOAD)
-#if defined(INTEL_SDK)
+#if defined(INTEL_SDK) || defined(CRAY_SDK)
 #pragma omp teams distribute parallel for \
     shared(A_N, A_M, A_nnz, A_index, A_value)  \
     shared(A_localRowMin, A_localRowMax)       \
@@ -394,7 +394,7 @@ void TYPED_FUNC(
 #endif
 #endif
     //for (int i = 0; i < A_N; i++)
-#if defined(USE_OMP_OFFLOAD) && defined(INTEL_SDK)
+#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))
     for (int i = rowMin + chunk; i < rowMax; i = i + num_chunks)
 #else
     for (int i = rowMin; i < rowMax; i++)
@@ -460,7 +460,7 @@ void TYPED_FUNC(
         }
         C_nnz[i] = ll;
     }
-#if defined(USE_OMP_OFFLOAD) && defined(INTEL_SDK)
+#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))
 }
 #endif
 }