diff --git a/scripts/build_spock_offload_cce.sh b/scripts/build_spock_offload_cce.sh new file mode 100644 index 000000000..c8f6c5321 --- /dev/null +++ b/scripts/build_spock_offload_cce.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +# Make sure all the paths are correct + +rm -r build +rm -r install + +MY_PATH=$(pwd) + +export CC=${CC:=cc} +export FC=${FC:=ftn} +export CXX=${CXX:=CC} +export BLAS_VENDOR=${BLAS_VENDOR:=Auto} +export BML_OPENMP=${BML_OPENMP:=yes} +export INSTALL_DIR=${INSTALL_DIR:="${MY_PATH}/install"} +export BML_MAGMA=${BML_MAGMA:=yes} +export MAGMA_ROOT=${MAGMA_HOME} +export BML_TESTING=${BML_TESTING:=yes} +export CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:=Release} +export CMAKE_Fortran_FLAGS=${CMAKE_Fortran_FLAGS:="-ef -DCRAY_SDK"} +#export EXTRA_CFLAGS=${EXTRA_CFLAGS:="-Ofast -DUSE_OMP_OFFLOAD -DCRAY_SDK"} +export CMAKE_C_FLAGS=${CMAKE_C_FLAGS:="-Ofast -DUSE_OMP_OFFLOAD -DCRAY_SDK"} +export EXTRA_LINK_FLAGS=${EXTRA_LINK_FLAGS:="-L${LIBSCI_BASE_DIR}/cray/9.0/x86_64/lib"} + +./build.sh configure + +pushd build +make -j8 +make install +popd diff --git a/scripts/build_tulip_offload_cce.sh b/scripts/build_tulip_offload_cce.sh new file mode 100755 index 000000000..a06f06d50 --- /dev/null +++ b/scripts/build_tulip_offload_cce.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +# Make sure all the paths are correct + +rm -r build +rm -r install + +MY_PATH=$(pwd) + +export CC=${CC:=cc} +export FC=${FC:=ftn} +export CXX=${CXX:=CC} +export BLAS_VENDOR=${BLAS_VENDOR:=Auto} +export BML_OPENMP=${BML_OPENMP:=yes} +export INSTALL_DIR=${INSTALL_DIR:="${MY_PATH}/install"} +export BML_MAGMA=${BML_MAGMA:=yes} +export MAGMA_ROOT=${MAGMA_HOME} +export BML_TESTING=${BML_TESTING:=yes} +export CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:=Release} +export CMAKE_Fortran_FLAGS=${CMAKE_Fortran_FLAGS:="-ef -DCRAY_SDK"} +#export EXTRA_CFLAGS=${EXTRA_CFLAGS:="-Ofast -DUSE_OMP_OFFLOAD -DCRAY_SDK"} +export CMAKE_C_FLAGS=${CMAKE_C_FLAGS:="-Ofast -DUSE_OMP_OFFLOAD -DCRAY_SDK"} +export EXTRA_LINK_FLAGS=${EXTRA_LINK_FLAGS:="-L${BLASDIR} -l${BLASLIB}"} + +./build.sh configure + +pushd build +make -j8 +make install +popd diff --git a/scripts/setenv_spock_offload.sh b/scripts/setenv_spock_offload.sh new file mode 100644 index 000000000..66c076a51 --- /dev/null +++ b/scripts/setenv_spock_offload.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +module load craype-accel-amd-gfx908 +#module unload cray-mvapich2 cray-libsci +#module use /home/groups/coegroup/share/coe/modulefiles +module load rocm/4.1.0 +#module swap cce cce/11.0.4 +#module load ompi/4.1.0/cce/11.0.4/rocm/4.1.1 +#module load blas +#module load rocm/4.2.0 +module load cmake +export LD_LIBRARY_PATH="$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH" + diff --git a/scripts/setenv_tulip_offload.sh b/scripts/setenv_tulip_offload.sh new file mode 100644 index 000000000..8405ad6d6 --- /dev/null +++ b/scripts/setenv_tulip_offload.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +module load craype-accel-amd-gfx908 +module unload cray-mvapich2 cray-libsci +module use /home/groups/coegroup/share/coe/modulefiles +module load rocm/4.1.1 +module swap cce cce/11.0.4 +module load ompi/4.1.0/cce/11.0.4/rocm/4.1.1 +module load blas +#module load rocm/4.2.0 +module load cmake +export LD_LIBRARY_PATH="$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH" + diff --git a/src/C-interface/ellpack/bml_add_ellpack_typed.c b/src/C-interface/ellpack/bml_add_ellpack_typed.c index 8766f4ee8..41ecf5c4c 100644 --- a/src/C-interface/ellpack/bml_add_ellpack_typed.c +++ b/src/C-interface/ellpack/bml_add_ellpack_typed.c @@ -57,7 +57,7 @@ void TYPED_FUNC( int rowMin = A_localRowMin[myRank]; int rowMax = A_localRowMax[myRank]; -#if !(defined(__IBMC__) || defined(__ibmxl__) || (defined(USE_OMP_OFFLOAD) && defined(INTEL_SDK))) +#if !(defined(__IBMC__) || defined(__ibmxl__) || (defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK)))) int ix[N], jx[N]; REAL_T x[N]; @@ -66,7 +66,7 @@ void TYPED_FUNC( memset(x, 0.0, N * sizeof(REAL_T)); #endif -#if defined(USE_OMP_OFFLOAD) && defined(INTEL_SDK) +#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK)) int num_chunks = MIN(OFFLOAD_NUM_CHUNKS, rowMax - rowMin + 1); int all_ix[N * num_chunks], all_jx[N * num_chunks]; @@ -81,7 +81,7 @@ void TYPED_FUNC( #endif #if defined (USE_OMP_OFFLOAD) -#if defined(INTEL_SDK) +#if defined(INTEL_SDK) || defined(CRAY_SDK) #pragma omp teams distribute parallel for \ shared(rowMin, rowMax) \ shared(A_index, A_value, A_nnz) \ @@ -116,7 +116,7 @@ void TYPED_FUNC( firstprivate(ix, jx, x) #endif #endif -#if defined(USE_OMP_OFFLOAD) && defined(INTEL_SDK) +#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK)) for (int i = rowMin + chunk; i < rowMax; i = i + num_chunks) #else for (int i = rowMin; i < rowMax; i++) @@ -175,7 +175,7 @@ void TYPED_FUNC( } A_nnz[i] = ll; } -#if defined(USE_OMP_OFFLOAD) && defined(INTEL_SDK) +#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK)) } #endif } @@ -223,7 +223,7 @@ double TYPED_FUNC( int rowMin = A_localRowMin[myRank]; int rowMax = A_localRowMax[myRank]; -#if !(defined(__IBMC__) || defined(__ibmxl__)) +#if !(defined(__IBMC__) || defined(__ibmxl__) || (defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK)))) int ix[N], jx[N]; REAL_T x[N]; REAL_T y[N]; @@ -234,13 +234,47 @@ double TYPED_FUNC( memset(y, 0.0, N * sizeof(REAL_T)); #endif +#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK)) + int num_chunks = MIN(OFFLOAD_NUM_CHUNKS, rowMax - rowMin + 1); + + int all_ix[N * num_chunks], all_jx[N * num_chunks]; + REAL_T all_x[N * num_chunks], all_y[N * num_chunks]; + + memset(all_ix, 0, N * num_chunks * sizeof(int)); + memset(all_jx, 0, N * num_chunks * sizeof(int)); + memset(all_x, 0.0, N * num_chunks * sizeof(REAL_T)); + memset(all_y, 0.0, N * num_chunks * sizeof(REAL_T)); + +#pragma omp target map(to:all_ix[0:N*num_chunks],all_jx[0:N*num_chunks],all_x[0:N*num_chunks],all_y[0:N*num_chunks]) map(tofrom:trnorm) + +#endif + #if defined (USE_OMP_OFFLOAD) -#pragma omp target teams distribute parallel for \ +#if defined(INTEL_SDK) || defined(CRAY_SDK) +#pragma omp teams distribute parallel for \ + shared(rowMin, rowMax) \ + shared(A_index, A_value, A_nnz) \ + shared(B_index, B_value, B_nnz) \ + reduction(+:trnorm) + for (int chunk = 0; chunk < num_chunks; chunk++) + { + int *ix, *jx; + REAL_T *x, *y; + + ix = &all_ix[chunk * N]; + jx = &all_jx[chunk * N]; + x = &all_x[chunk * N]; + y = &all_y[chunk * N]; + +#else + +#pragma omp target teams distribute parallel for \ shared(rowMin, rowMax) \ shared(A_index, A_value, A_nnz) \ shared(B_index, B_value, B_nnz) \ firstprivate(ix, jx, x, y) \ reduction(+:trnorm) +#endif #else #if defined(__IBMC__) || defined(__ibmxl__) #pragma omp parallel for \ @@ -257,7 +291,11 @@ double TYPED_FUNC( reduction(+:trnorm) #endif #endif +#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK)) + for (int i = rowMin + chunk; i < rowMax; i = i + num_chunks) +#else for (int i = rowMin; i < rowMax; i++) +#endif { #if defined(__IBMC__) || defined(__ibmxl__) @@ -321,8 +359,11 @@ double TYPED_FUNC( } A_nnz[i] = ll; } +#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK)) +} +#endif - return trnorm; +return trnorm; } /** Matrix addition. @@ -348,7 +389,7 @@ void TYPED_FUNC( int *A_index = A->index; REAL_T *A_value = (REAL_T *) A->value; -#if !(defined(__IBMC__) || defined(__ibmxl__)) +#if !(defined(__IBMC__) || defined(__ibmxl__) || (defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK)))) int jx[A_M]; REAL_T x[A_M]; @@ -356,11 +397,38 @@ void TYPED_FUNC( memset(x, 0.0, A_M * sizeof(REAL_T)); #endif +#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK)) + int num_chunks = MIN(OFFLOAD_NUM_CHUNKS, N); + + int all_jx[N * num_chunks]; + REAL_T all_x[N * num_chunks]; + + memset(all_jx, 0, N * num_chunks * sizeof(int)); + memset(all_x, 0.0, N * num_chunks * sizeof(REAL_T)); + +#pragma omp target map(to:all_jx[0:N*num_chunks],all_x[0:N*num_chunks]) + +#endif + #if defined (USE_OMP_OFFLOAD) +#if defined(INTEL_SDK) || defined(CRAY_SDK) +#pragma omp teams distribute parallel for \ + shared(N, A_M) \ + shared(A_index, A_value, A_nnz) + for (int chunk = 0; chunk < num_chunks; chunk++) + { + int *jx; + REAL_T *x; + + jx = &all_jx[chunk * N]; + x = &all_x[chunk * N]; + +#else #pragma omp target teams distribute parallel for \ shared(N, A_M) \ shared(A_index, A_value, A_nnz) \ firstprivate(jx, x) +#endif #else #if defined(__IBMC__) || defined(__ibmxl__) #pragma omp parallel for \ @@ -373,7 +441,11 @@ void TYPED_FUNC( firstprivate(jx, x) #endif #endif +#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK)) + for (int i = chunk; i < N; i = i + num_chunks) +#else for (int i = 0; i < N; i++) +#endif { #if defined(__IBMC__) || defined(__ibmxl__) @@ -423,6 +495,9 @@ void TYPED_FUNC( } A_nnz[i] = ll; } +#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK)) +} +#endif } /** Matrix addition. diff --git a/src/C-interface/ellpack/bml_allocate_ellpack.c b/src/C-interface/ellpack/bml_allocate_ellpack.c index abb80e3aa..b5f2c112b 100644 --- a/src/C-interface/ellpack/bml_allocate_ellpack.c +++ b/src/C-interface/ellpack/bml_allocate_ellpack.c @@ -17,24 +17,24 @@ void bml_deallocate_ellpack( bml_matrix_ellpack_t * A) { -#ifdef USE_OMP_OFFLOAD - int N = A->N; - int M = A->M; - - int *A_nnz = A->nnz; - int *A_index = A->index; - // JAMAL: need to make a typed deallocator - double *A_value = A->value; - -#pragma omp target exit data map(delete: A_nnz[:N], A_index[:N*M], A_value[:N*M]) -#endif - - bml_deallocate_domain(A->domain); - bml_deallocate_domain(A->domain2); - bml_free_memory(A->value); - bml_free_memory(A->index); - bml_free_memory(A->nnz); - bml_free_memory(A); + switch (A->matrix_precision) + { + case single_real: + bml_deallocate_ellpack_single_real(A); + break; + case double_real: + bml_deallocate_ellpack_double_real(A); + break; + case single_complex: + bml_deallocate_ellpack_single_complex(A); + break; + case double_complex: + bml_deallocate_ellpack_double_complex(A); + break; + default: + LOG_ERROR("unknown precision\n"); + break; + } } /** Clear a matrix. diff --git a/src/C-interface/ellpack/bml_allocate_ellpack.h b/src/C-interface/ellpack/bml_allocate_ellpack.h index 5b7f80df9..9068e8bc7 100644 --- a/src/C-interface/ellpack/bml_allocate_ellpack.h +++ b/src/C-interface/ellpack/bml_allocate_ellpack.h @@ -6,6 +6,18 @@ void bml_deallocate_ellpack( bml_matrix_ellpack_t * A); +void bml_deallocate_ellpack_single_real( + bml_matrix_ellpack_t * A); + +void bml_deallocate_ellpack_double_real( + bml_matrix_ellpack_t * A); + +void bml_deallocate_ellpack_single_complex( + bml_matrix_ellpack_t * A); + +void bml_deallocate_ellpack_double_complex( + bml_matrix_ellpack_t * A); + void bml_clear_ellpack( bml_matrix_ellpack_t * A); diff --git a/src/C-interface/ellpack/bml_allocate_ellpack_typed.c b/src/C-interface/ellpack/bml_allocate_ellpack_typed.c index 5d0cc67a6..c7ffa8f7f 100644 --- a/src/C-interface/ellpack/bml_allocate_ellpack_typed.c +++ b/src/C-interface/ellpack/bml_allocate_ellpack_typed.c @@ -15,6 +15,35 @@ #include #endif +/** Deallocate a matrix. + * + * \ingroup allocate_group + * + * \param A The matrix. + */ +void TYPED_FUNC( + bml_deallocate_ellpack) ( + bml_matrix_ellpack_t * A) +{ +#ifdef USE_OMP_OFFLOAD + int N = A->N; + int M = A->M; + + int *A_nnz = A->nnz; + int *A_index = A->index; + REAL_T *A_value = A->value; + +#pragma omp target exit data map(delete: A_nnz[:N], A_index[:N*M], A_value[:N*M]) +#endif + + bml_deallocate_domain(A->domain); + bml_deallocate_domain(A->domain2); + bml_free_memory(A->value); + bml_free_memory(A->index); + bml_free_memory(A->nnz); + bml_free_memory(A); +} + /** Clear a matrix. * * Numbers of non-zeroes, indeces, and values are set to zero. diff --git a/src/C-interface/ellpack/bml_multiply_ellpack_typed.c b/src/C-interface/ellpack/bml_multiply_ellpack_typed.c index b264410a0..9fc4873a0 100644 --- a/src/C-interface/ellpack/bml_multiply_ellpack_typed.c +++ b/src/C-interface/ellpack/bml_multiply_ellpack_typed.c @@ -128,7 +128,7 @@ void *TYPED_FUNC( int rowMin = X_localRowMin[myRank]; int rowMax = X_localRowMax[myRank]; -#if !(defined(__IBMC__) || defined(__ibmxl__) || (defined(USE_OMP_OFFLOAD) && defined(INTEL_SDK))) +#if !(defined(__IBMC__) || defined(__ibmxl__) || (defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK)))) int ix[X_N], jx[X_N]; REAL_T x[X_N]; @@ -137,7 +137,7 @@ void *TYPED_FUNC( memset(x, 0.0, X_N * sizeof(REAL_T)); #endif -#if defined(USE_OMP_OFFLOAD) && defined(INTEL_SDK) +#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK)) int num_chunks = MIN(OFFLOAD_NUM_CHUNKS, rowMax - rowMin + 1); int all_ix[X_N * num_chunks], all_jx[X_N * num_chunks]; @@ -152,7 +152,7 @@ void *TYPED_FUNC( #endif #if defined (USE_OMP_OFFLOAD) -#if defined(INTEL_SDK) +#if defined(INTEL_SDK) || defined(CRAY_SDK) #pragma omp teams distribute parallel for \ shared(X_N, X_M, X_index, X_nnz, X_value) \ shared(X2_N, X2_M, X2_index, X2_nnz, X2_value) \ @@ -195,7 +195,7 @@ void *TYPED_FUNC( #endif #endif -#if defined(USE_OMP_OFFLOAD) && defined(INTEL_SDK) +#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK)) for (int i = rowMin + chunk; i < rowMax; i = i + num_chunks) #else for (int i = rowMin; i < rowMax; i++) @@ -277,7 +277,7 @@ void *TYPED_FUNC( X2_nnz[i] = ll; } -#if defined(USE_OMP_OFFLOAD) && defined(INTEL_SDK) +#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK)) } #endif @@ -330,7 +330,7 @@ void TYPED_FUNC( int rowMin = A_localRowMin[myRank]; int rowMax = A_localRowMax[myRank]; -#if !(defined(__IBMC__) || defined(__ibmxl__) || (defined(USE_OMP_OFFLOAD) && defined(INTEL_SDK))) +#if !(defined(__IBMC__) || defined(__ibmxl__) || (defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK)))) int ix[C->N], jx[C->N]; REAL_T x[C->N]; @@ -339,7 +339,7 @@ void TYPED_FUNC( memset(x, 0.0, C->N * sizeof(REAL_T)); #endif -#if defined(USE_OMP_OFFLOAD) && defined(INTEL_SDK) +#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK)) int num_chunks = MIN(OFFLOAD_NUM_CHUNKS, rowMax - rowMin + 1); int all_ix[C_N * num_chunks], all_jx[C_N * num_chunks]; @@ -354,7 +354,7 @@ void TYPED_FUNC( #endif #if defined (USE_OMP_OFFLOAD) -#if defined(INTEL_SDK) +#if defined(INTEL_SDK) || defined(CRAY_SDK) #pragma omp teams distribute parallel for \ shared(A_N, A_M, A_nnz, A_index, A_value) \ shared(A_localRowMin, A_localRowMax) \ @@ -394,7 +394,7 @@ void TYPED_FUNC( #endif #endif //for (int i = 0; i < A_N; i++) -#if defined(USE_OMP_OFFLOAD) && defined(INTEL_SDK) +#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK)) for (int i = rowMin + chunk; i < rowMax; i = i + num_chunks) #else for (int i = rowMin; i < rowMax; i++) @@ -460,7 +460,7 @@ void TYPED_FUNC( } C_nnz[i] = ll; } -#if defined(USE_OMP_OFFLOAD) && defined(INTEL_SDK) +#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK)) } #endif }