Skip to content

Commit

Permalink
Modifications to support additional offload builds
Browse files Browse the repository at this point in the history
Revised the code to build on ORNL spock and Cray/HPE tulip:

o Use the same offload code path for CRAY_SDK macro as for INTEL_SDK macro
o Modify bml_add_norm_ellpack() and bml_add_identity_ellpack()
to follow similar syntax as in bml_add_ellpack()
  - Note: Tests needed for bml_add_norm_ellpack() and bml_add_identity_ellpack()
o Create a typed bml_deallocate_ellpack() (see issue #524)
o Add scripts for building on spock
  - scripts/setenv_spock_offload.sh
  - scripts/build_spock_offload_cce.sh
o Add scripts for building on tulip
  - scripts/setenv_tulip_offload.sh
  - scripts/build_tulip_offload_cce.sh
  • Loading branch information
mewall committed Jul 13, 2021
1 parent 012fa6a commit 95ace50
Show file tree
Hide file tree
Showing 9 changed files with 239 additions and 37 deletions.
30 changes: 30 additions & 0 deletions scripts/build_spock_offload_cce.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/bin/bash

# Make sure all the paths are correct

rm -r build
rm -r install

MY_PATH=$(pwd)

export CC=${CC:=cc}
export FC=${FC:=ftn}
export CXX=${CXX:=CC}
export BLAS_VENDOR=${BLAS_VENDOR:=Auto}
export BML_OPENMP=${BML_OPENMP:=yes}
export INSTALL_DIR=${INSTALL_DIR:="${MY_PATH}/install"}
export BML_MAGMA=${BML_MAGMA:=yes}
export MAGMA_ROOT=${MAGMA_HOME}
export BML_TESTING=${BML_TESTING:=yes}
export CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:=Release}
export CMAKE_Fortran_FLAGS=${CMAKE_Fortran_FLAGS:="-ef -DCRAY_SDK"}
#export EXTRA_CFLAGS=${EXTRA_CFLAGS:="-Ofast -DUSE_OMP_OFFLOAD -DCRAY_SDK"}
export CMAKE_C_FLAGS=${CMAKE_C_FLAGS:="-Ofast -DUSE_OMP_OFFLOAD -DCRAY_SDK"}
export EXTRA_LINK_FLAGS=${EXTRA_LINK_FLAGS:="-L${LIBSCI_BASE_DIR}/cray/9.0/x86_64/lib"}

./build.sh configure

pushd build
make -j8
make install
popd
30 changes: 30 additions & 0 deletions scripts/build_tulip_offload_cce.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/bin/bash

# Make sure all the paths are correct

rm -r build
rm -r install

MY_PATH=$(pwd)

export CC=${CC:=cc}
export FC=${FC:=ftn}
export CXX=${CXX:=CC}
export BLAS_VENDOR=${BLAS_VENDOR:=Auto}
export BML_OPENMP=${BML_OPENMP:=yes}
export INSTALL_DIR=${INSTALL_DIR:="${MY_PATH}/install"}
export BML_MAGMA=${BML_MAGMA:=yes}
export MAGMA_ROOT=${MAGMA_HOME}
export BML_TESTING=${BML_TESTING:=yes}
export CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:=Release}
export CMAKE_Fortran_FLAGS=${CMAKE_Fortran_FLAGS:="-ef -DCRAY_SDK"}
#export EXTRA_CFLAGS=${EXTRA_CFLAGS:="-Ofast -DUSE_OMP_OFFLOAD -DCRAY_SDK"}
export CMAKE_C_FLAGS=${CMAKE_C_FLAGS:="-Ofast -DUSE_OMP_OFFLOAD -DCRAY_SDK"}
export EXTRA_LINK_FLAGS=${EXTRA_LINK_FLAGS:="-L${BLASDIR} -l${BLASLIB}"}

./build.sh configure

pushd build
make -j8
make install
popd
13 changes: 13 additions & 0 deletions scripts/setenv_spock_offload.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash

module load craype-accel-amd-gfx908
#module unload cray-mvapich2 cray-libsci
#module use /home/groups/coegroup/share/coe/modulefiles
module load rocm/4.1.0
#module swap cce cce/11.0.4
#module load ompi/4.1.0/cce/11.0.4/rocm/4.1.1
#module load blas
#module load rocm/4.2.0
module load cmake
export LD_LIBRARY_PATH="$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH"

13 changes: 13 additions & 0 deletions scripts/setenv_tulip_offload.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash

module load craype-accel-amd-gfx908
module unload cray-mvapich2 cray-libsci
module use /home/groups/coegroup/share/coe/modulefiles
module load rocm/4.1.1
module swap cce cce/11.0.4
module load ompi/4.1.0/cce/11.0.4/rocm/4.1.1
module load blas
#module load rocm/4.2.0
module load cmake
export LD_LIBRARY_PATH="$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH"

93 changes: 84 additions & 9 deletions src/C-interface/ellpack/bml_add_ellpack_typed.c
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ void TYPED_FUNC(
int rowMin = A_localRowMin[myRank];
int rowMax = A_localRowMax[myRank];

#if !(defined(__IBMC__) || defined(__ibmxl__) || (defined(USE_OMP_OFFLOAD) && defined(INTEL_SDK)))
#if !(defined(__IBMC__) || defined(__ibmxl__) || (defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))))
int ix[N], jx[N];
REAL_T x[N];

Expand All @@ -66,7 +66,7 @@ void TYPED_FUNC(
memset(x, 0.0, N * sizeof(REAL_T));
#endif

#if defined(USE_OMP_OFFLOAD) && defined(INTEL_SDK)
#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))
int num_chunks = MIN(OFFLOAD_NUM_CHUNKS, rowMax - rowMin + 1);

int all_ix[N * num_chunks], all_jx[N * num_chunks];
Expand All @@ -81,7 +81,7 @@ void TYPED_FUNC(
#endif

#if defined (USE_OMP_OFFLOAD)
#if defined(INTEL_SDK)
#if defined(INTEL_SDK) || defined(CRAY_SDK)
#pragma omp teams distribute parallel for \
shared(rowMin, rowMax) \
shared(A_index, A_value, A_nnz) \
Expand Down Expand Up @@ -116,7 +116,7 @@ void TYPED_FUNC(
firstprivate(ix, jx, x)
#endif
#endif
#if defined(USE_OMP_OFFLOAD) && defined(INTEL_SDK)
#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))
for (int i = rowMin + chunk; i < rowMax; i = i + num_chunks)
#else
for (int i = rowMin; i < rowMax; i++)
Expand Down Expand Up @@ -175,7 +175,7 @@ void TYPED_FUNC(
}
A_nnz[i] = ll;
}
#if defined(USE_OMP_OFFLOAD) && defined(INTEL_SDK)
#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))
}
#endif
}
Expand Down Expand Up @@ -223,7 +223,7 @@ double TYPED_FUNC(
int rowMin = A_localRowMin[myRank];
int rowMax = A_localRowMax[myRank];

#if !(defined(__IBMC__) || defined(__ibmxl__))
#if !(defined(__IBMC__) || defined(__ibmxl__) || (defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))))
int ix[N], jx[N];
REAL_T x[N];
REAL_T y[N];
Expand All @@ -234,13 +234,47 @@ double TYPED_FUNC(
memset(y, 0.0, N * sizeof(REAL_T));
#endif

#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))
int num_chunks = MIN(OFFLOAD_NUM_CHUNKS, rowMax - rowMin + 1);

int all_ix[N * num_chunks], all_jx[N * num_chunks];
REAL_T all_x[N * num_chunks], all_y[N * num_chunks];

memset(all_ix, 0, N * num_chunks * sizeof(int));
memset(all_jx, 0, N * num_chunks * sizeof(int));
memset(all_x, 0.0, N * num_chunks * sizeof(REAL_T));
memset(all_y, 0.0, N * num_chunks * sizeof(REAL_T));

#pragma omp target map(to:all_ix[0:N*num_chunks],all_jx[0:N*num_chunks],all_x[0:N*num_chunks],all_y[0:N*num_chunks]) map(tofrom:trnorm)

#endif

#if defined (USE_OMP_OFFLOAD)
#pragma omp target teams distribute parallel for \
#if defined(INTEL_SDK) || defined(CRAY_SDK)
#pragma omp teams distribute parallel for \
shared(rowMin, rowMax) \
shared(A_index, A_value, A_nnz) \
shared(B_index, B_value, B_nnz) \
reduction(+:trnorm)
for (int chunk = 0; chunk < num_chunks; chunk++)
{
int *ix, *jx;
REAL_T *x, *y;

ix = &all_ix[chunk * N];
jx = &all_jx[chunk * N];
x = &all_x[chunk * N];
y = &all_y[chunk * N];

#else

#pragma omp target teams distribute parallel for \
shared(rowMin, rowMax) \
shared(A_index, A_value, A_nnz) \
shared(B_index, B_value, B_nnz) \
firstprivate(ix, jx, x, y) \
reduction(+:trnorm)
#endif
#else
#if defined(__IBMC__) || defined(__ibmxl__)
#pragma omp parallel for \
Expand All @@ -257,7 +291,11 @@ double TYPED_FUNC(
reduction(+:trnorm)
#endif
#endif
#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))
for (int i = rowMin + chunk; i < rowMax; i = i + num_chunks)
#else
for (int i = rowMin; i < rowMax; i++)
#endif
{

#if defined(__IBMC__) || defined(__ibmxl__)
Expand Down Expand Up @@ -321,8 +359,11 @@ double TYPED_FUNC(
}
A_nnz[i] = ll;
}
#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))
}
#endif

return trnorm;
return trnorm;
}

/** Matrix addition.
Expand All @@ -348,19 +389,46 @@ void TYPED_FUNC(
int *A_index = A->index;
REAL_T *A_value = (REAL_T *) A->value;

#if !(defined(__IBMC__) || defined(__ibmxl__))
#if !(defined(__IBMC__) || defined(__ibmxl__) || (defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))))
int jx[A_M];
REAL_T x[A_M];

memset(jx, 0, A_M * sizeof(int));
memset(x, 0.0, A_M * sizeof(REAL_T));
#endif

#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))
int num_chunks = MIN(OFFLOAD_NUM_CHUNKS, N);

int all_jx[N * num_chunks];
REAL_T all_x[N * num_chunks];

memset(all_jx, 0, N * num_chunks * sizeof(int));
memset(all_x, 0.0, N * num_chunks * sizeof(REAL_T));

#pragma omp target map(to:all_jx[0:N*num_chunks],all_x[0:N*num_chunks])

#endif

#if defined (USE_OMP_OFFLOAD)
#if defined(INTEL_SDK) || defined(CRAY_SDK)
#pragma omp teams distribute parallel for \
shared(N, A_M) \
shared(A_index, A_value, A_nnz)
for (int chunk = 0; chunk < num_chunks; chunk++)
{
int *jx;
REAL_T *x;

jx = &all_jx[chunk * N];
x = &all_x[chunk * N];

#else
#pragma omp target teams distribute parallel for \
shared(N, A_M) \
shared(A_index, A_value, A_nnz) \
firstprivate(jx, x)
#endif
#else
#if defined(__IBMC__) || defined(__ibmxl__)
#pragma omp parallel for \
Expand All @@ -373,7 +441,11 @@ void TYPED_FUNC(
firstprivate(jx, x)
#endif
#endif
#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))
for (int i = chunk; i < N; i = i + num_chunks)
#else
for (int i = 0; i < N; i++)
#endif
{

#if defined(__IBMC__) || defined(__ibmxl__)
Expand Down Expand Up @@ -423,6 +495,9 @@ void TYPED_FUNC(
}
A_nnz[i] = ll;
}
#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))
}
#endif
}

/** Matrix addition.
Expand Down
36 changes: 18 additions & 18 deletions src/C-interface/ellpack/bml_allocate_ellpack.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,24 +17,24 @@ void
bml_deallocate_ellpack(
bml_matrix_ellpack_t * A)
{
#ifdef USE_OMP_OFFLOAD
int N = A->N;
int M = A->M;

int *A_nnz = A->nnz;
int *A_index = A->index;
// JAMAL: need to make a typed deallocator
double *A_value = A->value;

#pragma omp target exit data map(delete: A_nnz[:N], A_index[:N*M], A_value[:N*M])
#endif

bml_deallocate_domain(A->domain);
bml_deallocate_domain(A->domain2);
bml_free_memory(A->value);
bml_free_memory(A->index);
bml_free_memory(A->nnz);
bml_free_memory(A);
switch (A->matrix_precision)
{
case single_real:
bml_deallocate_ellpack_single_real(A);
break;
case double_real:
bml_deallocate_ellpack_double_real(A);
break;
case single_complex:
bml_deallocate_ellpack_single_complex(A);
break;
case double_complex:
bml_deallocate_ellpack_double_complex(A);
break;
default:
LOG_ERROR("unknown precision\n");
break;
}
}

/** Clear a matrix.
Expand Down
12 changes: 12 additions & 0 deletions src/C-interface/ellpack/bml_allocate_ellpack.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,18 @@
void bml_deallocate_ellpack(
bml_matrix_ellpack_t * A);

void bml_deallocate_ellpack_single_real(
bml_matrix_ellpack_t * A);

void bml_deallocate_ellpack_double_real(
bml_matrix_ellpack_t * A);

void bml_deallocate_ellpack_single_complex(
bml_matrix_ellpack_t * A);

void bml_deallocate_ellpack_double_complex(
bml_matrix_ellpack_t * A);

void bml_clear_ellpack(
bml_matrix_ellpack_t * A);

Expand Down
29 changes: 29 additions & 0 deletions src/C-interface/ellpack/bml_allocate_ellpack_typed.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,35 @@
#include <omp.h>
#endif

/** Deallocate a matrix.
*
* \ingroup allocate_group
*
* \param A The matrix.
*/
void TYPED_FUNC(
bml_deallocate_ellpack) (
bml_matrix_ellpack_t * A)
{
#ifdef USE_OMP_OFFLOAD
int N = A->N;
int M = A->M;

int *A_nnz = A->nnz;
int *A_index = A->index;
REAL_T *A_value = A->value;

#pragma omp target exit data map(delete: A_nnz[:N], A_index[:N*M], A_value[:N*M])
#endif

bml_deallocate_domain(A->domain);
bml_deallocate_domain(A->domain2);
bml_free_memory(A->value);
bml_free_memory(A->index);
bml_free_memory(A->nnz);
bml_free_memory(A);
}

/** Clear a matrix.
*
* Numbers of non-zeroes, indeces, and values are set to zero.
Expand Down
Loading

0 comments on commit 95ace50

Please sign in to comment.