Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Modifications to support additional offload builds #527

Merged
merged 1 commit into from
Jul 15, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions scripts/build_spock_offload_cce.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/bin/bash

# Make sure all the paths are correct

rm -r build
rm -r install

MY_PATH=$(pwd)

export CC=${CC:=cc}
export FC=${FC:=ftn}
export CXX=${CXX:=CC}
export BLAS_VENDOR=${BLAS_VENDOR:=Auto}
export BML_OPENMP=${BML_OPENMP:=yes}
export INSTALL_DIR=${INSTALL_DIR:="${MY_PATH}/install"}
export BML_MAGMA=${BML_MAGMA:=yes}
export MAGMA_ROOT=${MAGMA_HOME}
export BML_TESTING=${BML_TESTING:=yes}
export CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:=Release}
export CMAKE_Fortran_FLAGS=${CMAKE_Fortran_FLAGS:="-ef -DCRAY_SDK"}
#export EXTRA_CFLAGS=${EXTRA_CFLAGS:="-Ofast -DUSE_OMP_OFFLOAD -DCRAY_SDK"}
export CMAKE_C_FLAGS=${CMAKE_C_FLAGS:="-Ofast -DUSE_OMP_OFFLOAD -DCRAY_SDK"}
export EXTRA_LINK_FLAGS=${EXTRA_LINK_FLAGS:="-L${LIBSCI_BASE_DIR}/cray/9.0/x86_64/lib"}

./build.sh configure

pushd build
make -j8
make install
popd
30 changes: 30 additions & 0 deletions scripts/build_tulip_offload_cce.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/bin/bash

# Make sure all the paths are correct

rm -r build
rm -r install

MY_PATH=$(pwd)

export CC=${CC:=cc}
export FC=${FC:=ftn}
export CXX=${CXX:=CC}
export BLAS_VENDOR=${BLAS_VENDOR:=Auto}
export BML_OPENMP=${BML_OPENMP:=yes}
export INSTALL_DIR=${INSTALL_DIR:="${MY_PATH}/install"}
export BML_MAGMA=${BML_MAGMA:=yes}
export MAGMA_ROOT=${MAGMA_HOME}
export BML_TESTING=${BML_TESTING:=yes}
export CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:=Release}
export CMAKE_Fortran_FLAGS=${CMAKE_Fortran_FLAGS:="-ef -DCRAY_SDK"}
#export EXTRA_CFLAGS=${EXTRA_CFLAGS:="-Ofast -DUSE_OMP_OFFLOAD -DCRAY_SDK"}
export CMAKE_C_FLAGS=${CMAKE_C_FLAGS:="-Ofast -DUSE_OMP_OFFLOAD -DCRAY_SDK"}
export EXTRA_LINK_FLAGS=${EXTRA_LINK_FLAGS:="-L${BLASDIR} -l${BLASLIB}"}

./build.sh configure

pushd build
make -j8
make install
popd
7 changes: 7 additions & 0 deletions scripts/setenv_spock_offload.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/bin/bash

module load craype-accel-amd-gfx908
module load rocm/4.1.0
module load cmake
export LD_LIBRARY_PATH="$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH"

13 changes: 13 additions & 0 deletions scripts/setenv_tulip_offload.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash

module load craype-accel-amd-gfx908
module unload cray-mvapich2 cray-libsci
module use /home/groups/coegroup/share/coe/modulefiles
module load rocm/4.1.1
module swap cce cce/11.0.4
module load ompi/4.1.0/cce/11.0.4/rocm/4.1.1
module load blas
#module load rocm/4.2.0
module load cmake
export LD_LIBRARY_PATH="$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH"

93 changes: 84 additions & 9 deletions src/C-interface/ellpack/bml_add_ellpack_typed.c
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ void TYPED_FUNC(
int rowMin = A_localRowMin[myRank];
int rowMax = A_localRowMax[myRank];

#if !(defined(__IBMC__) || defined(__ibmxl__) || (defined(USE_OMP_OFFLOAD) && defined(INTEL_SDK)))
#if !(defined(__IBMC__) || defined(__ibmxl__) || (defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))))
int ix[N], jx[N];
REAL_T x[N];

Expand All @@ -66,7 +66,7 @@ void TYPED_FUNC(
memset(x, 0.0, N * sizeof(REAL_T));
#endif

#if defined(USE_OMP_OFFLOAD) && defined(INTEL_SDK)
#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))
int num_chunks = MIN(OFFLOAD_NUM_CHUNKS, rowMax - rowMin + 1);

int all_ix[N * num_chunks], all_jx[N * num_chunks];
Expand All @@ -81,7 +81,7 @@ void TYPED_FUNC(
#endif

#if defined (USE_OMP_OFFLOAD)
#if defined(INTEL_SDK)
#if defined(INTEL_SDK) || defined(CRAY_SDK)
#pragma omp teams distribute parallel for \
shared(rowMin, rowMax) \
shared(A_index, A_value, A_nnz) \
Expand Down Expand Up @@ -116,7 +116,7 @@ void TYPED_FUNC(
firstprivate(ix, jx, x)
#endif
#endif
#if defined(USE_OMP_OFFLOAD) && defined(INTEL_SDK)
#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))
for (int i = rowMin + chunk; i < rowMax; i = i + num_chunks)
#else
for (int i = rowMin; i < rowMax; i++)
Expand Down Expand Up @@ -175,7 +175,7 @@ void TYPED_FUNC(
}
A_nnz[i] = ll;
}
#if defined(USE_OMP_OFFLOAD) && defined(INTEL_SDK)
#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))
}
#endif
}
Expand Down Expand Up @@ -223,7 +223,7 @@ double TYPED_FUNC(
int rowMin = A_localRowMin[myRank];
int rowMax = A_localRowMax[myRank];

#if !(defined(__IBMC__) || defined(__ibmxl__))
#if !(defined(__IBMC__) || defined(__ibmxl__) || (defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))))
int ix[N], jx[N];
REAL_T x[N];
REAL_T y[N];
Expand All @@ -234,13 +234,47 @@ double TYPED_FUNC(
memset(y, 0.0, N * sizeof(REAL_T));
#endif

#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))
int num_chunks = MIN(OFFLOAD_NUM_CHUNKS, rowMax - rowMin + 1);

int all_ix[N * num_chunks], all_jx[N * num_chunks];
REAL_T all_x[N * num_chunks], all_y[N * num_chunks];

memset(all_ix, 0, N * num_chunks * sizeof(int));
memset(all_jx, 0, N * num_chunks * sizeof(int));
memset(all_x, 0.0, N * num_chunks * sizeof(REAL_T));
memset(all_y, 0.0, N * num_chunks * sizeof(REAL_T));

#pragma omp target map(to:all_ix[0:N*num_chunks],all_jx[0:N*num_chunks],all_x[0:N*num_chunks],all_y[0:N*num_chunks]) map(tofrom:trnorm)

#endif

#if defined (USE_OMP_OFFLOAD)
#pragma omp target teams distribute parallel for \
#if defined(INTEL_SDK) || defined(CRAY_SDK)
#pragma omp teams distribute parallel for \
shared(rowMin, rowMax) \
shared(A_index, A_value, A_nnz) \
shared(B_index, B_value, B_nnz) \
reduction(+:trnorm)
for (int chunk = 0; chunk < num_chunks; chunk++)
{
int *ix, *jx;
REAL_T *x, *y;

ix = &all_ix[chunk * N];
jx = &all_jx[chunk * N];
x = &all_x[chunk * N];
y = &all_y[chunk * N];

#else

#pragma omp target teams distribute parallel for \
shared(rowMin, rowMax) \
shared(A_index, A_value, A_nnz) \
shared(B_index, B_value, B_nnz) \
firstprivate(ix, jx, x, y) \
reduction(+:trnorm)
#endif
#else
#if defined(__IBMC__) || defined(__ibmxl__)
#pragma omp parallel for \
Expand All @@ -257,7 +291,11 @@ double TYPED_FUNC(
reduction(+:trnorm)
#endif
#endif
#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))
for (int i = rowMin + chunk; i < rowMax; i = i + num_chunks)
#else
for (int i = rowMin; i < rowMax; i++)
#endif
{

#if defined(__IBMC__) || defined(__ibmxl__)
Expand Down Expand Up @@ -321,8 +359,11 @@ double TYPED_FUNC(
}
A_nnz[i] = ll;
}
#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))
}
#endif

return trnorm;
return trnorm;
}

/** Matrix addition.
Expand All @@ -348,19 +389,46 @@ void TYPED_FUNC(
int *A_index = A->index;
REAL_T *A_value = (REAL_T *) A->value;

#if !(defined(__IBMC__) || defined(__ibmxl__))
#if !(defined(__IBMC__) || defined(__ibmxl__) || (defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))))
int jx[A_M];
REAL_T x[A_M];

memset(jx, 0, A_M * sizeof(int));
memset(x, 0.0, A_M * sizeof(REAL_T));
#endif

#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))
int num_chunks = MIN(OFFLOAD_NUM_CHUNKS, N);

int all_jx[N * num_chunks];
REAL_T all_x[N * num_chunks];

memset(all_jx, 0, N * num_chunks * sizeof(int));
memset(all_x, 0.0, N * num_chunks * sizeof(REAL_T));

#pragma omp target map(to:all_jx[0:N*num_chunks],all_x[0:N*num_chunks])

#endif

#if defined (USE_OMP_OFFLOAD)
#if defined(INTEL_SDK) || defined(CRAY_SDK)
#pragma omp teams distribute parallel for \
shared(N, A_M) \
shared(A_index, A_value, A_nnz)
for (int chunk = 0; chunk < num_chunks; chunk++)
{
int *jx;
REAL_T *x;

jx = &all_jx[chunk * N];
x = &all_x[chunk * N];

#else
#pragma omp target teams distribute parallel for \
shared(N, A_M) \
shared(A_index, A_value, A_nnz) \
firstprivate(jx, x)
#endif
#else
#if defined(__IBMC__) || defined(__ibmxl__)
#pragma omp parallel for \
Expand All @@ -373,7 +441,11 @@ void TYPED_FUNC(
firstprivate(jx, x)
#endif
#endif
#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))
for (int i = chunk; i < N; i = i + num_chunks)
#else
for (int i = 0; i < N; i++)
#endif
{

#if defined(__IBMC__) || defined(__ibmxl__)
Expand Down Expand Up @@ -423,6 +495,9 @@ void TYPED_FUNC(
}
A_nnz[i] = ll;
}
#if defined(USE_OMP_OFFLOAD) && (defined(INTEL_SDK) || defined(CRAY_SDK))
}
#endif
}

/** Matrix addition.
Expand Down
36 changes: 18 additions & 18 deletions src/C-interface/ellpack/bml_allocate_ellpack.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,24 +17,24 @@ void
bml_deallocate_ellpack(
bml_matrix_ellpack_t * A)
{
#ifdef USE_OMP_OFFLOAD
int N = A->N;
int M = A->M;

int *A_nnz = A->nnz;
int *A_index = A->index;
// JAMAL: need to make a typed deallocator
double *A_value = A->value;

#pragma omp target exit data map(delete: A_nnz[:N], A_index[:N*M], A_value[:N*M])
#endif

bml_deallocate_domain(A->domain);
bml_deallocate_domain(A->domain2);
bml_free_memory(A->value);
bml_free_memory(A->index);
bml_free_memory(A->nnz);
bml_free_memory(A);
switch (A->matrix_precision)
{
case single_real:
bml_deallocate_ellpack_single_real(A);
break;
case double_real:
bml_deallocate_ellpack_double_real(A);
break;
case single_complex:
bml_deallocate_ellpack_single_complex(A);
break;
case double_complex:
bml_deallocate_ellpack_double_complex(A);
break;
default:
LOG_ERROR("unknown precision\n");
break;
}
}

/** Clear a matrix.
Expand Down
12 changes: 12 additions & 0 deletions src/C-interface/ellpack/bml_allocate_ellpack.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,18 @@
void bml_deallocate_ellpack(
bml_matrix_ellpack_t * A);

void bml_deallocate_ellpack_single_real(
bml_matrix_ellpack_t * A);

void bml_deallocate_ellpack_double_real(
bml_matrix_ellpack_t * A);

void bml_deallocate_ellpack_single_complex(
bml_matrix_ellpack_t * A);

void bml_deallocate_ellpack_double_complex(
bml_matrix_ellpack_t * A);

void bml_clear_ellpack(
bml_matrix_ellpack_t * A);

Expand Down
29 changes: 29 additions & 0 deletions src/C-interface/ellpack/bml_allocate_ellpack_typed.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,35 @@
#include <omp.h>
#endif

/** Deallocate a matrix.
*
* \ingroup allocate_group
*
* \param A The matrix.
*/
void TYPED_FUNC(
bml_deallocate_ellpack) (
bml_matrix_ellpack_t * A)
{
#ifdef USE_OMP_OFFLOAD
int N = A->N;
int M = A->M;

int *A_nnz = A->nnz;
int *A_index = A->index;
REAL_T *A_value = A->value;

#pragma omp target exit data map(delete: A_nnz[:N], A_index[:N*M], A_value[:N*M])
#endif

bml_deallocate_domain(A->domain);
bml_deallocate_domain(A->domain2);
bml_free_memory(A->value);
bml_free_memory(A->index);
bml_free_memory(A->nnz);
bml_free_memory(A);
}

/** Clear a matrix.
*
* Numbers of non-zeroes, indeces, and values are set to zero.
Expand Down
Loading