Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update of AWS parallelcluster site config and instructions #490

Merged
merged 8 commits into from
Mar 8, 2023
192 changes: 156 additions & 36 deletions configs/sites/aws-pcluster/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@

### Base instance
Choose a basic AMI from the Community AMIs tab that matches your desired OS and parallelcluster version. Select an instance type of the same family that you are planning to use for the head and the compute nodes, and enough storage for a swap file and a spack-stack installation. For example:
- AMI ID: ami-091017c7508ac95f6
- Instance c6i.4xlarge
- Use 250GB of gp3 storage as /
- AMI ID: ami-093dab62f7840644b
- Instance c6i.8xlarge
- Use 500GB of gp3 storage as /

### Prerequisites
1. As `root`:
Expand All @@ -15,13 +15,80 @@ sudo su
apt-get -y update
apt-get -y upgrade
# These were already installed
#apt install -y apt-utils
apt install -y apt-utils

# Compilers - already installed
#apt install -y gcc g++ gfortran gdb
apt install -y gcc g++ gfortran gdb

# Install lua/lmod manually, because apt only has older versions
# that are not compatible with the modern lua modules spack produces
# https://lmod.readthedocs.io/en/latest/030_installing.html#install-lua-x-y-z-tar-gz
mkdir -p /opt/lua/5.1.4.9/src
cd /opt/lua/5.1.4.9/src
wget https://sourceforge.net/projects/lmod/files/lua-5.1.4.9.tar.bz2
tar -xvf lua-5.1.4.9.tar.bz2
cd lua-5.1.4.9
./configure --prefix=/opt/lua/5.1.4.9 2>&1 | tee log.config
make VERBOSE=1 2>&1 | tee log.make
make install 2>&1 | tee log.install
#
echo "# Set environment variables for lua" >> /etc/profile.d/02-lua.sh
echo "export PATH=\"/opt/lua/5.1.4.9/bin:\$PATH\"" >> /etc/profile.d/02-lua.sh
echo "export LD_LIBRARY_PATH=\"/opt/lua/5.1.4.9/lib:\$LD_LIBRARY_PATH\"" >> /etc/profile.d/02-lua.sh
echo "export CPATH=\"/opt/lua/5.1.4.9/include:\$CPATH\"" >> /etc/profile.d/02-lua.sh
echo "export MANPATH=\"/opt/lua/5.1.4.9/man:\$MANPATH\"" >> /etc/profile.d/02-lua.sh
#
source /etc/profile.d/02-lua.sh

mkdir -p /opt/lmod/8.7/src
cd /opt/lmod/8.7/src
wget https://sourceforge.net/projects/lmod/files/Lmod-8.7.tar.bz2
tar -xvf Lmod-8.7.tar.bz2
cd Lmod-8.7
# Note the weird prefix, lmod installs in PREFIX/lmod/X.Y automatically
./configure --prefix=/opt/ \
--with-lmodConfigDir=/opt/lmod/8.7/config \
2>&1 | tee log.config
make install 2>&1 | tee log.install
ln -sf /opt/lmod/lmod/init/profile /etc/profile.d/z00_lmod.sh
ln -sf /opt/lmod/lmod/init/cshrc /etc/profile.d/z00_lmod.csh
ln -sf /opt/lmod/lmod/init/profile.fish /etc/profile.d/z00_lmod.fish

# Add custom module locations and fix existing modules
#
# intelmpi
echo "conflict openmpi" >> /opt/intel/mpi/2021.6.0/modulefiles/intelmpi
echo 'if { [ module-info mode load ] && ![ is-loaded libfabric-aws/1.16.0~amzn4.0 ] } {' >> /opt/intel/mpi/2021.6.0/modulefiles/intelmpi
echo ' module load libfabric-aws/1.16.0~amzn4.0' >> /opt/intel/mpi/2021.6.0/modulefiles/intelmpi
echo '}' >> /opt/intel/mpi/2021.6.0/modulefiles/intelmpi
# openmpi
echo "conflict intelmpi" >> /usr/share/modules/modulefiles/openmpi/4.1.4
echo 'if { [ module-info mode load ] && ![ is-loaded libfabric-aws/1.16.0~amzn4.0 ] } {' >> /usr/share/modules/modulefiles/openmpi/4.1.4
echo ' module load libfabric-aws/1.16.0~amzn4.0' >> /usr/share/modules/modulefiles/openmpi/4.1.4
echo '}' >> /usr/share/modules/modulefiles/openmpi/4.1.4
#
echo "module use /usr/share/modules/modulefiles" >> /etc/profile.d/z01_lmod.sh
echo "module use /opt/intel/mpi/2021.6.0/modulefiles" >> /etc/profile.d/z01_lmod.sh
echo "module use /home/ubuntu/jedi/modulefiles" >> /etc/profile.d/z01_lmod.sh
#
# Log out completely, ssh back into the instance and check if lua modules work
exit
exit

# Environment module support - already installed
#apt install -y environment-modules
ssh ...
# Now user ubuntu
module av
module load libfabric-aws/1.16.0~amzn4.0
module load openmpi/4.1.4
module list
module unload openmpi/4.1.4
module load intelmpi
module list
module purge
module list

# Continue as root
sudo su

# Misc
apt install -y build-essential
Expand All @@ -42,11 +109,8 @@ apt install -y qt5-default
apt install -y libqt5svg5-dev
apt install -y qt5dxcb-plugin

# For R2D2 mysql backend
apt install -y mysql-server

# Remove AWS openmpi
apt remove -y openmpi40-aws
### # Remove AWS openmpi
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note to self. Remove lines 112-113 before merging.

### apt remove -y openmpi40-aws

# This is because boost doesn't work with the Intel compiler
apt install -y libboost1.71-dev
Expand All @@ -70,7 +134,7 @@ apt install -y python3-dev python3-pip
wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list
apt-get update
apt-get install -y intel-hpckit-2021.4.0/all
apt-get install -y intel-hpckit-2022.2.0/all

# Docker
# See https://docs.docker.com/engine/install/ubuntu/
Expand All @@ -80,7 +144,8 @@ mkdir -m 0755 -p /etc/apt/keyrings
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg
echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu \
$(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null
apt-get install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
apt-get update
apt install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
docker run hello-world
# DH* TODO 2023/02/21: Add users to group docker so that non-root users can run it
# See https://docs.docker.com/engine/install/linux-postinstall/
Expand Down Expand Up @@ -124,52 +189,104 @@ export BOOST_ROOT=/usr
cd $WK
mkdir build
cd build
cmake .. -DENABLE_STATIC_BOOST_LIBS=OFF -DCMAKE_INSTALL_PREFIX=/home/ubuntu/jedi/ecflow-5.8.4 2>&1 | tee log.cmake
cmake .. -DPython3_EXECUTABLE=/usr/bin/python3 -DENABLE_STATIC_BOOST_LIBS=OFF -DCMAKE_INSTALL_PREFIX=/home/ubuntu/jedi/ecflow-5.8.4 2>&1 | tee log.cmake
make -j4 2>&1 | tee log.make
make install 2>&1 | tee log.install

# Create a modulefiles directory with the following ecflow/5.8.4 module in it (w/o the '%%%%...' lines):
mkdir -p /home/ubuntu/jedi/modulefiles/ecflow
vi /home/ubuntu/jedi/modulefiles/ecflow/5.8.4
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#%Module1.0

module-whatis "Provides an ecflow-5.8.4 server+ui installation for use with spack."

conflict ecflow

proc ModulesHelp { } {
puts stderr "Provides an ecflow-5.8.4 server+ui installation for use with spack."
}

# Set this value
set ECFLOW_PATH "/home/ubuntu/jedi/ecflow-5.8.4"

prepend-path PATH "${ECFLOW_PATH}/bin"
prepend-path LD_LIBRARY_PATH "${ECFLOW_PATH}/lib"
prepend-path LIBRARY_PATH "${ECFLOW_PATH}/lib"
prepend-path CPATH "${ECFLOW_PATH}/include"
prepend-path CMAKE_PREFIX_PATH "${ECFLOW_PATH}"
prepend-path PYTHONPATH "${ECFLOW_PATH}/lib/python3.8/site-packages"
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
```
4. Option 1: Use pre-defined site config in spack-stack (skip steps 5-7 afterwards)

4. Install MySQL community server
```
cd /home/ubuntu/jedi
mkdir -p mysql-8.0.31/src
cd mysql-8.0.31/src
wget https://dev.mysql.com/get/Downloads/MySQL-8.0/mysql-server_8.0.32-1ubuntu20.04_amd64.deb-bundle.tar
tar -xvf mysql-server_8.0.32-1ubuntu20.04_amd64.deb-bundle.tar
# Switch to root
sudo su
dpkg -i *.deb
apt --fix-broken install
dpkg -i *.deb
# Set root password, choose strong password encryption option
exit
rm *.deb
```

5. Option 1: Testing existing site config in spack-stack (skip steps 6-7 afterwards)
```
mkdir -p /home/ubuntu/sandpit
cd /home/ubuntu/sandpit
git clone -b develop --recursive https://github.com/noaa-emc/spack-stack spack-stack
cd spack-stack/
. setup.sh
spack stack create env --site aws-pcluster --template=skylab-dev --name=skylab-2.0.0-intel-2021.4.0
spack env activate -p envs/skylab-2.0.0-intel-2021.4.0
```
5. Option 2: For spack site configuration, to find Intel compiler
```
export PATH=/opt/intel/oneapi/compiler/2021.4.0/linux/bin/intel64:$PATH
spack stack create env --site aws-pcluster --template=unified-dev --name=unified-dev
spack env activate -p envs/unified-dev
sed -i "s/\['\%apple-clang', '\%gcc', '\%intel'\]/\['\%intel', '\%gcc'\]/g" envs/unified-dev/spack.yaml
```

6. Option 2: Configure site from scratch
6. Option 2: Test configuring site from scratch
```
mkdir /home/ubuntu/jedi && cd /home/ubuntu/jedi
git clone -b develop --recursive https://github.com/noaa-emc/spack-stack spack-stack
cd spack-stack/
. setup.sh
spack stack create env --site linux.default --template=skylab-dev --name=skylab-2.0.0-intel-2021.4.0
spack env activate -p envs/skylab-2.0.0-intel-2021.4.0
spack stack create env --site linux.default --template=unified-dev --name=unified-dev
spack env activate -p envs/unified-dev

export SPACK_SYSTEM_CONFIG_PATH=/home/ubuntu/jedi/spack-stack/envs/skylab-2.0.0-intel-2021.4.0/site
export SPACK_SYSTEM_CONFIG_PATH=/home/ubuntu/jedi/spack-stack/envs/unified-dev/site

spack external find --scope system
spack external find --scope system perl
spack external find --scope system python
spack external find --scope system wget
spack external find --scope system curl
spack external find --scope system texlive
spack external find --scope system mysql

# No external find for pre-installed intel-oneapi-mpi (from pcluster AMI),
# and no way to add object entry to list using "spack config add".
echo " intel-oneapi-mpi:" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml
echo " buildable: False" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml
echo " externals:" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml
echo " - spec: intel-oneapi-mpi@2021.4.0%intel@2021.4.0" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml
echo " - spec: intel-oneapi-mpi@2021.6.0%intel@2022.1.0" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml
echo " prefix: /opt/intel" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml
echo " modules:" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml
echo " - libfabric-aws/1.16.0~amzn4.0" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml
echo " - intelmpi" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml

# Add external openmpi
echo " openmpi:" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml
echo " externals:" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml
echo " - spec: openmpi@4.1.4%gcc@9.4.0~cuda~cxx~cxx_exceptions~java~memchecker+pmi~static~wrapper-rpath" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml
echo " fabrics=ofi schedulers=slurm" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml
echo " prefix: /opt/amazon/openmpi" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml
echo " modules:" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml
echo " - libfabric-aws/1.16.0~amzn3.0" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml
echo " - openmpi/4.1.4" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml

# Can't find qt5 because qtpluginfo is broken,
# and no way to add object entry to list using "spack config add".
echo " qt:" >> ${SPACK_SYSTEM_CONFIG_PATH}/packages.yaml
Expand All @@ -196,28 +313,31 @@ spack compiler find --scope system

export -n SPACK_SYSTEM_CONFIG_PATH

spack config add "packages:mpi:buildable:False"
spack config add "packages:python:buildable:False"
spack config add "packages:openssl:buildable:False"
spack config add "packages:all:providers:mpi:[intel-oneapi-mpi@2021.4.0]"
spack config add "packages:all:compiler:[intel@2021.4.0]"
spack config add "packages:all:providers:mpi:[intel-oneapi-mpi@2021.6.0, openmpi@4.1.4]"
spack config add "packages:all:compiler:[intel@2022.1.0, gcc@9.4.0]"

# edit envs/skylab-2.0.0-intel-2021.4.0/site/compilers.yaml and replace the following line in the **Intel** compiler section:
# edit envs/unified-dev/site/compilers.yaml and replace the following line in the **Intel** compiler section:
# environment: {}
# -->
# environment:
# prepend_path:
# LD_LIBRARY_PATH: '/opt/intel/oneapi/compiler/2021.4.0/linux/compiler/lib/intel64_lin'
# LD_LIBRARY_PATH: '/opt/intel/oneapi/compiler/2021.6.0/linux/compiler/lib/intel64_lin'
# set:
# I_MPI_PMI_LIBRARY: '/opt/slurm/lib/libpmi.so'

# edit envs/skylab-2.0.0-intel-2021.4.0/site/packages.yaml and remove the older Python versions, keep 3.8.10 only
```

7. Option 2: Temporary workarounds to avoid duplicate hdf5, cmake etc. versions. Edit ``envs/skylab-2.0.0-intel-2021.4.0/site/packages.yaml`` and remove the external ``cmake`` and ``openssl`` entries.
7. Option 2: To avoid duplicate hdf5, cmake, ... versions, edit ``envs/unified-dev/site/packages.yaml`` and remove the external ``cmake`` and ``openssl`` entries.

8. Concretize and install
```
spack concretize 2>&1 | tee log.concretize
spack install --verbose --source 2>&1 | tee log.install
spack module lmod refresh
spack stack setup-meta-modules
```
9. Create the AMI for use in the AWS parallelcluster config.
9. Test spack-stack installation using your favorite application.
10. (Optional) Remove test installs of spack-stack environments, if desired.
11. Create the AMI for use in the AWS parallelcluster config.
30 changes: 24 additions & 6 deletions configs/sites/aws-pcluster/compilers.yaml
Original file line number Diff line number Diff line change
@@ -1,21 +1,39 @@
compilers:
- compiler:
spec: intel@2021.4.0
spec: intel@2022.1.0
paths:
cc: /opt/intel/oneapi/compiler/2021.4.0/linux/bin/intel64/icc
cxx: /opt/intel/oneapi/compiler/2021.4.0/linux/bin/intel64/icpc
f77: /opt/intel/oneapi/compiler/2021.4.0/linux/bin/intel64/ifort
fc: /opt/intel/oneapi/compiler/2021.4.0/linux/bin/intel64/ifort
cc: /opt/intel/oneapi/compiler/2022.1.0/linux/bin/intel64/icc
cxx: /opt/intel/oneapi/compiler/2022.1.0/linux/bin/intel64/icpc
f77: /opt/intel/oneapi/compiler/2022.1.0/linux/bin/intel64/ifort
fc: /opt/intel/oneapi/compiler/2022.1.0/linux/bin/intel64/ifort
flags: {}
operating_system: ubuntu20.04
target: x86_64
modules: []
environment:
prepend_path:
LD_LIBRARY_PATH: '/opt/intel/oneapi/compiler/2021.4.0/linux/compiler/lib/intel64_lin'
LD_LIBRARY_PATH: '/opt/intel/oneapi/compiler/2022.1.0/linux/compiler/lib/intel64_lin'
set:
I_MPI_PMI_LIBRARY: '/opt/slurm/lib/libpmi.so'
extra_rpaths: []
# Spack gets confused if there is an Intel and a OneAPI compiler ...
#- compiler:
# spec: oneapi@2022.1.0
# paths:
# cc: /opt/intel/oneapi/compiler/2022.1.0/linux/bin/icx
# cxx: /opt/intel/oneapi/compiler/2022.1.0/linux/bin/icpx
# f77: /opt/intel/oneapi/compiler/2022.1.0/linux/bin/ifx
# fc: /opt/intel/oneapi/compiler/2022.1.0/linux/bin/ifx
# flags: {}
# operating_system: ubuntu20.04
# target: x86_64
# modules: []
# environment:
# #prepend_path:
# # LD_LIBRARY_PATH: '/opt/intel/oneapi/compiler/2022.1.0/linux/compiler/lib/intel64_lin'
# set:
# I_MPI_PMI_LIBRARY: '/opt/slurm/lib/libpmi.so'
# extra_rpaths: []
- compiler:
spec: gcc@9.4.0
paths:
Expand Down
7 changes: 3 additions & 4 deletions configs/sites/aws-pcluster/modules.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
modules:
default:
enable::
- tcl
tcl:
- lmod
lmod:
whitelist:
# List of packages for which we need modules that are blacklisted by default
- openmpi
- mpich
- python
Loading