From 9cdb01a77374417cacf4404ebb1e8bd7cb552e41 Mon Sep 17 00:00:00 2001 From: Mark Taylor Date: Wed, 9 Aug 2017 13:07:30 -0600 Subject: [PATCH 1/3] fix incorrect settings for --npernode on skybridge --npernode was set to the number of cores per node, independent of the number of threads used, resulting in oversubscribed nodes. Updated skybridge environment now rejects these jobs (and we shouldn't run this way anyway). Also: remove duplicate sbatch setting (so this is only set in one place) added support for Intel Broadwell machine 'ghost' removed support for decommissioned redsky [BFB] --- config/acme/machines/config_batch.xml | 18 ++---- config/acme/machines/config_compilers.xml | 5 +- config/acme/machines/config_machines.xml | 72 ++++++++++++++++++++++- 3 files changed, 78 insertions(+), 17 deletions(-) diff --git a/config/acme/machines/config_batch.xml b/config/acme/machines/config_batch.xml index 74b0055e892..76ec9506d09 100644 --- a/config/acme/machines/config_batch.xml +++ b/config/acme/machines/config_batch.xml @@ -302,32 +302,24 @@ - - --ntasks-per-node={{ tasks_per_node }} - - ec + batch - - --ntasks-per-node={{ tasks_per_node }} - - ec + batch - - - --ntasks-per-node={{ tasks_per_node }} - + - ec + batch + -l nodes={{ num_nodes }}:ppn={{ tasks_per_node }} diff --git a/config/acme/machines/config_compilers.xml b/config/acme/machines/config_compilers.xml index b514dd6bc3c..adfa7c8fe05 100644 --- a/config/acme/machines/config_compilers.xml +++ b/config/acme/machines/config_compilers.xml @@ -702,18 +702,19 @@ for mct, etc. 
/projects/ccsm/AlbanyTrilinos_06262017/Albany/build/install - + -O2 -O2 $(NETCDFROOT) $(PNETCDFROOT) + /opt/openmpi-1.8-intel /projects/ccsm/esmf-6.3.0rp1/lib/libO/Linux.intel.64.openmpi.default --host=Linux $(shell $(NETCDF_PATH)/bin/nf-config --flibs) -L/projects/ccsm/BLAS-intel -lblas_LINUX lustre -mkl=cluster -mkl - /projects/ccsm/AlbanyTrilinos/Albany/build/install + /projects/ccsm/AlbanyTrilinos_06262017/Albany/build/install diff --git a/config/acme/machines/config_machines.xml b/config/acme/machines/config_machines.xml index 87f5bf331fa..2ae5ae473c1 100644 --- a/config/acme/machines/config_machines.xml +++ b/config/acme/machines/config_machines.xml @@ -710,7 +710,7 @@ --bind-to-core --n $TOTALPES - --npernode $PES_PER_NODE + --npernode {{tasks_per_node}} @@ -778,7 +778,7 @@ --bind-to-core --n $TOTALPES - --npernode $PES_PER_NODE + --npernode {{tasks_per_node}} @@ -827,6 +827,74 @@ + + SNL clust + ghost-login + wwwproxy.sandia.gov:80 + acme_integration + intel + openmpi,mpi-serial + LINUX + /gscratch/$USER/acme_scratch/ghost + $CIME_OUTPUT_ROOT/$CASE/run + $CIME_OUTPUT_ROOT/$CASE/bld + /projects/ccsm/inputdata + /projects/ccsm/inputdata/atm/datm7 + $CIME_OUTPUT_ROOT/archive/$CASE + USERDEFINED_optional_run + /projects/ccsm/ccsm_baselines + /projects/ccsm/cprnc/build.toss3/cprnc_wrap + slurm + jgfouca at sandia dot gov + 8 + 36 + 36 + 1 + TRUE + fy150001 + + + mpiexec + + --bind-to-core + --n $TOTALPES + --npernode {{tasks_per_node}} + + + + + + + /usr/share/lmod/lmod/init/python.py + /usr/share/lmod/lmod/init/perl.pm + /usr/share/lmod/lmod/init/sh + /usr/share/lmod/lmod/init/csh + /usr/share/lmod/lmod/libexec/lmod python + /usr/share/lmod/lmod/libexec/lmod perl + module + module + + + sems-env + sems-git + sems-python/2.7.9 + sems-cmake + gnu/4.9.2 + sems-intel/16.0.2 + sems-openmpi/1.10.5 + mkl/16.0 + sems-netcdf/4.4.1/exo_parallel + + + + $ENV{SEMS_NETCDF_ROOT} + $ENV{SEMS_NETCDF_ROOT} + $ENV{SEMS_NETCDF_ROOT}/include + $ENV{SEMS_NETCDF_ROOT}/lib + 64M + + 
+ ANL/LCRC Linux Cluster blogin.*.lcrc.anl.gov From b7eeb2bd1ecf403fbe68fb6a9feea55e617abe2d Mon Sep 17 00:00:00 2001 From: Mark Taylor Date: Wed, 9 Aug 2017 15:59:01 -0600 Subject: [PATCH 2/3] bug fix: needed spaces {{ }} CIME syntax --- config/acme/machines/config_machines.xml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/config/acme/machines/config_machines.xml b/config/acme/machines/config_machines.xml index 2ae5ae473c1..58dec3cfc3f 100644 --- a/config/acme/machines/config_machines.xml +++ b/config/acme/machines/config_machines.xml @@ -710,7 +710,7 @@ --bind-to-core --n $TOTALPES - --npernode {{tasks_per_node}} + --npernode {{ tasks_per_node }} @@ -778,7 +778,7 @@ --bind-to-core --n $TOTALPES - --npernode {{tasks_per_node}} + --npernode {{ tasks_per_node }} @@ -858,7 +858,7 @@ --bind-to-core --n $TOTALPES - --npernode {{tasks_per_node}} + --npernode {{ tasks_per_node }} From 0f4c7917c2985502422cbfcd66263e7338bb0545 Mon Sep 17 00:00:00 2001 From: Mark Taylor Date: Wed, 9 Aug 2017 17:22:22 -0600 Subject: [PATCH 3/3] replacing --npernode with mapping arguments copied from Anvil [BFB] --- config/acme/machines/config_machines.xml | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/config/acme/machines/config_machines.xml b/config/acme/machines/config_machines.xml index 58dec3cfc3f..737968149d0 100644 --- a/config/acme/machines/config_machines.xml +++ b/config/acme/machines/config_machines.xml @@ -708,9 +708,8 @@ mpiexec - --bind-to-core --n $TOTALPES - --npernode {{ tasks_per_node }} + --map-by ppr:{{ tasks_per_numa }}:socket:PE=$ENV{OMP_NUM_THREADS} --bind-to core @@ -776,9 +775,8 @@ mpiexec - --bind-to-core --n $TOTALPES - --npernode {{ tasks_per_node }} + --map-by ppr:{{ tasks_per_numa }}:socket:PE=$ENV{OMP_NUM_THREADS} --bind-to core @@ -856,9 +854,8 @@ mpiexec - --bind-to-core --n $TOTALPES - --npernode {{ tasks_per_node }} + --map-by ppr:{{ tasks_per_numa }}:socket:PE=$ENV{OMP_NUM_THREADS} --bind-to core