Skip to content

Commit

Permalink
fix incorrect settings for --npernode on skybridge and chama
Browse files Browse the repository at this point in the history
--npernode was set to the number of cores per node, independent of
the number of threads used, resulting in oversubscribed nodes.
The updated skybridge environment now rejects these jobs (and we shouldn't run
this way anyway).

Also:
removed the duplicate sbatch setting (so this is only set in one place)
added support for SNL machine 'ghost' (Intel Broadwell cluster)
removed support for the decommissioned redsky

[BFB]
  • Loading branch information
mt5555 committed Aug 9, 2017
1 parent ee8bef2 commit 45a134a
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 18 deletions.
18 changes: 5 additions & 13 deletions cime/config/acme/machines/config_batch.xml
Original file line number Diff line number Diff line change
Expand Up @@ -302,32 +302,24 @@
</batch_system>

<batch_system MACH="skybridge" type="slurm" >
<directives>
<directive>--ntasks-per-node={{ tasks_per_node }}</directive>
</directives>
<queues>
<queue jobmin="1" jobmax="1024" walltimemax="06:00:00" default="true">ec</queue>
<queue jobmin="1" jobmax="1024" walltimemax="06:00:00" default="true">batch</queue>
</queues>
</batch_system>

<batch_system MACH="chama" type="slurm" >
<directives>
<directive>--ntasks-per-node={{ tasks_per_node }}</directive>
</directives>
<queues>
<queue jobmin="1" jobmax="1024" walltimemax="06:00:00" default="true">ec</queue>
<queue jobmin="1" jobmax="1024" walltimemax="06:00:00" default="true">batch</queue>
</queues>
</batch_system>

<batch_system MACH="redsky" type="slurm" >
<directives>
<directive>--ntasks-per-node={{ tasks_per_node }}</directive>
</directives>
<batch_system MACH="ghost" type="slurm" >
<queues>
<queue jobmin="1" jobmax="1024" walltimemax="06:00:00" default="true">ec</queue>
<queue jobmin="1" jobmax="1024" walltimemax="06:00:00" default="true">batch</queue>
</queues>
</batch_system>


<batch_system MACH="mustang" type="moab" >
<directives>
<directive>-l nodes={{ num_nodes }}:ppn={{ tasks_per_node }}</directive>
Expand Down
5 changes: 3 additions & 2 deletions cime/config/acme/machines/config_compilers.xml
Original file line number Diff line number Diff line change
Expand Up @@ -702,18 +702,19 @@ for mct, etc.
<ALBANY_PATH>/projects/ccsm/AlbanyTrilinos_06262017/Albany/build/install</ALBANY_PATH>
</compiler>

<compiler COMPILER="intel" MACH="redsky">
<compiler COMPILER="intel" MACH="ghost">
<ADD_FFLAGS DEBUG="FALSE"> -O2 </ADD_FFLAGS>
<ADD_CFLAGS DEBUG="FALSE"> -O2 </ADD_CFLAGS>
<NETCDF_PATH>$(NETCDFROOT)</NETCDF_PATH>
<PNETCDF_PATH>$(PNETCDFROOT)</PNETCDF_PATH>
<MPI_PATH MPILIB="openmpi">/opt/openmpi-1.8-intel</MPI_PATH>
<ESMF_LIBDIR>/projects/ccsm/esmf-6.3.0rp1/lib/libO/Linux.intel.64.openmpi.default</ESMF_LIBDIR>
<CONFIG_ARGS> --host=Linux </CONFIG_ARGS>
<ADD_SLIBS> $(shell $(NETCDF_PATH)/bin/nf-config --flibs) -L/projects/ccsm/BLAS-intel -lblas_LINUX</ADD_SLIBS>
<PIO_FILESYSTEM_HINTS>lustre </PIO_FILESYSTEM_HINTS>
<ADD_SLIBS MPILIB="openmpi"> -mkl=cluster </ADD_SLIBS>
<ADD_SLIBS MPILIB="mpi-serial"> -mkl </ADD_SLIBS>
<ALBANY_PATH>/projects/ccsm/AlbanyTrilinos/Albany/build/install</ALBANY_PATH>
<ALBANY_PATH>/projects/ccsm/AlbanyTrilinos_06262017/Albany/build/install</ALBANY_PATH>
</compiler>

<compiler COMPILER="gnu" MACH="penn">
Expand Down
72 changes: 70 additions & 2 deletions cime/config/acme/machines/config_machines.xml
Original file line number Diff line number Diff line change
Expand Up @@ -710,7 +710,7 @@
<arguments>
<arg name="bind"> --bind-to-core</arg>
<arg name="num_tasks"> --n $TOTALPES</arg>
<arg name="tasks_per_node"> --npernode $PES_PER_NODE</arg>
<arg name="tasks_per_node"> --npernode {{tasks_per_node}}</arg>
</arguments>
</mpirun>
<mpirun mpilib="mpi-serial">
Expand Down Expand Up @@ -778,7 +778,7 @@
<arguments>
<arg name="bind"> --bind-to-core</arg>
<arg name="num_tasks"> --n $TOTALPES</arg>
<arg name="tasks_per_node"> --npernode $PES_PER_NODE</arg>
<arg name="tasks_per_node"> --npernode {{tasks_per_node}}</arg>
</arguments>
</mpirun>
<mpirun mpilib="mpi-serial">
Expand Down Expand Up @@ -827,6 +827,74 @@
</environment_variables>
</machine>

<machine MACH="ghost">
  <!-- Machine entry for the SNL cluster "ghost": Intel compiler, OpenMPI or
       mpi-serial, Slurm batch system, environment set up through lmod modules. -->
  <DESC>SNL cluster</DESC>
  <NODENAME_REGEX>ghost-login</NODENAME_REGEX>
  <PROXY>wwwproxy.sandia.gov:80</PROXY>
  <TESTS>acme_integration</TESTS>
  <COMPILERS>intel</COMPILERS>
  <MPILIBS>openmpi,mpi-serial</MPILIBS>
  <OS>LINUX</OS>

  <!-- Case output, run and build directories all live under the per-user
       scratch root defined by CIME_OUTPUT_ROOT. -->
  <CIME_OUTPUT_ROOT>/gscratch/$USER/acme_scratch/ghost</CIME_OUTPUT_ROOT>
  <RUNDIR>$CIME_OUTPUT_ROOT/$CASE/run</RUNDIR>
  <EXEROOT>$CIME_OUTPUT_ROOT/$CASE/bld</EXEROOT>
  <DIN_LOC_ROOT>/projects/ccsm/inputdata</DIN_LOC_ROOT>
  <DIN_LOC_ROOT_CLMFORC>/projects/ccsm/inputdata/atm/datm7</DIN_LOC_ROOT_CLMFORC>
  <DOUT_S_ROOT>$CIME_OUTPUT_ROOT/archive/$CASE</DOUT_S_ROOT> <!-- complete path to a short term archiving directory -->
  <DOUT_L_MSROOT>USERDEFINED_optional_run</DOUT_L_MSROOT> <!-- complete path to a long term archiving directory -->
  <BASELINE_ROOT>/projects/ccsm/ccsm_baselines</BASELINE_ROOT>
  <CCSM_CPRNC>/projects/ccsm/cprnc/build.toss3/cprnc_wrap</CCSM_CPRNC> <!-- path to the cprnc tool used to compare netcdf history files in testing -->
  <BATCH_SYSTEM>slurm</BATCH_SYSTEM>
  <SUPPORTED_BY>jgfouca at sandia dot gov</SUPPORTED_BY>
  <GMAKE_J>8</GMAKE_J>
  <!-- Both per-node limits are set to 36 for this machine. -->
  <MAX_TASKS_PER_NODE>36</MAX_TASKS_PER_NODE>
  <PES_PER_NODE>36</PES_PER_NODE>
  <PIO_BUFFER_SIZE_LIMIT>1</PIO_BUFFER_SIZE_LIMIT>
  <PROJECT_REQUIRED>TRUE</PROJECT_REQUIRED>
  <PROJECT>fy150001</PROJECT>

  <!-- Default MPI launcher. The per-node task count comes from the
       tasks_per_node template variable rather than $PES_PER_NODE, so the
       npernode option is not pinned to the core count regardless of how many
       threads each task uses.
       NOTE(review): the placeholder is written with spaces inside the braces
       to match the form used by the batch directives; confirm the template
       expander accepts this form (it may be the only form it accepts). -->
  <mpirun mpilib="default">
    <executable>mpiexec</executable>
    <arguments>
      <arg name="bind"> --bind-to-core</arg>
      <arg name="num_tasks"> --n $TOTALPES</arg>
      <arg name="tasks_per_node"> --npernode {{ tasks_per_node }}</arg>
    </arguments>
  </mpirun>
  <!-- mpi-serial builds: the launcher executable is intentionally left empty. -->
  <mpirun mpilib="mpi-serial">
    <executable></executable>
  </mpirun>

  <module_system type="module">
    <!-- lmod initialisation scripts and command paths, one per scripting language. -->
    <init_path lang="python">/usr/share/lmod/lmod/init/python.py</init_path>
    <init_path lang="perl">/usr/share/lmod/lmod/init/perl.pm</init_path>
    <init_path lang="sh">/usr/share/lmod/lmod/init/sh</init_path>
    <init_path lang="csh">/usr/share/lmod/lmod/init/csh</init_path>
    <cmd_path lang="python">/usr/share/lmod/lmod/libexec/lmod python</cmd_path>
    <cmd_path lang="perl">/usr/share/lmod/lmod/libexec/lmod perl</cmd_path>
    <cmd_path lang="csh">module</cmd_path>
    <cmd_path lang="sh">module</cmd_path>
    <!-- Commands are listed in order, starting from a purge. The OpenMPI
         module carries mpilib="!mpi-serial", presumably so it is skipped for
         mpi-serial builds (verify against the module loader). -->
    <modules>
      <command name="purge"/>
      <command name="load">sems-env</command>
      <command name="load">sems-git</command>
      <command name="load">sems-python/2.7.9</command>
      <command name="load">sems-cmake</command>
      <command name="load">gnu/4.9.2</command>
      <command name="load">sems-intel/16.0.2</command>
      <command name="load" mpilib="!mpi-serial">sems-openmpi/1.10.5</command>
      <command name="load">mkl/16.0</command>
      <command name="load">sems-netcdf/4.4.1/exo_parallel</command>
    </modules>
  </module_system>

  <!-- NETCDFROOT and PNETCDFROOT are both taken from SEMS_NETCDF_ROOT;
       PNETCDFROOT carries the same mpilib="!mpi-serial" restriction as the
       OpenMPI module above. -->
  <environment_variables>
    <env name="NETCDFROOT">$ENV{SEMS_NETCDF_ROOT}</env>
    <env name="PNETCDFROOT" mpilib="!mpi-serial">$ENV{SEMS_NETCDF_ROOT}</env>
    <env name="NETCDF_INCLUDES">$ENV{SEMS_NETCDF_ROOT}/include</env>
    <env name="NETCDF_LIBS">$ENV{SEMS_NETCDF_ROOT}/lib</env>
    <env name="OMP_STACKSIZE">64M</env>
  </environment_variables>
</machine>

<machine MACH="blues">
<DESC>ANL/LCRC Linux Cluster</DESC>
<NODENAME_REGEX>blogin.*.lcrc.anl.gov</NODENAME_REGEX>
Expand Down
2 changes: 1 addition & 1 deletion components/clm/src/external_models/sbetr
Submodule sbetr updated from 0bec95 to 627407

0 comments on commit 45a134a

Please sign in to comment.