Skip to content

Commit

Permalink
Merge branch 'mt5555/cime/skybridge3'
Browse files Browse the repository at this point in the history
--npernode was set to the number of cores per node, independent of
the number of threads used, resulting in oversubscribed nodes.
Updated skybridge environment now rejects these jobs (and we shouldn't run
this way anyway).

Also:
remove duplicate sbatch setting (so this is only set in one place)
added support for Intel Broadwell machine 'ghost'
removed support for decommissioned redsky

[BFB]
  • Loading branch information
mfdeakin-sandia committed Aug 11, 2017
2 parents ce88a5a + 0f4c791 commit 57d0825
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 19 deletions.
18 changes: 5 additions & 13 deletions config/acme/machines/config_batch.xml
Original file line number Diff line number Diff line change
Expand Up @@ -302,32 +302,24 @@
</batch_system>

<batch_system MACH="skybridge" type="slurm" >
<directives>
<directive>--ntasks-per-node={{ tasks_per_node }}</directive>
</directives>
<queues>
<queue jobmin="1" jobmax="1024" walltimemax="06:00:00" default="true">ec</queue>
<queue jobmin="1" jobmax="1024" walltimemax="06:00:00" default="true">batch</queue>
</queues>
</batch_system>

<batch_system MACH="chama" type="slurm" >
<directives>
<directive>--ntasks-per-node={{ tasks_per_node }}</directive>
</directives>
<queues>
<queue jobmin="1" jobmax="1024" walltimemax="06:00:00" default="true">ec</queue>
<queue jobmin="1" jobmax="1024" walltimemax="06:00:00" default="true">batch</queue>
</queues>
</batch_system>

<batch_system MACH="redsky" type="slurm" >
<directives>
<directive>--ntasks-per-node={{ tasks_per_node }}</directive>
</directives>
<batch_system MACH="ghost" type="slurm" >
<queues>
<queue jobmin="1" jobmax="1024" walltimemax="06:00:00" default="true">ec</queue>
<queue jobmin="1" jobmax="1024" walltimemax="06:00:00" default="true">batch</queue>
</queues>
</batch_system>


<batch_system MACH="mustang" type="moab" >
<directives>
<directive>-l nodes={{ num_nodes }}:ppn={{ tasks_per_node }}</directive>
Expand Down
5 changes: 3 additions & 2 deletions config/acme/machines/config_compilers.xml
Original file line number Diff line number Diff line change
Expand Up @@ -702,18 +702,19 @@ for mct, etc.
<ALBANY_PATH>/projects/ccsm/AlbanyTrilinos_06262017/Albany/build/install</ALBANY_PATH>
</compiler>

<compiler COMPILER="intel" MACH="redsky">
<compiler COMPILER="intel" MACH="ghost">
<ADD_FFLAGS DEBUG="FALSE"> -O2 </ADD_FFLAGS>
<ADD_CFLAGS DEBUG="FALSE"> -O2 </ADD_CFLAGS>
<NETCDF_PATH>$(NETCDFROOT)</NETCDF_PATH>
<PNETCDF_PATH>$(PNETCDFROOT)</PNETCDF_PATH>
<MPI_PATH MPILIB="openmpi">/opt/openmpi-1.8-intel</MPI_PATH>
<ESMF_LIBDIR>/projects/ccsm/esmf-6.3.0rp1/lib/libO/Linux.intel.64.openmpi.default</ESMF_LIBDIR>
<CONFIG_ARGS> --host=Linux </CONFIG_ARGS>
<ADD_SLIBS> $(shell $(NETCDF_PATH)/bin/nf-config --flibs) -L/projects/ccsm/BLAS-intel -lblas_LINUX</ADD_SLIBS>
<PIO_FILESYSTEM_HINTS>lustre </PIO_FILESYSTEM_HINTS>
<ADD_SLIBS MPILIB="openmpi"> -mkl=cluster </ADD_SLIBS>
<ADD_SLIBS MPILIB="mpi-serial"> -mkl </ADD_SLIBS>
<ALBANY_PATH>/projects/ccsm/AlbanyTrilinos/Albany/build/install</ALBANY_PATH>
<ALBANY_PATH>/projects/ccsm/AlbanyTrilinos_06262017/Albany/build/install</ALBANY_PATH>
</compiler>

<compiler COMPILER="gnu" MACH="penn">
Expand Down
73 changes: 69 additions & 4 deletions config/acme/machines/config_machines.xml
Original file line number Diff line number Diff line change
Expand Up @@ -708,9 +708,8 @@
<mpirun mpilib="default">
<executable>mpiexec</executable>
<arguments>
<arg name="bind"> --bind-to-core</arg>
<arg name="num_tasks"> --n $TOTALPES</arg>
<arg name="tasks_per_node"> --npernode $PES_PER_NODE</arg>
<arg name="tasks_per_node"> --map-by ppr:{{ tasks_per_numa }}:socket:PE=$ENV{OMP_NUM_THREADS} --bind-to core</arg>
</arguments>
</mpirun>
<mpirun mpilib="mpi-serial">
Expand Down Expand Up @@ -776,9 +775,8 @@
<mpirun mpilib="default">
<executable>mpiexec</executable>
<arguments>
<arg name="bind"> --bind-to-core</arg>
<arg name="num_tasks"> --n $TOTALPES</arg>
<arg name="tasks_per_node"> --npernode $PES_PER_NODE</arg>
<arg name="tasks_per_node"> --map-by ppr:{{ tasks_per_numa }}:socket:PE=$ENV{OMP_NUM_THREADS} --bind-to core</arg>
</arguments>
</mpirun>
<mpirun mpilib="mpi-serial">
Expand Down Expand Up @@ -827,6 +825,73 @@
</environment_variables>
</machine>

<!--
  Machine entry for "ghost": identification, directory layout, batch and
  node settings, MPI launch command, module setup and environment variables.
-->
<machine MACH="ghost">
  <!-- Identification and supported build configurations -->
  <DESC>SNL clust</DESC>
  <NODENAME_REGEX>ghost-login</NODENAME_REGEX>
  <PROXY>wwwproxy.sandia.gov:80</PROXY>
  <TESTS>acme_integration</TESTS>
  <COMPILERS>intel</COMPILERS>
  <MPILIBS>openmpi,mpi-serial</MPILIBS>
  <OS>LINUX</OS>

  <!-- Case output lives under the per-user scratch area; run and build
       directories are derived from CIME_OUTPUT_ROOT and the case name -->
  <CIME_OUTPUT_ROOT>/gscratch/$USER/acme_scratch/ghost</CIME_OUTPUT_ROOT>
  <RUNDIR>$CIME_OUTPUT_ROOT/$CASE/run</RUNDIR>
  <EXEROOT>$CIME_OUTPUT_ROOT/$CASE/bld</EXEROOT>

  <!-- Shared input data under /projects/ccsm -->
  <DIN_LOC_ROOT>/projects/ccsm/inputdata</DIN_LOC_ROOT>
  <DIN_LOC_ROOT_CLMFORC>/projects/ccsm/inputdata/atm/datm7</DIN_LOC_ROOT_CLMFORC>

  <!-- Full path of the short term archiving directory -->
  <DOUT_S_ROOT>$CIME_OUTPUT_ROOT/archive/$CASE</DOUT_S_ROOT>
  <!-- Full path of the long term archiving directory -->
  <DOUT_L_MSROOT>USERDEFINED_optional_run</DOUT_L_MSROOT>

  <BASELINE_ROOT>/projects/ccsm/ccsm_baselines</BASELINE_ROOT>
  <!-- Location of the cprnc tool that compares netcdf history files during testing -->
  <CCSM_CPRNC>/projects/ccsm/cprnc/build.toss3/cprnc_wrap</CCSM_CPRNC>

  <!-- Batch system, support contact and build parallelism -->
  <BATCH_SYSTEM>slurm</BATCH_SYSTEM>
  <SUPPORTED_BY>jgfouca at sandia dot gov</SUPPORTED_BY>
  <GMAKE_J>8</GMAKE_J>

  <!-- Node geometry: both limits are set to 36 -->
  <MAX_TASKS_PER_NODE>36</MAX_TASKS_PER_NODE>
  <PES_PER_NODE>36</PES_PER_NODE>
  <PIO_BUFFER_SIZE_LIMIT>1</PIO_BUFFER_SIZE_LIMIT>

  <!-- A project is mandatory on this machine; fy150001 is the default -->
  <PROJECT_REQUIRED>TRUE</PROJECT_REQUIRED>
  <PROJECT>fy150001</PROJECT>

  <!-- Launcher for every MPI library without a more specific entry below.
       The tasks_per_node argument passes a map-by "ppr" spec built from
       tasks_per_numa per socket with PE taken from OMP_NUM_THREADS, and
       binds to core. NOTE(review): presumably this keeps threaded runs from
       oversubscribing a node; confirm against the Open MPI mpirun docs. -->
  <mpirun mpilib="default">
    <executable>mpiexec</executable>
    <arguments>
      <arg name="num_tasks"> --n $TOTALPES</arg>
      <arg name="tasks_per_node"> --map-by ppr:{{ tasks_per_numa }}:socket:PE=$ENV{OMP_NUM_THREADS} --bind-to core</arg>
    </arguments>
  </mpirun>

  <!-- mpi-serial entry: the executable element is intentionally left empty -->
  <mpirun mpilib="mpi-serial">
    <executable></executable>
  </mpirun>

  <!-- Lmod based module system: init scripts and command paths per language -->
  <module_system type="module">
    <init_path lang="python">/usr/share/lmod/lmod/init/python.py</init_path>
    <init_path lang="perl">/usr/share/lmod/lmod/init/perl.pm</init_path>
    <init_path lang="sh">/usr/share/lmod/lmod/init/sh</init_path>
    <init_path lang="csh">/usr/share/lmod/lmod/init/csh</init_path>
    <cmd_path lang="python">/usr/share/lmod/lmod/libexec/lmod python</cmd_path>
    <cmd_path lang="perl">/usr/share/lmod/lmod/libexec/lmod perl</cmd_path>
    <cmd_path lang="csh">module</cmd_path>
    <cmd_path lang="sh">module</cmd_path>

    <!-- Commands are listed in order: purge first, then the loads.
         The openmpi module carries the mpilib="!mpi-serial" selector
         (presumably skipped for mpi-serial builds; verify against CIME). -->
    <modules>
      <command name="purge"/>
      <command name="load">sems-env</command>
      <command name="load">sems-git</command>
      <command name="load">sems-python/2.7.9</command>
      <command name="load">sems-cmake</command>
      <command name="load">gnu/4.9.2</command>
      <command name="load">sems-intel/16.0.2</command>
      <command name="load" mpilib="!mpi-serial">sems-openmpi/1.10.5</command>
      <command name="load">mkl/16.0</command>
      <command name="load">sems-netcdf/4.4.1/exo_parallel</command>
    </modules>
  </module_system>

  <!-- NetCDF locations all derive from SEMS_NETCDF_ROOT; PNETCDFROOT uses the
       same root and carries the mpilib="!mpi-serial" selector -->
  <environment_variables>
    <env name="NETCDFROOT">$ENV{SEMS_NETCDF_ROOT}</env>
    <env name="PNETCDFROOT" mpilib="!mpi-serial">$ENV{SEMS_NETCDF_ROOT}</env>
    <env name="NETCDF_INCLUDES">$ENV{SEMS_NETCDF_ROOT}/include</env>
    <env name="NETCDF_LIBS">$ENV{SEMS_NETCDF_ROOT}/lib</env>
    <env name="OMP_STACKSIZE">64M</env>
  </environment_variables>
</machine>

<machine MACH="blues">
<DESC>ANL/LCRC Linux Cluster</DESC>
<NODENAME_REGEX>blogin.*.lcrc.anl.gov</NODENAME_REGEX>
Expand Down

0 comments on commit 57d0825

Please sign in to comment.