From 787e7b05c23118224a1d1601350ce3523f1fcdb5 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Tue, 15 Aug 2017 14:17:46 -0600 Subject: [PATCH 1/2] Refactor how spare nodes get computed. The prior implementation was a bit too simple. Always giving the user 10 percent of nodes as spare nodes was way overkill for large jobs. The new implementation maxes out the number of spare nodes at 10. We also add a new variable to allow the user to pick the exact number of spare nodes that they want. --- config/acme/machines/config_machines.xml | 2 +- config/config_tests.xml | 2 +- scripts/lib/CIME/XML/env_mach_pes.py | 15 ++++++++++++++- src/drivers/mct/cime_config/config_component.xml | 14 +++++++++++--- 4 files changed, 27 insertions(+), 6 deletions(-) diff --git a/config/acme/machines/config_machines.xml b/config/acme/machines/config_machines.xml index b82eb059f7f..621b9311c50 100644 --- a/config/acme/machines/config_machines.xml +++ b/config/acme/machines/config_machines.xml @@ -1982,7 +1982,7 @@ acme 8 16 - 10 + TRUE TRUE cli115 -D PIO_BUILD_TIMING:BOOL=ON diff --git a/config/config_tests.xml b/config/config_tests.xml index a4b146f7b44..43b49fc6c39 100644 --- a/config/config_tests.xml +++ b/config/config_tests.xml @@ -442,7 +442,7 @@ NODEFAIL Tests restart upon detected node failure. 
Generates fake failu FALSE FALSE JGF FAKE NODE FAIL - 300 + 3 diff --git a/scripts/lib/CIME/XML/env_mach_pes.py b/scripts/lib/CIME/XML/env_mach_pes.py index 0f409b5df48..b81e2e2ea99 100644 --- a/scripts/lib/CIME/XML/env_mach_pes.py +++ b/scripts/lib/CIME/XML/env_mach_pes.py @@ -80,4 +80,17 @@ def get_total_nodes(self, total_tasks, max_thread_count): return num_nodes, self.get_spare_nodes(num_nodes) def get_spare_nodes(self, num_nodes): - return int(math.ceil(float(num_nodes) * (self.get_value("PCT_SPARE_NODES") / 100.0))) + force_spare_nodes = self.get_value("FORCE_SPARE_NODES") + if force_spare_nodes is not None: + return force_spare_nodes + + if self.get_value("ALLOCATE_SPARE_NODES"): + ten_pct = int(math.ceil(float(num_nodes) * 0.1)) + if ten_pct < 1: + return 1 # Always provide at least one spare node + elif ten_pct > 10: + return 10 # Never provide more than 10 spare nodes + else: + return ten_pct + else: + return 0 diff --git a/src/drivers/mct/cime_config/config_component.xml b/src/drivers/mct/cime_config/config_component.xml index e64e168927b..63949acd4b2 100644 --- a/src/drivers/mct/cime_config/config_component.xml +++ b/src/drivers/mct/cime_config/config_component.xml @@ -1878,12 +1878,20 @@ - + + logical + TRUE,FALSE + FALSE + mach_pes + env_mach_pes.xml + Allocate some spare nodes to handle node failures. 
The system will pick a reasonable number + + + integer - 0 mach_pes env_mach_pes.xml - Percent of extra spare nodes to allocate + Force this exact number of spare nodes to be allocated From f993df04e676093cebf314756ca52569fbeb0209 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Tue, 15 Aug 2017 17:43:40 -0500 Subject: [PATCH 2/2] fixes --- config/xml_schemas/env_mach_pes.xsd | 2 +- scripts/lib/CIME/XML/env_mach_pes.py | 2 +- src/drivers/mct/cime_config/config_component.xml | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/config/xml_schemas/env_mach_pes.xsd b/config/xml_schemas/env_mach_pes.xsd index b36a7d93b78..21ee5351e65 100644 --- a/config/xml_schemas/env_mach_pes.xsd +++ b/config/xml_schemas/env_mach_pes.xsd @@ -2,7 +2,7 @@ - + diff --git a/scripts/lib/CIME/XML/env_mach_pes.py b/scripts/lib/CIME/XML/env_mach_pes.py index b81e2e2ea99..1a4ab613dd1 100644 --- a/scripts/lib/CIME/XML/env_mach_pes.py +++ b/scripts/lib/CIME/XML/env_mach_pes.py @@ -81,7 +81,7 @@ def get_total_nodes(self, total_tasks, max_thread_count): def get_spare_nodes(self, num_nodes): force_spare_nodes = self.get_value("FORCE_SPARE_NODES") - if force_spare_nodes is not None: + if force_spare_nodes != -999: return force_spare_nodes if self.get_value("ALLOCATE_SPARE_NODES"): diff --git a/src/drivers/mct/cime_config/config_component.xml b/src/drivers/mct/cime_config/config_component.xml index 63949acd4b2..7cf1b73a5de 100644 --- a/src/drivers/mct/cime_config/config_component.xml +++ b/src/drivers/mct/cime_config/config_component.xml @@ -1889,6 +1889,7 @@ integer + -999 mach_pes env_mach_pes.xml Force this exact number of spare nodes to be allocated