From b091fb2c97966854d55e16d34806732bf259840e Mon Sep 17 00:00:00 2001 From: James Foucar Date: Mon, 1 May 2017 14:53:50 -0600 Subject: [PATCH 1/6] Update queue selection to take walltime into account Adds concept of strict walltime --- config/acme/machines/config_batch.xml | 10 +-- config/xml_schemas/config_batch.xsd | 3 +- scripts/create_test | 2 +- scripts/lib/CIME/XML/env_batch.py | 94 +++++++++++++++++++-------- 4 files changed, 75 insertions(+), 34 deletions(-) diff --git a/config/acme/machines/config_batch.xml b/config/acme/machines/config_batch.xml index aa9abfb4e3a..50e909b52ee 100644 --- a/config/acme/machines/config_batch.xml +++ b/config/acme/machines/config_batch.xml @@ -158,7 +158,7 @@ -l nodes={{ num_nodes }}:ppn={{ tasks_per_node }} - shared + shared batch @@ -177,8 +177,8 @@ + debug regular - debug @@ -198,8 +198,8 @@ --constraint=haswell + debug regular - debug @@ -208,8 +208,8 @@ --constraint=knl,quad,cache + debug regular - debug @@ -329,8 +329,8 @@ -l nodes={{ num_nodes }} + debug batch - debug diff --git a/config/xml_schemas/config_batch.xsd b/config/xml_schemas/config_batch.xsd index da8e028a5b3..ad810c02870 100644 --- a/config/xml_schemas/config_batch.xsd +++ b/config/xml_schemas/config_batch.xsd @@ -69,7 +69,7 @@ @@ -130,6 +130,7 @@ + diff --git a/scripts/create_test b/scripts/create_test index 3ed88253cce..f55dfe47131 100755 --- a/scripts/create_test +++ b/scripts/create_test @@ -420,7 +420,7 @@ def single_submit_impl(machine_name, test_id, proc_pool, project, args, job_cost else: wall_time_bab = wall_time - queue = env_batch.select_best_queue(proc_pool) + queue = env_batch.select_best_queue(proc_pool, wall_time_bab) wall_time_max_bab = env_batch.get_max_walltime(queue) if wall_time_max_bab is not None: wall_time_max = convert_to_seconds(wall_time_max_bab) diff --git a/scripts/lib/CIME/XML/env_batch.py b/scripts/lib/CIME/XML/env_batch.py index 68ba5fde7b2..78ebfe39fee 100644 --- a/scripts/lib/CIME/XML/env_batch.py +++ b/scripts/lib/CIME/XML/env_batch.py @@ -7,7 +7,7 @@ from CIME.XML.standard_module_setup import * from CIME.utils import format_time from CIME.XML.env_base import EnvBase -from CIME.utils import transform_vars, get_cime_root +from CIME.utils import transform_vars, get_cime_root, convert_to_seconds from copy import deepcopy logger = logging.getLogger(__name__) @@ -195,18 +195,31 @@ def set_job_defaults(self, batch_jobs, pesize=None, walltime=None, force_queue=N else: task_count = int(task_count) - queue = force_queue if force_queue is not None else self.select_best_queue(task_count, job) - self.set_value("JOB_QUEUE", queue, subgroup=job) - - walltime = self.get_max_walltime(queue) if walltime is None else walltime - if walltime is None: - logger.warn("Could not find a queue matching task count %d, falling back to deprecated default walltime parameter"%task_count) - #if the user names a queue which is not defined in config_batch.xml and does not set a - #walltime, fall back to the max walltime in the default queue - if force_queue: - self.get_default_queue() - walltime = self._default_walltime + if force_queue: + if not self.queue_meets_spec(force_queue, task_count, walltime=walltime, job=job): + logger.warning("User-request queue '%s' does not meet requirements for job '%s'" % (force_queue, job)) + else: + queue = self.select_best_queue(task_count, walltime=walltime, job=job) + if queue is None and walltime is not None: + # Try to see if walltime was the holdup + queue = self.select_best_queue(task_count, walltime=None, job=job) + if queue is not None: + # It was, override the walltime to avoid failure + new_walltime = self.get_queue_specs(queue)[3] + expect(new_walltime is not None, "Should never make it here") + logger.warning("Requested walltime '%s' could not be matched by any queue, using '%s' instead" % (walltime, new_walltime)) + walltime = new_walltime + + if queue is None: + logger.warning("No queue on this system met the requirements for this job. Falling back to defaults") + default_queue_node = self.get_default_queue() + queue = default_queue_node.text + walltime = self.get_queue_specs(queue)[3] + + walltime = self.get_queue_specs(queue)[3] if walltime is None else walltime + walltime = self._default_walltime if walltime is None else walltime # last-chance fallback + self.set_value("JOB_QUEUE", queue, subgroup=job) self.set_value("JOB_WALLCLOCK_TIME", walltime, subgroup=job) logger.debug("Job %s queue %s walltime %s" % (job, queue, walltime)) @@ -362,7 +375,6 @@ def _submit_single_job(self, case, job, depid=None, no_batch=False, batch_args=N function_name = job.replace(".", "_") if not dry_run: - function_name = job.replace(".", "_") locals()[function_name](case) return @@ -418,36 +430,64 @@ def get_job_id(self, output): jobid = search_match.group(1) return jobid - def select_best_queue(self, num_pes, job=None): + def queue_meets_spec(self, queue, num_pes, walltime=None, job=None): + jobmin, jobmax, jobname, walltimemax, strict = self.get_queue_specs(queue) + + # A job name match automatically meets spec + if job is not None and jobname is not None: + return jobname == job + + if jobmin is not None and num_pes < int(jobmin): + return False + + if jobmax is not None and num_pes > int(jobmax): + return False + + if walltime is not None and walltimemax is not None and strict: + walltime_s = convert_to_seconds(walltime) + walltimemax_s = convert_to_seconds(walltimemax) + if walltime_s > walltimemax_s: + return False + + return True + + def select_best_queue(self, num_pes, walltime=None, job=None): # Make sure to check default queue first. all_queues = [] all_queues.append( self.get_default_queue()) all_queues = all_queues + self.get_all_queues() for queue in all_queues: if queue is not None: - jobmin = queue.get("jobmin") - jobmax = queue.get("jobmax") - jobname = queue.get("jobname") - if jobname is not None: - if job == jobname: - return queue.text - # if the fullsum is between the min and max # jobs, then use this queue. - elif jobmin is not None and jobmax is not None and num_pes >= int(jobmin) and num_pes <= int(jobmax): - return queue.text + qname = queue.text + if self.queue_meets_spec(qname, num_pes, walltime=walltime, job=job): + return qname + return None - def get_max_walltime(self, queue): + def get_queue_specs(self, queue): + """ + Get queue specifications by name. + + Returns (jobmin, jobmax, jobname, walltimemax, is_strict) + """ for queue_node in self.get_all_queues(): if queue_node.text == queue: - return queue_node.get("walltimemax") + jobmin = queue.get("jobmin") + jobmax = queue.get("jobmax") + jobname = queue.get("jobname") + walltimemax = queue.get("walltimemax") + strict = queue.get("strict") == "true" + + return jobmin, jobmax, jobname, walltimemax, strict + + expect(False, "Queue '%s' is unknown to this system" % queue) def get_default_queue(self): node = self.get_optional_node("queue", attributes={"default" : "true"}) if node is None: node = self.get_optional_node("queue") expect(node is not None, "No queues found") - self._default_walltime = node.get("walltimemax") - return(node) + return node def get_all_queues(self): return self.get_nodes("queue") From 0a70675ba4a0ef73c6e86c88fcf80ae63c8f4999 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Mon, 1 May 2017 17:13:59 -0600 Subject: [PATCH 2/6] Minor pylint fix --- scripts/lib/CIME/case_setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/lib/CIME/case_setup.py b/scripts/lib/CIME/case_setup.py index f17ef584e07..dd815e39dce 100644 --- a/scripts/lib/CIME/case_setup.py +++ b/scripts/lib/CIME/case_setup.py @@ -155,7 +155,6 @@ def _case_setup_impl(case, caseroot, clean=False, test_mode=False, reset=False): logger.debug("at update TOTALPES = %s"%pestot) case.set_value("TOTALPES", pestot) thread_count = env_mach_pes.get_max_thread_count(models) - build_threaded = case.get_build_threaded() cost_pes = env_mach_pes.get_cost_pes(pestot, thread_count, machine=case.get_value("MACH")) case.set_value("COST_PES", cost_pes) From 0e3cc7651e21201245ccaf4150415c6053a61520 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Mon, 1 May 2017 17:23:44 -0600 Subject: [PATCH 3/6] Bug fixes --- scripts/lib/CIME/XML/env_batch.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/lib/CIME/XML/env_batch.py b/scripts/lib/CIME/XML/env_batch.py index 78ebfe39fee..8e36bd4ea2e 100644 --- a/scripts/lib/CIME/XML/env_batch.py +++ b/scripts/lib/CIME/XML/env_batch.py @@ -472,11 +472,11 @@ def get_queue_specs(self, queue): """ for queue_node in self.get_all_queues(): if queue_node.text == queue: - jobmin = queue.get("jobmin") - jobmax = queue.get("jobmax") - jobname = queue.get("jobname") - walltimemax = queue.get("walltimemax") - strict = queue.get("strict") == "true" + jobmin = queue_node.get("jobmin") + jobmax = queue_node.get("jobmax") + jobname = queue_node.get("jobname") + walltimemax = queue_node.get("walltimemax") + strict = queue_node.get("strict") == "true" return jobmin, jobmax, jobname, walltimemax, strict From 4379cad13a73cab5dd8467b0e90d1dcf2e15b8b5 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Tue, 2 May 2017 10:49:10 -0600 Subject: [PATCH 4/6] Do not override walltime unless test --- scripts/create_test | 1 - scripts/lib/CIME/XML/env_batch.py | 16 ++++++++++------ scripts/lib/CIME/case.py | 3 ++- scripts/lib/CIME/test_scheduler.py | 1 - 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/scripts/create_test b/scripts/create_test index f55dfe47131..2b8643101b5 100755 --- a/scripts/create_test +++ b/scripts/create_test @@ -20,7 +20,6 @@ import argparse, math, glob logger = logging.getLogger(__name__) - ############################################################################### def parse_command_line(args, description): ############################################################################### diff --git a/scripts/lib/CIME/XML/env_batch.py b/scripts/lib/CIME/XML/env_batch.py index 8e36bd4ea2e..6385be121b2 100644 --- a/scripts/lib/CIME/XML/env_batch.py +++ b/scripts/lib/CIME/XML/env_batch.py @@ -181,7 +181,7 @@ def make_batch_script(self, input_template, job, case, total_tasks, tasks_per_no fd.write(output_text) os.chmod(job, os.stat(job).st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH) - def set_job_defaults(self, batch_jobs, pesize=None, walltime=None, force_queue=None): + def set_job_defaults(self, batch_jobs, pesize=None, walltime=None, force_queue=None, allow_walltime_override=False): if self.batchtype is None: self.batchtype = self.get_batch_system_type() @@ -197,21 +197,25 @@ def set_job_defaults(self, batch_jobs, pesize=None, walltime=None, force_queue=N if force_queue: if not self.queue_meets_spec(force_queue, task_count, walltime=walltime, job=job): - logger.warning("User-request queue '%s' does not meet requirements for job '%s'" % (force_queue, job)) + logger.warning("WARNING: User-requested queue '%s' does not meet requirements for job '%s'" % (force_queue, job)) else: queue = self.select_best_queue(task_count, walltime=walltime, job=job) if queue is None and walltime is not None: # Try to see if walltime was the holdup queue = self.select_best_queue(task_count, walltime=None, job=job) if queue is not None: - # It was, override the walltime to avoid failure + # It was, override the walltime if a test, otherwise just warn the user new_walltime = self.get_queue_specs(queue)[3] expect(new_walltime is not None, "Should never make it here") - logger.warning("Requested walltime '%s' could not be matched by any queue, using '%s' instead" % (walltime, new_walltime)) - walltime = new_walltime + logger.warning("WARNING: Requested walltime '%s' could not be matched by any queue" % walltime) + if allow_walltime_override: + logger.warning(" Using walltime '%s' instead" % new_walltime) + walltime = new_walltime + else: + logger.warning(" Continuing with suspect walltime, batch submission may fail") if queue is None: - logger.warning("No queue on this system met the requirements for this job. Falling back to defaults") + logger.warning("WARNING: No queue on this system met the requirements for this job. Falling back to defaults") default_queue_node = self.get_default_queue() queue = default_queue_node.text walltime = self.get_queue_specs(queue)[3] diff --git a/scripts/lib/CIME/case.py b/scripts/lib/CIME/case.py index 9e143418fd5..07d8fde0d60 100644 --- a/scripts/lib/CIME/case.py +++ b/scripts/lib/CIME/case.py @@ -741,7 +741,7 @@ def configure(self, compset_name, grid_name, machine_name=None, env_batch.set_batch_system(batch, batch_system_type=batch_system_type) env_batch.create_job_groups(bjobs) - env_batch.set_job_defaults(bjobs, pesize=maxval, walltime=walltime, force_queue=queue) + env_batch.set_job_defaults(bjobs, pesize=maxval, walltime=walltime, force_queue=queue, allow_walltime_override=test) self.schedule_rewrite(env_batch) #-------------------------------------------- @@ -810,6 +810,7 @@ def configure(self, compset_name, grid_name, machine_name=None, if model == "cesm" and not test: self.set_value("DOUT_S",True) self.set_value("TIMER_LEVEL", 4) + if test: self.set_value("TEST",True) diff --git a/scripts/lib/CIME/test_scheduler.py b/scripts/lib/CIME/test_scheduler.py index 7de420c615f..a884e17550d 100644 --- a/scripts/lib/CIME/test_scheduler.py +++ b/scripts/lib/CIME/test_scheduler.py @@ -405,7 +405,6 @@ def _create_newcase_phase(self, test): create_newcase_cmd += " --mpilib %s" % self._mpilib logger.debug (" MPILIB set to %s" % self._mpilib) - if self._queue is not None: create_newcase_cmd += " --queue=%s" % self._queue From dc84643fde623a3f8e59103b06813967dc524ff2 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Tue, 2 May 2017 12:01:35 -0600 Subject: [PATCH 5/6] Add tests --- scripts/tests/scripts_regression_tests.py | 84 +++++++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/scripts/tests/scripts_regression_tests.py b/scripts/tests/scripts_regression_tests.py index 7dac780e46d..535d6cf4899 100755 --- a/scripts/tests/scripts_regression_tests.py +++ b/scripts/tests/scripts_regression_tests.py @@ -1402,6 +1402,90 @@ def test_cime_case_xmlchange_append(self): result = run_cmd_assert_result(self, "./xmlquery --value PIO_CONFIG_OPTS", from_dir=casedir) self.assertEqual(result, "-opt1 -opt2") + ########################################################################### + def test_cime_case_test_walltime_mgmt_1(self): + ########################################################################### + if CIME.utils.get_model() != "acme": + self.skipTest("Skipping walltime test. Depends on ACME batch settings") + + test_name = "ERS.f19_g16_rx1.A" + machine, compiler = "blues", "gnu" + run_cmd_assert_result(self, "unset CIME_GLOBAL_WALLTIME && %s/create_test --no-setup --machine %s %s -t %s --test-root %s --output-root %s" % + (SCRIPT_DIR, machine, test_name, self._baseline_name, self._testroot, self._testroot)) + + casedir = os.path.join(self._testroot, + "%s.%s" % (CIME.utils.get_full_test_name(test_name, machine=machine, compiler=compiler), self._baseline_name)) + self.assertTrue(os.path.isdir(casedir), msg="Missing casedir '%s'" % casedir) + + result = run_cmd_assert_result(self, "./xmlquery JOB_WALLCLOCK_TIME --subgroup=case.test --value", from_dir=casedir) + self.assertEqual(result, "0:10:00") + + result = run_cmd_assert_result(self, "./xmlquery JOB_QUEUE --subgroup=case.test --value", from_dir=casedir) + self.assertEqual(result, "shared") + + ########################################################################### + def test_cime_case_test_walltime_mgmt_2(self): + ########################################################################### + if CIME.utils.get_model() != "acme": + self.skipTest("Skipping walltime test. Depends on ACME batch settings") + + test_name = "ERS_P64.f19_g16_rx1.A" + machine, compiler = "blues", "gnu" + run_cmd_assert_result(self, "unset CIME_GLOBAL_WALLTIME && %s/create_test --no-setup --machine %s %s -t %s --test-root %s --output-root %s" % + (SCRIPT_DIR, machine, test_name, self._baseline_name, self._testroot, self._testroot)) + + casedir = os.path.join(self._testroot, + "%s.%s" % (CIME.utils.get_full_test_name(test_name, machine=machine, compiler=compiler), self._baseline_name)) + self.assertTrue(os.path.isdir(casedir), msg="Missing casedir '%s'" % casedir) + + result = run_cmd_assert_result(self, "./xmlquery JOB_WALLCLOCK_TIME --subgroup=case.test --value", from_dir=casedir) + self.assertEqual(result, "03:00:00") + + result = run_cmd_assert_result(self, "./xmlquery JOB_QUEUE --subgroup=case.test --value", from_dir=casedir) + self.assertEqual(result, "batch") + + ########################################################################### + def test_cime_case_test_walltime_mgmt_3(self): + ########################################################################### + if CIME.utils.get_model() != "acme": + self.skipTest("Skipping walltime test. Depends on ACME batch settings") + + test_name = "ERS_P64.f19_g16_rx1.A" + machine, compiler = "blues", "gnu" + run_cmd_assert_result(self, "unset CIME_GLOBAL_WALLTIME && %s/create_test --no-setup --machine %s %s -t %s --test-root %s --output-root %s --walltime='0:10:00'" % + (SCRIPT_DIR, machine, test_name, self._baseline_name, self._testroot, self._testroot)) + + casedir = os.path.join(self._testroot, + "%s.%s" % (CIME.utils.get_full_test_name(test_name, machine=machine, compiler=compiler), self._baseline_name)) + self.assertTrue(os.path.isdir(casedir), msg="Missing casedir '%s'" % casedir) + + result = run_cmd_assert_result(self, "./xmlquery JOB_WALLCLOCK_TIME --subgroup=case.test --value", from_dir=casedir) + self.assertEqual(result, "0:10:00") + + result = run_cmd_assert_result(self, "./xmlquery JOB_QUEUE --subgroup=case.test --value", from_dir=casedir) + self.assertEqual(result, "batch") # Not smart enough to select faster queue + + ########################################################################### + def test_cime_case_test_walltime_mgmt_4(self): + ########################################################################### + if CIME.utils.get_model() != "acme": + self.skipTest("Skipping walltime test. Depends on ACME batch settings") + + test_name = "ERS_P1.f19_g16_rx1.A" + machine, compiler = "blues", "gnu" + run_cmd_assert_result(self, "unset CIME_GLOBAL_WALLTIME && %s/create_test --no-setup --machine %s %s -t %s --test-root %s --output-root %s --walltime='2:00:00'" % + (SCRIPT_DIR, machine, test_name, self._baseline_name, self._testroot, self._testroot)) + + casedir = os.path.join(self._testroot, + "%s.%s" % (CIME.utils.get_full_test_name(test_name, machine=machine, compiler=compiler), self._baseline_name)) + self.assertTrue(os.path.isdir(casedir), msg="Missing casedir '%s'" % casedir) + + result = run_cmd_assert_result(self, "./xmlquery JOB_WALLCLOCK_TIME --subgroup=case.test --value", from_dir=casedir) + self.assertEqual(result, "01:00:00") + + result = run_cmd_assert_result(self, "./xmlquery JOB_QUEUE --subgroup=case.test --value", from_dir=casedir) + self.assertEqual(result, "shared") + ############################################################################### class X_TestSingleSubmit(TestCreateTestCommon): ############################################################################### From 26f0ffc5f4df84cff947d2a890ecc7e1e585e597 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Tue, 2 May 2017 10:10:34 -0600 Subject: [PATCH 6/6] Fix single submit --- scripts/create_test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/create_test b/scripts/create_test index 2b8643101b5..a8a75ce4154 100755 --- a/scripts/create_test +++ b/scripts/create_test @@ -420,7 +420,7 @@ def single_submit_impl(machine_name, test_id, proc_pool, project, args, job_cost wall_time_bab = wall_time queue = env_batch.select_best_queue(proc_pool, wall_time_bab) - wall_time_max_bab = env_batch.get_max_walltime(queue) + wall_time_max_bab = env_batch.get_queue_specs(queue)[3] if wall_time_max_bab is not None: wall_time_max = convert_to_seconds(wall_time_max_bab) if wall_time_max < wall_time: