Merge pull request #2034 from ESMCI/jgfouca/create_test_retry

Add retry capability to create_test Plus new regression test to exercise this capability. Test suite: scripts_regression_tests --fast Test baseline: Test namelist changes: Test status: bit for bit Fixes #1865 User interface changes?: Yes, new --retry option to create_test Update gh-pages html (Y/N)?: N Code review: @jedwards4b @billsacks
ESMCI · Nov 9, 2017 · 426f61c · 426f61c
2 parents 7c54009 + 93856e6
commit 426f61c
Show file tree

Hide file tree

Showing 2 changed files with 56 additions and 7 deletions.
diff --git a/scripts/create_test b/scripts/create_test
@@ -234,6 +234,9 @@ OR
                         "file. The file can follow either the config_pes.xml or "
                         "the env_mach_pes.xml format.")
 
+    parser.add_argument("--retry", type=int, default=0,
+                        help="Automatically retry failed tests. >0 implies --wait")
+
     args = CIME.utils.parse_args_and_handle_standard_logging_options(args, parser)
 
     # generate and compare flags may not point to the same directory
@@ -257,6 +260,9 @@ OR
     expect(not (args.namelists_only and not (args.generate or args.compare)),
            "Must provide either --compare or --generate with --namelists-only")
 
+    if args.retry > 0:
+        args.wait = True
+
     if args.parallel_jobs is not None:
         expect(args.parallel_jobs > 0,
                "Invalid value for parallel_jobs: %d" % args.parallel_jobs)
@@ -379,7 +385,7 @@ OR
         args.namelists_only, args.project, \
         args.test_id, args.parallel_jobs, args.walltime, \
         args.single_submit, args.proc_pool, args.use_existing, args.save_timing, args.queue, \
-        args.allow_baseline_overwrite, args.output_root, args.wait, args.force_procs, args.force_threads, args.mpilib, args.input_dir, args.pesfile
+        args.allow_baseline_overwrite, args.output_root, args.wait, args.force_procs, args.force_threads, args.mpilib, args.input_dir, args.pesfile, args.retry
 
 ###############################################################################
 def single_submit_impl(machine_name, test_id, proc_pool, _, args, job_cost_map, wall_time, test_root):
@@ -514,7 +520,7 @@ def create_test(test_names, test_data, compiler, machine_name, no_run, no_build,
         # Create submit script
         single_submit_impl(machine_name, test_id, proc_pool, project, sys.argv[1:], job_cost_map, walltime, test_root)
 
-    return 0 if success else CIME.utils.TESTS_FAILED_ERR_CODE
+    return success
 
 ###############################################################################
 def _main_func(description):
@@ -529,13 +535,24 @@ def _main_func(description):
     test_names, test_data, compiler, machine_name, no_run, no_build, no_setup, no_batch, \
     test_root, baseline_root, clean, baseline_cmp_name, baseline_gen_name, namelists_only, \
     project, test_id, parallel_jobs, walltime, single_submit, proc_pool, use_existing, \
-    save_timing, queue, allow_baseline_overwrite, output_root, wait, force_procs, force_threads, mpilib, input_dir, pesfile \
+    save_timing, queue, allow_baseline_overwrite, output_root, wait, force_procs, force_threads, mpilib, input_dir, pesfile, retry \
         = parse_command_line(sys.argv, description)
 
-    sys.exit(create_test(test_names, test_data, compiler, machine_name, no_run, no_build, no_setup, no_batch, test_root,
-                         baseline_root, clean, baseline_cmp_name, baseline_gen_name, namelists_only,
-                         project, test_id, parallel_jobs, walltime, single_submit, proc_pool, use_existing, save_timing,
-                         queue, allow_baseline_overwrite, output_root, wait, force_procs, force_threads, mpilib, input_dir, pesfile))
+    success = False
+    run_count = 0
+    while not success and run_count <= retry:
+        use_existing = use_existing if run_count == 0 else True
+        success = create_test(test_names, test_data, compiler, machine_name, no_run, no_build, no_setup, no_batch, test_root,
+                              baseline_root, clean, baseline_cmp_name, baseline_gen_name, namelists_only,
+                              project, test_id, parallel_jobs, walltime, single_submit, proc_pool, use_existing, save_timing,
+                              queue, allow_baseline_overwrite, output_root, wait, force_procs, force_threads, mpilib, input_dir, pesfile)
+        run_count += 1
+
+        # For testing only
+        os.environ["TESTBUILDFAIL_PASS"] = "True"
+        os.environ["TESTRUNFAIL_PASS"] = "True"
+
+    sys.exit(0 if success else CIME.utils.TESTS_FAILED_ERR_CODE)
 
 ###############################################################################
 

diff --git a/scripts/tests/scripts_regression_tests.py b/scripts/tests/scripts_regression_tests.py
@@ -1037,6 +1037,38 @@ def test_c_use_existing(self):
             assert_test_status(self, test_name, ts, SUBMIT_PHASE, TEST_PASS_STATUS)
             assert_test_status(self, test_name, ts, RUN_PHASE, TEST_PASS_STATUS)
 
+        del os.environ["TESTBUILDFAIL_PASS"]
+        del os.environ["TESTRUNFAIL_PASS"]
+
+        # test that passed tests are not re-run
+
+        ct2 = TestScheduler(tests, test_id=test_id, no_batch=NO_BATCH, use_existing=True,
+                            test_root=TEST_ROOT,output_root=TEST_ROOT,compiler=self._compiler,
+                            mpilib=TEST_MPILIB)
+
+        log_lvl = logging.getLogger().getEffectiveLevel()
+        logging.disable(logging.CRITICAL)
+        try:
+            ct2.run_tests()
+        finally:
+            logging.getLogger().setLevel(log_lvl)
+
+        self._wait_for_tests(test_id)
+
+        for test_status in test_statuses:
+            ts = TestStatus(test_dir=os.path.dirname(test_status))
+            test_name = ts.get_name()
+            assert_test_status(self, test_name, ts, MODEL_BUILD_PHASE, TEST_PASS_STATUS)
+            assert_test_status(self, test_name, ts, SUBMIT_PHASE, TEST_PASS_STATUS)
+            assert_test_status(self, test_name, ts, RUN_PHASE, TEST_PASS_STATUS)
+
+    ###########################################################################
+    def test_d_retry(self):
+    ###########################################################################
+        args = ["TESTBUILDFAIL_P1.f19_g16_rx1.A", "TESTRUNFAIL_P1.f19_g16_rx1.A", "TESTRUNPASS_P1.f19_g16_rx1.A", "--retry=1"]
+
+        self._create_test(args)
+
 ###############################################################################
 class P_TestJenkinsGenericJob(TestCreateTestCommon):
 ###############################################################################