Skip to content

Commit

Permalink
Merge pull request #4640 from ESMCI/jgfouca/robust_thread_waiting
Browse files Browse the repository at this point in the history
More robust approach to waiting for many threads

Using threading.active_count() doesn't work if other threads are running in the process. Both jenkins_generic_job and wait_for_tests use threads, and the jenkins_generic_job archiver thread was causing wait_for_tests to keep waiting when it shouldn't have. @ndkeen discovered this was blocking our Jenkins reporting.

Instead, track the set of threads you want to wait on in a list.

The fix to wait_for_tests only impacts E3SM but I found a similar issue in build.py that could impact CESM.

Test suite: scripts_regression_tests wait_for_tests
Test baseline:
Test namelist changes:
Test status: bit for bit

Fixes [CIME Github issue #]

User interface changes?:

Update gh-pages html (Y/N)?:
  • Loading branch information
jgfouca authored Jun 11, 2024
2 parents 422ddaa + bb93451 commit 548b136
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 8 deletions.
6 changes: 4 additions & 2 deletions CIME/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,7 @@ def _build_model(
thread_bad_results = []
libroot = os.path.join(exeroot, "lib")
bldroot = None
bld_threads = []
for model, comp, nthrds, _, config_dir in complist:
if buildlist is not None and model.lower() not in buildlist:
continue
Expand Down Expand Up @@ -391,12 +392,13 @@ def _build_model(
),
)
t.start()
bld_threads.append(t)

logs.append(file_build)

# Wait for threads to finish
while threading.active_count() > 1:
time.sleep(1)
for bld_thread in bld_threads:
bld_thread.join()

expect(not thread_bad_results, "\n".join(thread_bad_results))

Expand Down
10 changes: 6 additions & 4 deletions CIME/tests/test_sys_cime_case.py
Original file line number Diff line number Diff line change
Expand Up @@ -509,8 +509,10 @@ def test_cime_case_test_walltime_mgmt_8(self):
if self._config.test_mode == "cesm":
self.skipTest("Skipping walltime test. Depends on E3SM batch settings")

test_name = "SMS_P25600.f19_g16_rx1.A"
machine, compiler = "theta", "gnu"
# Frontier has 56 MAX_MPITASKS_PER_NODE so 5600 should require 100 nodes
# which should land us in 6 hour queue
test_name = "SMS_P5600.f19_g16_rx1.A"
machine, compiler = "frontier", "gnu"
casedir = self._create_test(
[
"--no-setup",
Expand All @@ -528,12 +530,12 @@ def test_cime_case_test_walltime_mgmt_8(self):
"./xmlquery JOB_WALLCLOCK_TIME -N --subgroup=case.test --value",
from_dir=casedir,
)
self.assertEqual(result, "09:00:00")
self.assertEqual(result, "06:00:00")

result = self.run_cmd_assert_result(
"./xmlquery JOB_QUEUE -N --subgroup=case.test --value", from_dir=casedir
)
self.assertEqual(result, "default")
self.assertEqual(result, "batch")

def test_cime_case_test_custom_project(self):
test_name = "ERS_P1.f19_g16_rx1.A"
Expand Down
6 changes: 4 additions & 2 deletions CIME/wait_for_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -658,6 +658,7 @@ def wait_for_tests_impl(
###############################################################################
results = queue.Queue()

wft_threads = []
for test_path in test_paths:
t = threading.Thread(
target=wait_for_test,
Expand All @@ -675,9 +676,10 @@ def wait_for_tests_impl(
)
t.daemon = True
t.start()
wft_threads.append(t)

while threading.active_count() > 1:
time.sleep(1)
for wft_thread in wft_threads:
wft_thread.join()

test_results = {}
completed_test_paths = []
Expand Down

0 comments on commit 548b136

Please sign in to comment.