From 5bc3792231cc5f6d3673eabe6d678a45d1747121 Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Tue, 18 Jun 2024 12:50:33 +0200 Subject: [PATCH] Don't immediately resume _monitor_active_jobs on exception We got a bunch of the errors below on rockfish, and I don't think we're helping ourselves by retrying os.listdir every 5ms while the system is already out of file descriptors: ``` 2024-06-11 12:42:09,485 ERROR [pulsar.managers.stateful][[manager=rockfish]-[action=monitor]] Failure in stateful manager monitor step. Traceback (most recent call last): File "/data/nekrut/galaxy/main/pulsar/venv/lib/python3.9/site-packages/pulsar/managers/stateful.py", line 364, in _run self._monitor_active_jobs() File "/data/nekrut/galaxy/main/pulsar/venv/lib/python3.9/site-packages/pulsar/managers/stateful.py", line 369, in _monitor_active_jobs active_job_ids = self.stateful_manager.active_jobs.active_job_ids() File "/data/nekrut/galaxy/main/pulsar/venv/lib/python3.9/site-packages/pulsar/managers/stateful.py", line 310, in active_job_ids job_ids = os.listdir(target_directory) OSError: [Errno 23] Too many open files in system: '/scratch4/nekrut/galaxy/main/pulsar/var/rockfish-active-jobs' 2024-06-11 12:42:09,489 ERROR [pulsar.managers.stateful][[manager=rockfish]-[action=monitor]] Failure in stateful manager monitor step. 
Traceback (most recent call last): File "/data/nekrut/galaxy/main/pulsar/venv/lib/python3.9/site-packages/pulsar/managers/stateful.py", line 364, in _run self._monitor_active_jobs() File "/data/nekrut/galaxy/main/pulsar/venv/lib/python3.9/site-packages/pulsar/managers/stateful.py", line 369, in _monitor_active_jobs active_job_ids = self.stateful_manager.active_jobs.active_job_ids() File "/data/nekrut/galaxy/main/pulsar/venv/lib/python3.9/site-packages/pulsar/managers/stateful.py", line 310, in active_job_ids job_ids = os.listdir(target_directory) OSError: [Errno 23] Too many open files in system: '/scratch4/nekrut/galaxy/main/pulsar/var/rockfish-active-jobs' 2024-06-11 12:42:09,494 ERROR [pulsar.managers.stateful][[manager=rockfish]-[action=monitor]] Failure in stateful manager monitor step. Traceback (most recent call last): File "/data/nekrut/galaxy/main/pulsar/venv/lib/python3.9/site-packages/pulsar/managers/stateful.py", line 364, in _run self._monitor_active_jobs() File "/data/nekrut/galaxy/main/pulsar/venv/lib/python3.9/site-packages/pulsar/managers/stateful.py", line 369, in _monitor_active_jobs active_job_ids = self.stateful_manager.active_jobs.active_job_ids() File "/data/nekrut/galaxy/main/pulsar/venv/lib/python3.9/site-packages/pulsar/managers/stateful.py", line 310, in active_job_ids job_ids = os.listdir(target_directory) OSError: [Errno 23] Too many open files in system: '/scratch4/nekrut/galaxy/main/pulsar/var/rockfish-active-jobs' ``` --- pulsar/managers/stateful.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pulsar/managers/stateful.py b/pulsar/managers/stateful.py index 2f32ce1c..d8f0e362 100644 --- a/pulsar/managers/stateful.py +++ b/pulsar/managers/stateful.py @@ -370,6 +370,9 @@ def _run(self): self._monitor_active_jobs() except Exception: log.exception("Failure in stateful manager monitor step.") + # This should hopefully be a rare event. 
+ # Let's not hammer the system with job lookups + time.sleep(1) def _monitor_active_jobs(self): active_job_ids = self.stateful_manager.active_jobs.active_job_ids()