-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathslurmd.py
134 lines (109 loc) · 4.38 KB
/
slurmd.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!/usr/bin/python3
# Copyright 2023 Canonical Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Manage the internal slurmd daemon on Juju machines.
This module also provides a wrapper for starting the slurmd service using systemd.
"""
import datetime
import logging
import os
import shlex
import subprocess
import sys
import textwrap
from pathlib import Path
import charms.operator_libs_linux.v1.systemd as systemd # type: ignore [import-untyped]
_logger = logging.getLogger(__name__)
def start() -> None:
"""Start slurmd service."""
systemd.service_start("slurmd")
def stop() -> None:
"""Stop slurmd service."""
systemd.service_stop("slurmd")
def restart() -> None:
"""Restart slurmd service."""
systemd.service_restart("slurmd")
def override_default(host: str) -> None:
"""Override the /etc/default/slurmd file.
Args:
host: Hostname of slurmctld service.
"""
_logger.debug("Overriding /etc/default/slurmd.")
Path("/etc/default/slurmd").write_text(
textwrap.dedent(
f"""
SLURMD_OPTIONS="--conf-server {host}:6817"
PYTHONPATH={Path.cwd() / "lib"}
"""
).strip()
)
def override_service() -> None:
"""Override the default slurmd systemd service file.
Notes:
This method makes an invokes `systemd daemon-reload` after writing
the overrides.conf file for slurmd. This invocation will reload
all systemd units on the machine.
"""
_logger.debug("Overriding default slurmd service file")
if not (override_dir := Path("/etc/systemd/system/slurmd.service.d")).is_dir():
override_dir.mkdir()
overrides = override_dir / "99-slurmd-charm.conf"
overrides.write_text(
textwrap.dedent(
f"""
[Unit]
ConditionPathExists=
[Service]
Type=forking
ExecStart=
ExecStart=/usr/bin/python3 {__file__}
LimitMEMLOCK=infinity
LimitNOFILE=1048576
TimeoutSec=900
"""
).strip()
)
systemd.daemon_reload()
def _start_slurmd_service() -> None:
"""Start the slurmd service on machine.
This method is invoked when the slurmd utils module is executed as a runnable
script. The method wraps the command traditionally executed by systemd when
the slurmd service is started with `systemctl start slurmd`. The difference
is that this method attempts to start the slurmd daemon over the course of
15 minutes since the slurm.conf file may not be ready on the slurmctld server.
"""
# Environment variables must be expanded here because the Popen
# command will fail to execute if the inline environment variable
# has a whitespace-delimited value. $SLURMD_OPTIONS is white-space
# delimited, so it is expanded here to simplify the call to Popen.
slurmd_cmd = shlex.split(os.path.expandvars("/usr/sbin/slurmd -D -s $SLURMD_OPTIONS"))
# Try to start slurmd with ~5 seconds in-between each attempt. Timeout after 15 minutes.
end_time = datetime.datetime.now() + datetime.timedelta(minutes=15)
while True:
if datetime.datetime.now() >= end_time:
_logger.error("Failed to start slurmd daemon. Timeout exceeded")
sys.exit(1)
else:
try:
process = subprocess.Popen(slurmd_cmd, env=os.environ)
# There needs to be a slight delay here to give slurmd
# enough time to fail if configuration file is not ready yet.
# If timeout is reached, this means the slurmd is running.
# If exit code is returned, this means slurmd failed and is not running.
process.wait(timeout=5)
except subprocess.TimeoutExpired:
_logger.info("Slurmd successfully started")
break
if __name__ == "__main__": # pragma: nocover
_start_slurmd_service()