-
Notifications
You must be signed in to change notification settings - Fork 199
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Test interchange exit on bad registration message (#3698)
On certain bad registration messages, the interchange should exit immediately. This tests that. See #3697 for some bad (cosmetic?) behaviour here - the interchange SIGABRTs on this code path rather than exiting cleanly, and this test includes a commented out assert that could check for clean exit (in addition to checking that the interchange process exits at all) ## Type of change - Code maintenance/cleanup
- Loading branch information
1 parent
6146e71
commit 537b504
Showing
1 changed file
with
120 additions
and
0 deletions.
There are no files selected for viewing
120 changes: 120 additions & 0 deletions
120
parsl/tests/test_htex/test_interchange_exit_bad_registration.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,120 @@ | ||
import json | ||
import logging | ||
import os | ||
import pickle | ||
import platform | ||
import subprocess | ||
import time | ||
|
||
import psutil | ||
import pytest | ||
import zmq | ||
|
||
import parsl.executors.high_throughput.zmq_pipes as zmq_pipes | ||
from parsl.executors.high_throughput.executor import DEFAULT_INTERCHANGE_LAUNCH_CMD | ||
from parsl.executors.high_throughput.manager_selector import RandomManagerSelector | ||
from parsl.version import VERSION as PARSL_VERSION | ||
|
||
P_ms = 10 | ||
|
||
|
||
@pytest.mark.local | ||
def test_exit_with_bad_registration(tmpd_cwd, try_assert): | ||
"""Test that the interchange exits when it receives a bad registration message. | ||
This complements parsl/tests/test_scaling/test_worker_interchange_bad_messages_3262.py | ||
which tests that the interchange is resistent to other forms of bad message. | ||
""" | ||
|
||
outgoing_q = zmq_pipes.TasksOutgoing( | ||
"127.0.0.1", (49152, 65535), None | ||
) | ||
incoming_q = zmq_pipes.ResultsIncoming( | ||
"127.0.0.1", (49152, 65535), None | ||
) | ||
command_client = zmq_pipes.CommandClient( | ||
"127.0.0.1", (49152, 65535), None | ||
) | ||
|
||
interchange_config = {"client_address": "127.0.0.1", | ||
"client_ports": (outgoing_q.port, | ||
incoming_q.port, | ||
command_client.port), | ||
"interchange_address": "127.0.0.1", | ||
"worker_ports": None, | ||
"worker_port_range": (50000, 60000), | ||
"hub_address": None, | ||
"hub_zmq_port": None, | ||
"logdir": tmpd_cwd, | ||
"heartbeat_threshold": 120, | ||
"poll_period": P_ms, | ||
"logging_level": logging.DEBUG, | ||
"cert_dir": None, | ||
"manager_selector": RandomManagerSelector(), | ||
"run_id": "test" | ||
} | ||
|
||
config_pickle = pickle.dumps(interchange_config) | ||
|
||
interchange_proc = subprocess.Popen(DEFAULT_INTERCHANGE_LAUNCH_CMD, stdin=subprocess.PIPE) | ||
stdin = interchange_proc.stdin | ||
assert stdin is not None, "Popen should have created an IO object (vs default None) because of PIPE mode" | ||
|
||
stdin.write(config_pickle) | ||
stdin.flush() | ||
stdin.close() | ||
|
||
# wait for interchange to be alive, by waiting for the command thread to become | ||
# responsive. if the interchange process didn't start enough to get the command | ||
# thread running, this will time out. | ||
|
||
(task_port, result_port) = command_client.run("WORKER_PORTS", timeout_s=120) | ||
|
||
# now we'll assume that if the interchange command thread is responding, | ||
# then the worker polling code is also running and that the interchange has | ||
# started successfully. | ||
|
||
# send bad registration message as if from a new worker pool. The badness here | ||
# is that the Python version does not match the real Python version - which | ||
# unlike some other bad interchange messages, should cause the interchange | ||
# to shut down. | ||
|
||
msg = {'type': 'registration', | ||
'parsl_v': PARSL_VERSION, | ||
'python_v': "{}.{}.{}".format(1, 1, 1), # this is the bad bit | ||
'worker_count': 1, | ||
'uid': 'testuid', | ||
'block_id': 0, | ||
'start_time': time.time(), | ||
'prefetch_capacity': 0, | ||
'max_capacity': 1, | ||
'os': platform.system(), | ||
'hostname': platform.node(), | ||
'dir': os.getcwd(), | ||
'cpu_count': psutil.cpu_count(logical=False), | ||
'total_memory': psutil.virtual_memory().total, | ||
} | ||
|
||
# connect to worker port and send this message. | ||
|
||
context = zmq.Context() | ||
channel_timeout = 10000 # in milliseconds | ||
task_channel = context.socket(zmq.DEALER) | ||
task_channel.setsockopt(zmq.LINGER, 0) | ||
task_channel.setsockopt(zmq.IDENTITY, b'testid') | ||
|
||
task_channel.set_hwm(0) | ||
task_channel.setsockopt(zmq.SNDTIMEO, channel_timeout) | ||
task_channel.connect(f"tcp://127.0.0.1:{task_port}") | ||
|
||
b_msg = json.dumps(msg).encode('utf-8') | ||
|
||
task_channel.send(b_msg) | ||
|
||
# check that the interchange exits within some reasonable time | ||
try_assert(lambda: interchange_proc.poll() is not None, "Interchange did not exit after killing watched client process", timeout_ms=5000) | ||
|
||
# See issue #3697 - ideally the interchange would exit cleanly, but it does not. | ||
# assert interchange_proc.poll() == 0, "Interchange exited with an error code, not 0" | ||
|
||
task_channel.close() | ||
context.term() |