From 74ec530669882ab4b28e54e519e44aeed2c08a61 Mon Sep 17 00:00:00 2001
From: Peter Portante <peter.portante@redhat.com>
Date: Thu, 15 Apr 2021 14:28:30 -0400
Subject: [PATCH] Correct `pbench-tool-meister-client` & `-stop`

Prior to this commit, `pbench-tool-meister-start` was the only CLI
interface which was using the two forms for specifying the Redis server
and Tool Data Sink host/port for binding and connecting.  Both the CLI
commands for stopping the Tool Meister sub-system and communicating with
it incorrectly expected the presence of the `PBENCH_REDIS_SERVER` to
only specify a `<host>:<port>`, and also indicate that instance was not
managed by CLI commands on this controller.

This commit corrects that where both `pbench-tool-meister-client` and
`pbench-tool-meister-stop` now properly handle both forms of host/port
specification, and `-stop` determines whether or not the CLI is in
charge of orchestrating the server components by looking for the local
pid files.

This was accomplished via a series of refactorings to share code across
those three CLI commands:

  * Add and use `BaseReturnCode` in `lib/pbench/agent/utils.py`
  * Move BaseServer to `.../agent/utils.py`
  * Add a `RedisServer` class for `pbench-tool-meister-stop` to take
    advantage of the common parameter handling of the Redis server
    specification
  * In `pbench-tool-meister-client`, we use the new `RedisServerCommon`
    class shared with `-start` and `-stop`
  * We fix the new `BaseServer.kill()` to exit immediately after posting
    a `SIGKILL`
  * Add unit tests for all common code moved to `.../agent/utils.py`
---
 agent/util-scripts/pbench-tool-meister-client |  30 +--
 agent/util-scripts/pbench-tool-meister-start  | 177 ++-------------
 agent/util-scripts/pbench-tool-meister-stop   | 153 ++++++-------
 .../test-bin/test-client-tool-meister         |   5 +-
 .../test-bin/test-start-stop-tool-meister     |   4 +-
 lib/pbench/agent/utils.py                     | 192 ++++++++++++++++
 lib/pbench/test/unit/agent/test_utils.py      | 211 ++++++++++++++++++
 7 files changed, 506 insertions(+), 266 deletions(-)
 create mode 100644 lib/pbench/test/unit/agent/test_utils.py
diff --git a/agent/util-scripts/pbench-tool-meister-client b/agent/util-scripts/pbench-tool-meister-client
index 3226264591..a6b6370ce7 100755
--- a/agent/util-scripts/pbench-tool-meister-client
+++ b/agent/util-scripts/pbench-tool-meister-client
@@ -10,15 +10,12 @@ import logging
 import os
 import sys
 
-from pbench.agent.constants import (
-    cli_tm_allowed_actions,
-    cli_tm_channel_prefix,
-    def_redis_port,
-)
+from pbench.agent.constants import cli_tm_allowed_actions, cli_tm_channel_prefix
 from pbench.agent.tool_meister_client import Client
+from pbench.agent.utils import RedisServerCommon
 
 
-def main(argv):
+def main(argv: list) -> int:
     """Main program for the Tool Meister client CLI.  The command line
     arguments are:
 
@@ -73,23 +70,16 @@ def main(argv):
             # properly shut down.
             return 0
 
-    redis_server = os.environ.get("PBENCH_REDIS_SERVER", f"localhost:{def_redis_port}")
-    parts = redis_server.split(":", 1)
-    if len(parts) != 2:
-        logger.error("Bad Redis server specified, {!r}", redis_server)
-        return 1
+    redis_server_env = os.environ.get("PBENCH_REDIS_SERVER", "")
     try:
-        redis_port = int(parts[1])
-    except Exception:
-        logger.error("Bad port for Redis server specified in {!r}", redis_server)
-        return 1
-    else:
-        redis_host = parts[0]
+        redis_server = RedisServerCommon(redis_server_env, "localhost")
+    except RedisServerCommon.Err as exc:
+        logger.error(str(exc))
+        return exc.return_code
 
-    # The Redis server is always running on the local host with the CLI.
     with Client(
-        redis_host=redis_host,
-        redis_port=redis_port,
+        redis_host=redis_server.host,
+        redis_port=redis_server.port,
         channel_prefix=cli_tm_channel_prefix,
         logger=logger,
     ) as client:
diff --git a/agent/util-scripts/pbench-tool-meister-start b/agent/util-scripts/pbench-tool-meister-start
index 0a911544ca..ab97099cff 100755
--- a/agent/util-scripts/pbench-tool-meister-start
+++ b/agent/util-scripts/pbench-tool-meister-start
@@ -97,12 +97,10 @@ the benchmark execution environment:
   - ssh-opts
 """
 
-import errno
 import json
 import logging
 import os
 import shlex
-import signal
 import socket
 import sys
 import time
@@ -128,9 +126,12 @@ from pbench.agent.tool_meister import main as tm_main
 from pbench.agent.tool_meister_client import Client
 from pbench.agent.toolmetadata import ToolMetadata
 from pbench.agent.utils import (
+    BaseReturnCode,
+    BaseServer,
     cli_verify_sysinfo,
     error_log,
     info_log,
+    RedisServerCommon,
     validate_hostname,
 )
 
@@ -140,12 +141,11 @@ from pbench.agent.utils import (
 _TDS_STARTUP_TIMEOUT = 60
 
 
-class ReturnCode:
+class ReturnCode(BaseReturnCode):
     """ReturnCode - symbolic return codes for the main program of
     pbench-tool-meister-start.
     """
 
-    SUCCESS = 0
     BADTOOLGROUP = 1
     BADAGENTCONFIG = 2
     MISSINGINSTALLDIR = 4
@@ -182,37 +182,8 @@ class ReturnCode:
     BADFULLHOSTNAME = 37
     BADHOSTNAME = 38
 
-    # Kill sub-codes
-    KILL_SUCCESS = 0
-    KILL_READEXC = 1
-    KILL_BADPID = 2
-    KILL_PIDNOTFOUND = 3
-    KILL_KILLERR = 4
-    KILL_KILLEXC = 5
 
-    class Err(RuntimeError):
-        """Err - exception definition to capture return code as an attribute.
-        """
-
-        def __init__(self, message: str, return_code: int):
-            """Adds a return_code attribute to capture an integer representing
-            the return code a caller can pass along.
-            """
-            super().__init__(message)
-            self.return_code = return_code
-
-    @staticmethod
-    def kill_ret_code(kill_code: int, ret_val: int):
-        """kill_ret_code - return an integer return code made up of the given
-        kill code and a return value.
-
-        A kill code of 0 and return value of 42 is returned as 42.
-        A kill code of 5 and return value of 52 is returned as 542.
-        """
-        return (kill_code * 100) + ret_val
-
-
-def _waitpid(pid: int):
+def _waitpid(pid: int) -> int:
     """Wrapper for os.waitpid()
 
     Returns the exit status of the given process ID.
@@ -241,10 +212,10 @@ def start_tms_via_ssh(
     tool_group: str,
     ssh_opts: str,
     full_hostname: str,
-    redis_server,
+    redis_server: RedisServerCommon,
     redis_client: redis.Redis,
     logger: logging.Logger,
-):
+) -> None:
     """start_tms_via_ssh - orchestrate the creation of local and remote Tool
     Meister instances using ssh for those that are remote.
 
@@ -364,88 +335,6 @@ def start_tms_via_ssh(
         )
 
 
-class BaseServer:
-    """BaseServer - abstract base class for common code shared between the
-    ToolDataSink and RedisServer classes.
-    """
-
-    def_port = None
-    bad_port_code = None
-    bad_host_code = None
-    name = None
-
-    class Err(ReturnCode.Err):
-        """BaseServer.Err - derived from ReturnCode.Err, specifically raised by
-        BaseServer and its derived classes.
-        """
-
-        pass
-
-    def __init__(self, spec: str, def_host_name: str):
-        """__init__ - from the given IP/port specification given, determine the
-        IP:port for binding (listening) and the IP:port for connecting.
-
-        The IP/port specification can be given in one of two forms:
-
-          - `<ip>:<port>'
-            * where the same ip address and port are used for binding and
-              connecting
-          - `<bind ip>:<port>;<connect ip>:<port>`
-            * where a semi-colon separates the bind ip/port from the connecting
-              ip/port
-
-        In either case, a missing port (bare colon, optional) indicates the
-        default port should be used. If no IP address is given, the default
-        host name is used.
-
-        No attempt is made to verify that the IP address resolves, or that it
-        is reachable, though we do check they are syntactically valid.
-        """
-        _spec = spec if spec else def_host_name
-        parts = _spec.split(";", 1)
-        pairs = []
-        for part in parts:
-            host_port_parts = part.rsplit(":", 1)
-            if len(host_port_parts) == 1:
-                port = self.def_port
-            else:
-                try:
-                    port = int(host_port_parts[1])
-                except ValueError as exc:
-                    if host_port_parts[1] == "":
-                        port = self.def_port
-                    else:
-                        raise self.Err(
-                            f"Bad port specified for {self.name} in '{spec}'",
-                            self.bad_port_ret_code,
-                        ) from exc
-            host = host_port_parts[0] if host_port_parts[0] else def_host_name
-            if host[0] == "[" and host[-1] == "]":
-                # Brackets are invalid for a host name, but might be used when
-                # specifying a port with an IPv6 address, strip them before we
-                # validate the host name.
-                host = host[1:-1]
-            if validate_hostname(host) != 0:
-                raise self.Err(
-                    f"Bad host specified for {self.name} in '{spec}'",
-                    self.bad_host_ret_code,
-                )
-            pairs.append((host, port))
-
-        self.bind_host, self.bind_port = pairs[0]
-        if len(pairs) == 2:
-            # Separate bind/connecting ip:port
-            self.host, self.port = pairs[1]
-            self._repr = f"{self.name} - {self.bind_host}:{self.bind_port} / {self.host}:{self.port}"
-        else:
-            assert len(pairs) == 1, "Logic bomb!  unexpected pairs, {pairs!r}"
-            self.host, self.port = pairs[0]
-            self._repr = f"{self.name} - {self.host}:{self.port}"
-
-    def __repr__(self):
-        return self._repr
-
-
 class ToolDataSink(BaseServer):
     """ToolDataSink - an encapsulation of the handling of the Tool Data Sink
     specification and methods to optionally create and manage an instance.
@@ -461,10 +350,10 @@ class ToolDataSink(BaseServer):
         exec_dir: Path,
         full_hostname: str,
         tds_param_key: str,
-        redis_server,
+        redis_server: RedisServerCommon,
         redis_client: redis.Redis,
         logger: logging.Logger,
-    ):
+    ) -> None:
         assert (
             self.host is not None
             and self.port is not None
@@ -530,7 +419,7 @@ class ToolDataSink(BaseServer):
                         time.sleep(0.1)
 
     @staticmethod
-    def wait(chan: RedisChannelSubscriber, logger: logging.Logger):
+    def wait(chan: RedisChannelSubscriber, logger: logging.Logger) -> int:
         """wait - Wait for the Tool Data Sink to report back success or
         failure regarding the Tool Meister environment setup.
         """
@@ -559,15 +448,13 @@ class ToolDataSink(BaseServer):
         return 0 if status == "success" else 1
 
 
-class RedisServer(BaseServer):
+class RedisServer(RedisServerCommon):
     """RedisServer - an encapsulation of the handling of the Redis server
     specification and methods to optionally create and manage an instance.
     """
 
-    def_port = def_redis_port
     bad_port_ret_code = ReturnCode.BADREDISPORT
     bad_host_ret_code = ReturnCode.BADREDISHOST
-    name = "Redis server"
 
     # Redis server configuration template for pbench's use
     conf_tmpl = """bind {bind_host_names}
@@ -584,7 +471,7 @@ port {redis_port:d}
         super().__init__(spec, def_host_name)
         self.pid_file = None
 
-    def start(self, tm_dir: Path, full_hostname: str, logger: logging.Logger):
+    def start(self, tm_dir: Path, full_hostname: str, logger: logging.Logger) -> None:
         """start_redis - configure and start a Redis server.
 
         Raises a BaseServer.Err exception if an error is encountered.
@@ -695,46 +582,8 @@ port {redis_port:d}
                     ReturnCode.REDISFAILED,
                 )
 
-    def kill(self, ret_val: int):
-        """kill - attempt to KILL the running Redis server.
-
-        This method is a no-op if the server instance isn't managed by us.
-
-        Returns ReturnCode "enum" via the "kill" return code method.
-        """
-        assert self.pid_file is not None, f"Logic bomb!  Unexpected state: {self!r}"
-
-        try:
-            raw_pid = self.pid_file.read_text()
-        except Exception:
-            # No "pid" to kill
-            return ReturnCode.kill_ret_code(ReturnCode.KILL_READEXC, ret_val)
-        else:
-            try:
-                pid = int(raw_pid)
-            except ValueError:
-                # Bad pid value
-                return ReturnCode.kill_ret_code(ReturnCode.KILL_BADPID, ret_val)
-            try:
-                os.kill(pid, signal.SIGKILL)
-            except OSError as exc:
-                if exc.errno == errno.ESRCH:
-                    # PID not found, ignore
-                    return ReturnCode.kill_ret_code(
-                        ReturnCode.KILL_PIDNOTFOUND, ret_val
-                    )
-                else:
-                    # Some error encountered trying to KILL the process.
-                    return ReturnCode.kill_ret_code(ReturnCode.KILL_KILLERR, ret_val)
-            except Exception:
-                # Some other error encountered trying to KILL the process.
-                return ReturnCode.kill_ret_code(ReturnCode.KILL_KILLEXC, ret_val)
-            else:
-                # "successfully" KILL'd the give process.
-                return ReturnCode.kill_ret_code(ReturnCode.KILL_SUCCESS, ret_val)
-
 
-def main(_prog: str, cli_params: Namespace):
+def main(_prog: str, cli_params: Namespace) -> int:
     """Main program for tool meister start.
 
     :cli_params: expects a CLI parameters object which has five attributes:
diff --git a/agent/util-scripts/pbench-tool-meister-stop b/agent/util-scripts/pbench-tool-meister-stop
index ce4f9b62b9..9b4aebbe29 100755
--- a/agent/util-scripts/pbench-tool-meister-stop
+++ b/agent/util-scripts/pbench-tool-meister-stop
@@ -9,59 +9,95 @@ stopping all local/remote tool meisters, closing down the local data sink, and
 finally the local redis server.
 """
 
-import errno
 import logging
 import os
-import signal
 import sys
 import time
 
-from argparse import ArgumentParser
+from argparse import ArgumentParser, Namespace
 from pathlib import Path
 
-from pbench.agent.constants import def_redis_port, cli_tm_channel_prefix
+from pbench.agent.constants import cli_tm_channel_prefix
 from pbench.agent.tool_group import BadToolGroup, ToolGroup
 from pbench.agent.tool_meister_client import Client
-from pbench.agent.utils import cli_verify_sysinfo, error_log, info_log
+from pbench.agent.utils import (
+    RedisServerCommon,
+    cli_verify_sysinfo,
+    error_log,
+    info_log,
+)
 
 
-def is_running(pid):
-    """Is the given PID running?
+def is_running(pid: int) -> bool:
+    """is_running - Is the given PID running?
 
     See https://stackoverflow.com/questions/7653178/wait-until-a-certain-process-knowing-the-pid-end
+
+    Return True if a PID is running, else False if not.
     """
     try:
         os.kill(pid, 0)
-    except OSError as err:
-        if err.errno == errno.ESRCH:
-            return False
+    except ProcessLookupError:
+        return False
     return True
 
 
-def wait_for_pid(pid):
+def wait_for_pid(pid: int) -> None:
     """wait_for_pid - wait for a process to actually stop running.
     """
     while is_running(pid):
         time.sleep(0.1)
 
 
+class RedisServer(RedisServerCommon):
+    """RedisServer - an encapsulation of the handling of the Redis server
+    specification for pbench-tool-meister-stop.
+
+    The constructor is enhanced to find the optional local Redis server pid
+    file, and the additional method, locally_managed(), keys off of its
+    presence.
+    """
+
+    def __init__(self, spec: str, benchmark_run_dir: Path, def_host_name: str):
+        super().__init__(spec, def_host_name)
+        try:
+            self.pid_file = (
+                benchmark_run_dir / "tm" / f"redis_{self.def_port:d}.pid"
+            ).resolve(strict=True)
+        except FileNotFoundError:
+            pass
+
+    def locally_managed(self) -> bool:
+        return self.pid_file is not None
+
+
 def graceful_shutdown(
-    benchmark_run_dir, full_hostname, group, redis_server_pid_file, logger
-):
-    # The assumption/assertion here is that the tool meister "stop" command is
-    # run on the same node as the tool meister "start" command ran, creating
-    # the local Tool Data Sink and the optional local Tool Meister. We want to
-    # make sure anything "local" to this stop command is shut down gracefully
-    # before we report back to the user.  If Tool Meisters from remote nodes
-    # have already reported that they have received the "terminate" message,
-    # then we trust they will shutdown gracefully themselves.
+    benchmark_run_dir: Path,
+    full_hostname: str,
+    group: str,
+    redis_server: RedisServer,
+    logger: logging.Logger,
+) -> int:
+    """
+    graceful_shutdown - attempt to cleanly shut down the previously created
+    Tool Data Sink, and the Redis server.
+
+    The assumption/assertion here is that the tool meister "stop" command is run
+    on the same node as the tool meister "start" command ran, creating the local
+    Tool Data Sink and the optional local Tool Meister. We want to make sure
+    anything "local" to this stop command is shut down gracefully before we
+    report back to the user.  If Tool Meisters from remote nodes have already
+    reported that they have received the "terminate" message, then we trust they
+    will shutdown gracefully themselves.
+
+    Returns 0 on success, 1 on failure (logging any unexpected exceptions)
+    """
     try:
         tds_pid_file = benchmark_run_dir / "tm" / "pbench-tool-data-sink.pid"
         try:
             pid_str = tds_pid_file.read_text()
-        except OSError as exc:
-            if exc.errno != errno.ENOENT:
-                raise
+        except FileNotFoundError:
+            pass
         else:
             tds_pid = int(pid_str)
             logger.debug("waiting for tool-data-sink (%d) to exit", tds_pid)
@@ -76,9 +112,8 @@ def graceful_shutdown(
         ltm_pid_file = benchmark_run_dir / "tm" / f"tm-{group}-{full_hostname}.pid"
         try:
             pid_str = ltm_pid_file.read_text()
-        except OSError as exc:
-            if exc.errno != errno.ENOENT:
-                raise
+        except FileNotFoundError:
+            pass
         else:
             ltm_pid = int(pid_str)
             logger.debug("waiting for local tool-meister (%d) to exit", ltm_pid)
@@ -89,29 +124,7 @@ def graceful_shutdown(
 
     # All was good so far, so we can terminate the redis server.
     try:
-        try:
-            pid_str = redis_server_pid_file.read_text()
-        except OSError as exc:
-            if exc.errno != errno.ENOENT:
-                raise
-        else:
-            redis_server_pid = int(pid_str)
-            pid_exists = True
-            timeout = time.time() + 60
-            while pid_exists:
-                try:
-                    os.kill(redis_server_pid, signal.SIGTERM)
-                except ProcessLookupError:
-                    pid_exists = False
-                else:
-                    if time.time() > timeout:
-                        try:
-                            os.kill(redis_server_pid, signal.SIGKILL)
-                        except ProcessLookupError:
-                            pid_exists = False
-                        except Exception:
-                            raise
-                    time.sleep(0.1)
+        ret_val = redis_server.kill(ret_val)
     except Exception:
         logger.exception("Exception encountered terminating Redis server")
         ret_val = 1
@@ -119,7 +132,7 @@ def graceful_shutdown(
     return ret_val
 
 
-def main(_prog, cli_params):
+def main(_prog: str, cli_params: Namespace) -> int:
     """Main program for the tool meister stop CLI interface.
 
     Stopping the Tool Meisters involves four steps:
@@ -183,36 +196,18 @@ def main(_prog, cli_params):
         logger.exception("failed to fetch required parameters from the environment")
         return 1
 
-    if cli_params.redis_server is None:
-        # No Redis server was given, so look locally to see if we can find it.
-        # If no Redis server locally, we're done.
-        try:
-            redis_server_pid_file = (
-                benchmark_run_dir / "tm" / f"redis_{def_redis_port:d}.pid"
-            ).resolve(strict=True)
-        except FileNotFoundError:
-            # No Redis server, nothing to do.
-            return 0
-        else:
-            redis_host = "localhost"
-            redis_port = def_redis_port
-    else:
-        parts = cli_params.redis_server.split(":", 1)
-        if len(parts) != 2:
-            logger.error("Bad Redis server specified, '%s'", cli_params.redis_server)
-            return 1
-        try:
-            redis_port = int(parts[1])
-        except Exception:
-            logger.error("Bad Redis port specified, '%s'", cli_params.redis_server)
-            return 1
-        else:
-            redis_host = parts[0]
+    try:
+        redis_server = RedisServer(
+            cli_params.redis_server, benchmark_run_dir, full_hostname
+        )
+    except RedisServer.Err as exc:
+        logger.error(str(exc))
+        return exc.return_code
 
     # The Redis server is always running on the local host with the CLI.
     with Client(
-        redis_host=redis_host,
-        redis_port=redis_port,
+        redis_host=redis_server.host,
+        redis_port=redis_server.port,
         channel_prefix=cli_tm_channel_prefix,
         logger=logger,
     ) as client:
@@ -254,14 +249,14 @@ def main(_prog, cli_params):
     # just return the success/failure of the terminate operation.
     ret_val = end_ret_val if end_ret_val != 0 else term_ret_val
 
-    if cli_params.redis_server is None:
+    if redis_server.locally_managed():
         # The client operations have finished, successful or unsuccessfully,
         # and we were not given an explicit Redis server to use.  So the
         # previous pbench-tool-meister-start must have set up the local Tool
         # Data Sink, Tool Meister (if registered), and the Redis server.  It is
         # our responsibility to make sure these processes shut down correctly.
         shutdown_ret_val = graceful_shutdown(
-            benchmark_run_dir, full_hostname, group, redis_server_pid_file, logger
+            benchmark_run_dir, full_hostname, group, redis_server, logger
         )
         if ret_val == 0:
             # If client termination was successful, report the status of the
diff --git a/agent/util-scripts/test-bin/test-client-tool-meister b/agent/util-scripts/test-bin/test-client-tool-meister
index d48e910b70..f570c1bfb6 100755
--- a/agent/util-scripts/test-bin/test-client-tool-meister
+++ b/agent/util-scripts/test-bin/test-client-tool-meister
@@ -76,7 +76,10 @@ function _timeout {
 
 source ${_script_path}/common-tm-cleanup
 
-PBENCH_REDIS_SERVER="localhost" PBENCH_TOOL_DATA_SINK="localhost" _timeout pbench-tool-meister-start --sysinfo="${sysinfo}" "${group}"
+export PBENCH_REDIS_SERVER="localhost:17001;localhost:17001"
+export PBENCH_TOOL_DATA_SINK="localhost;localhost"
+
+_timeout pbench-tool-meister-start --sysinfo="${sysinfo}" "${group}"
 status=${?}
 if [[ ${status} -ne 0 ]]; then
     printf -- "ERROR - \"pbench-tool-meister-start --sysinfo='%s' '%s'\" failed to execute successfully (exit code: %s)\n" "${sysinfo}" "${group}" "${status}" >&2
diff --git a/agent/util-scripts/test-bin/test-start-stop-tool-meister b/agent/util-scripts/test-bin/test-start-stop-tool-meister
index bbd6a4284e..30ef26c53a 100755
--- a/agent/util-scripts/test-bin/test-start-stop-tool-meister
+++ b/agent/util-scripts/test-bin/test-start-stop-tool-meister
@@ -49,14 +49,14 @@ fi
 
 source ${_script_path}/common-tm-cleanup
 
-_PBENCH_TOOL_MEISTER_LOG_LEVEL="debug" _PBENCH_TOOL_DATA_SINK_LOG_LEVEL="debug" _PBENCH_TOOL_MEISTER_START_LOG_LEVEL="debug" PBENCH_REDIS_SERVER="localhost" PBENCH_TOOL_DATA_SINK="localhost" pbench-tool-meister-start --sysinfo="none" ${group}
+_PBENCH_TOOL_MEISTER_LOG_LEVEL="debug" _PBENCH_TOOL_DATA_SINK_LOG_LEVEL="debug" _PBENCH_TOOL_MEISTER_START_LOG_LEVEL="debug" PBENCH_REDIS_SERVER="localhost" PBENCH_TOOL_DATA_SINK="localhost:8080;localhost:8080" pbench-tool-meister-start --sysinfo="none" ${group}
 status=${?}
 if [[ ${status} -ne 0 ]]; then
     printf -- "\"pbench-tool-meister-start ${group}\" failed to execute successfully (exit code: ${status})\n" >&2
     exit 1
 fi
 
-_PBENCH_TOOL_MEISTER_STOP_LOG_LEVEL="debug" pbench-tool-meister-stop --sysinfo="none" ${group}
+_PBENCH_TOOL_MEISTER_STOP_LOG_LEVEL="debug" PBENCH_REDIS_SERVER="localhost:17001;localhost:17001" PBENCH_TOOL_DATA_SINK="localhost" pbench-tool-meister-stop --sysinfo="none" ${group}
 status=${?}
 if [[ ${status} -ne 0 ]]; then
     printf -- "\"pbench-tool-meister-stop\" failed to execute successfully (exit code: ${status})\n" >&2
diff --git a/lib/pbench/agent/utils.py b/lib/pbench/agent/utils.py
index bf38f0b61c..a4822d29c5 100644
--- a/lib/pbench/agent/utils.py
+++ b/lib/pbench/agent/utils.py
@@ -2,18 +2,210 @@
 import logging
 import os
 import re
+import signal
 import subprocess
 import sys
+import time
 
 from datetime import datetime
 
 from pbench.agent.constants import (
+    def_redis_port,
     sysinfo_opts_available,
     sysinfo_opts_convenience,
     sysinfo_opts_default,
 )
 
 
+class BaseReturnCode:
+    """BaseReturnCode - base class of common methods for symbolic return codes
+    for main programs.
+    """
+
+    # Common success code is zero.
+    SUCCESS = 0
+
+    # Common kill sub-codes
+    KILL_SUCCESS = 0
+    KILL_READEXC = 1
+    KILL_BADPID = 2
+    KILL_READERR = 3
+    KILL_TERMEXC = 4
+    KILL_KILLEXC = 5
+
+    class Err(RuntimeError):
+        """Err - exception definition to capture return code as an attribute.
+        """
+
+        def __init__(self, message: str, return_code: int):
+            """Adds a return_code attribute to capture an integer representing
+            the return code a caller can pass along.
+            """
+            super().__init__(message)
+            self.return_code = return_code
+
+    @staticmethod
+    def kill_ret_code(kill_code: int, ret_val: int):
+        """kill_ret_code - return an integer return code made up of the given
+        kill code and a return value.
+
+        A kill code of 0 and return value of 42 is returned as 42.
+        A kill code of 5 and return value of 52 is returned as 542.
+        """
+        return (kill_code * 100) + ret_val
+
+
+class BaseServer:
+    """BaseServer - abstract base class for common code shared between the
+    ToolDataSink and RedisServer classes.
+    """
+
+    def_port = None
+    bad_port_code = None
+    bad_host_code = None
+    name = None
+
+    class Err(BaseReturnCode.Err):
+        """BaseServer.Err - derived from BaseReturnCode.Err, specifically
+        raised by BaseServer and its derived classes.
+        """
+
+        pass
+
+    def __init__(self, spec: str, def_host_name: str):
+        """__init__ - from the given IP/port specification, determine the
+        IP:port for binding (listening) and the IP:port for connecting.
+
+        The IP/port specification can be given in one of two forms:
+
+          - `<ip>:<port>'
+            * where the same ip address and port are used for binding and
+              connecting
+          - `<bind ip>:<port>;<connect ip>:<port>`
+            * where a semi-colon separates the bind ip/port from the connecting
+              ip/port
+
+        In either case, a missing port (bare colon, optional) indicates the
+        default port should be used. If no IP address is given, the default
+        host name is used.
+
+        No attempt is made to verify that the IP address resolves, or that it
+        is reachable, though we do check they are syntactically valid.
+        """
+        assert (
+            def_host_name
+        ), f"Logic bomb!  Default host name required: {spec!r}, {def_host_name!r}"
+        _spec = spec if spec else def_host_name
+        parts = _spec.split(";", 1)
+        pairs = []
+        for part in parts:
+            host_port_parts = part.rsplit(":", 1)
+            if len(host_port_parts) == 1:
+                port = self.def_port
+            else:
+                try:
+                    port = int(host_port_parts[1])
+                except ValueError as exc:
+                    if host_port_parts[1] == "":
+                        port = self.def_port
+                    else:
+                        raise self.Err(
+                            f"Bad port specified for {self.name} in '{spec}'",
+                            self.bad_port_ret_code,
+                        ) from exc
+            host = host_port_parts[0] if host_port_parts[0] else def_host_name
+            if host[0] == "[" and host[-1] == "]":
+                # Brackets are invalid for a host name, but might be used when
+                # specifying a port with an IPv6 address, strip them before we
+                # validate the host name.
+                host = host[1:-1]
+            if validate_hostname(host) != 0:
+                raise self.Err(
+                    f"Bad host specified for {self.name} in '{spec}'",
+                    self.bad_host_ret_code,
+                )
+            pairs.append((host, port))
+
+        self.bind_host, self.bind_port = pairs[0]
+        if len(pairs) == 2:
+            # Separate bind/connecting ip:port
+            self.host, self.port = pairs[1]
+            self._repr = f"{self.name} - {self.bind_host}:{self.bind_port} / {self.host}:{self.port}"
+        else:
+            assert len(pairs) == 1, "Logic bomb!  unexpected pairs, {pairs!r}"
+            self.host, self.port = pairs[0]
+            self._repr = f"{self.name} - {self.host}:{self.port}"
+
+        self.pid_file = None
+
+    def __repr__(self):
+        return self._repr
+
+    def kill(self, ret_val: int):
+        """kill - attempt to KILL the running Redis server.
+
+        This method is a no-op if the server instance isn't managed by us.
+
+        Returns BaseReturnCode "enum" via the "kill" return code method.
+        """
+        assert self.pid_file is not None, f"Logic bomb!  Unexpected state: {self!r}"
+
+        try:
+            raw_pid = self.pid_file.read_text()
+        except FileNotFoundError:
+            return BaseReturnCode.kill_ret_code(BaseReturnCode.KILL_SUCCESS, ret_val)
+        except OSError:
+            return BaseReturnCode.kill_ret_code(BaseReturnCode.KILL_READERR, ret_val)
+        except Exception:
+            # No "pid" to kill
+            return BaseReturnCode.kill_ret_code(BaseReturnCode.KILL_READEXC, ret_val)
+
+        try:
+            pid = int(raw_pid)
+        except ValueError:
+            # Bad pid value
+            return BaseReturnCode.kill_ret_code(BaseReturnCode.KILL_BADPID, ret_val)
+
+        pid_exists = True
+        timeout = time.time() + 60
+        while pid_exists:
+            try:
+                os.kill(pid, signal.SIGTERM)
+            except ProcessLookupError:
+                pid_exists = False
+            except Exception:
+                # Some other error encountered trying to KILL the process.
+                return BaseReturnCode.kill_ret_code(
+                    BaseReturnCode.KILL_TERMEXC, ret_val
+                )
+            else:
+                if time.time() > timeout:
+                    try:
+                        os.kill(pid, signal.SIGKILL)
+                    except ProcessLookupError:
+                        pass
+                    except Exception:
+                        return BaseReturnCode.kill_ret_code(
+                            BaseReturnCode.KILL_KILLEXC, ret_val
+                        )
+                    break
+                time.sleep(0.1)
+
+        # "successfully" KILL'd the give process.
+        return BaseReturnCode.kill_ret_code(BaseReturnCode.KILL_SUCCESS, ret_val)
+
+
+class RedisServerCommon(BaseServer):
+    """RedisServerCommon - an encapsulation of the common handling of the Redis
+    server specification.
+    """
+
+    def_port = def_redis_port
+    bad_port_ret_code = 1
+    bad_host_ret_code = 1
+    name = "Redis server"
+
+
 def setup_logging(debug, logfile):
     """Setup logging for client
     :param debug: Turn on debug logging
diff --git a/lib/pbench/test/unit/agent/test_utils.py b/lib/pbench/test/unit/agent/test_utils.py
new file mode 100644
index 0000000000..9273e96063
--- /dev/null
+++ b/lib/pbench/test/unit/agent/test_utils.py
@@ -0,0 +1,211 @@
+"""Tests for the utils module.
+"""
+
+import os
+import pathlib
+import pytest
+import signal
+import time
+
+from pbench.agent.utils import BaseServer, BaseReturnCode
+
+
+class OurServer(BaseServer):
+    def_port = 4242
+    bad_port_ret_code = 42
+    bad_host_ret_code = 43
+    name = "forty-two"
+
+
+class TestBaseServer:
+    """Verify the utils BaseServer class.
+    """
+
+    def test_constructor(self):
+        with pytest.raises(AssertionError):
+            bs = OurServer("", "")
+
+        for arg1, arg2 in [("", "localhost"), ("localhost", "localhost")]:
+            bs = OurServer(arg1, arg2)
+            assert bs.pid_file is None
+            assert bs.host == "localhost"
+            assert bs.port == OurServer.def_port
+            assert bs.bind_host == bs.host
+            assert bs.bind_port == bs.port
+            assert repr(bs) == f"forty-two - localhost:{OurServer.def_port:d}"
+
+        _def = "localhost"
+
+        with pytest.raises(OurServer.Err) as excinfo:
+            bs = OurServer("bad_host.example.com", _def)
+        assert (
+            excinfo.value.return_code == OurServer.bad_host_ret_code
+        ), f"{excinfo.value!r}"
+
+        with pytest.raises(OurServer.Err) as excinfo:
+            bs = OurServer("bad-port.example.com:bad", _def)
+        assert (
+            excinfo.value.return_code == OurServer.bad_port_ret_code
+        ), f"{excinfo.value!r}"
+
+        for arg1, exp_port in [
+            ("host.example.com:2345", 2345),
+            ("host.example.com:", 4242),
+        ]:
+            bs = OurServer(arg1, _def)
+            assert bs.pid_file is None
+            assert bs.host == "host.example.com"
+            assert bs.port == exp_port
+            assert bs.bind_host == bs.host
+            assert bs.bind_port == bs.port
+            assert repr(bs) == f"forty-two - host.example.com:{exp_port:d}"
+
+        bs = OurServer(":2345", _def)
+        assert bs.pid_file is None
+        assert bs.host == "localhost"
+        assert bs.port == 2345
+        assert bs.bind_host == bs.host
+        assert bs.bind_port == bs.port
+        assert repr(bs) == "forty-two - localhost:2345"
+
+        bs = OurServer("127.0.0.42:4567", _def)
+        assert bs.pid_file is None
+        assert bs.host == "127.0.0.42"
+        assert bs.port == 4567
+        assert bs.bind_host == bs.host
+        assert bs.bind_port == bs.port
+        assert repr(bs) == "forty-two - 127.0.0.42:4567"
+
+        bs = OurServer("[127::42]:4567", _def)
+        assert bs.pid_file is None
+        assert bs.host == "127::42"
+        assert bs.port == 4567
+        assert bs.bind_host == bs.host
+        assert bs.bind_port == bs.port
+        assert repr(bs) == "forty-two - 127::42:4567"
+
+        bs = OurServer("bind.example.com:2345;host.example.com:6789", _def)
+        assert bs.pid_file is None
+        assert bs.host == "host.example.com"
+        assert bs.port == 6789
+        assert bs.bind_host == "bind.example.com"
+        assert bs.bind_port == 2345
+        assert repr(bs) == "forty-two - bind.example.com:2345 / host.example.com:6789"
+
+    def test_kill(self, pytestconfig, monkeypatch):
+        bs = OurServer("localhost", "localhost")
+        with pytest.raises(AssertionError):
+            bs.kill(1)
+
+        bs = OurServer("localhost", "localhost")
+        TMP = pathlib.Path(pytestconfig.cache.get("TMP", None))
+        pidfile = TMP / "test.pid"
+        pidfile.write_text("12345")
+        bs.pid_file = pidfile
+        ret = bs.kill(42)
+        assert ret == 42
+
+        pidfile.write_text("badpid")
+        ret = bs.kill(42)
+        assert ret == (BaseReturnCode.KILL_BADPID * 100) + 42
+
+        bs.pid_file = TMP / "enoent.pid"
+        ret = bs.kill(42)
+        assert ret == 42
+
+        class MockPath:
+            def __init__(self, exc: Exception):
+                self._exc = exc
+
+            def read_text(self):
+                raise self._exc
+
+        bs.pid_file = MockPath(OSError(13, "fake oserror"))
+        ret = bs.kill(42)
+        assert ret == 342
+
+        bs.pid_file = MockPath(Exception("fake exception"))
+        ret = bs.kill(42)
+        assert ret == 142
+
+        class MockTime:
+            def __init__(self):
+                self._clock = 0
+
+            def time(self, *args, **kwargs):
+                self._clock += 1
+                return self._clock
+
+            def sleep(self, *args, **kwargs):
+                return
+
+        class MockKill:
+            behaviors = {
+                "1001": (True, False, True, False, False, False),
+                "1002": (False, False, False, False, False, False),
+                "1003": (True, True, False, True, False, False),
+                "1004": (True, False, False, True, False, False),
+                "1005": (True, False, False, False, True, False),
+                "1006": (True, False, False, False, False, True),
+            }
+
+            def __init__(self, behavior):
+                (
+                    self.pid_exists_term,
+                    self.pid_exists_kill,
+                    self.pid_killed_by_term,
+                    self.pid_killed_by_kill,
+                    self.pid_kill_term_exc,
+                    self.pid_kill_kill_exc,
+                ) = self.behaviors[behavior]
+
+            def kill(self, pid, sig):
+                if sig == signal.SIGTERM:
+                    if self.pid_kill_term_exc:
+                        raise Exception("term")
+                    if self.pid_exists_term:
+                        if self.pid_killed_by_term:
+                            self.pid_exists_term = False
+                            self.pid_exists_kill = False
+                    else:
+                        raise ProcessLookupError(pid)
+                else:
+                    assert sig == signal.SIGKILL
+                    if self.pid_kill_kill_exc:
+                        raise Exception("kill")
+                    if self.pid_exists_kill:
+                        if self.pid_killed_by_kill:
+                            self.pid_exists_term = False
+                            self.pid_exists_kill = False
+                    else:
+                        raise ProcessLookupError(pid)
+
+        bs.pid_file = pidfile
+
+        test_cases = [
+            ("1001", 42, "Kill a pid that is found, successfully"),
+            ("1002", 42, "Kill a pid that is not found, successfully"),
+            (
+                "1003",
+                42,
+                "Kill a pid that is found, successfully,"
+                " where the SIGKILL successfully kills it",
+            ),
+            (
+                "1004",
+                42,
+                "Kill a pid that is found, unsuccessfully,"
+                " where the SIGKILL does not find it",
+            ),
+            ("1005", 442, "Exception raised killing a pid w TERM"),
+            ("1006", 542, "Exception raised killing a pid w KILL"),
+        ]
+        for pid_text, ret_code, desc in test_cases:
+            pidfile.write_text(pid_text)
+            mock_time = MockTime()
+            mock_kill = MockKill(pid_text)
+            monkeypatch.setattr(os, "kill", mock_kill.kill)
+            monkeypatch.setattr(time, "time", mock_time.time)
+            monkeypatch.setattr(time, "sleep", mock_time.sleep)
+            ret = bs.kill(42)
+            assert ret == ret_code, f"{desc} FAILED, {pid_text!r}, {ret_code!r}"