Skip to content

Commit

Permalink
DAOS-16366 test: Use agent/server config files from test directory (#14944) (#15033)
Browse files Browse the repository at this point in the history

Use agent, control, and server config files placed in the common test directory instead of /etc/daos with a systemctl override configuration file.

Signed-off-by: Phil Henderson <[email protected]>
  • Loading branch information
phender authored Aug 30, 2024
1 parent bd4cfd2 commit cf7322a
Show file tree
Hide file tree
Showing 15 changed files with 652 additions and 417 deletions.
11 changes: 8 additions & 3 deletions src/tests/ftest/daos_test/dfuse.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
from apricot import TestWithServers
from cmocka_utils import CmockaUtils
from dfuse_utils import get_dfuse, start_dfuse
from general_utils import create_directory, get_log_file
from file_utils import create_directory
from general_utils import get_log_file
from job_manager_utils import get_job_manager


Expand Down Expand Up @@ -80,7 +81,9 @@ def run_test(self, il_lib=None):
else:
# Bypass, simply create a remote directory and use that.
mount_dir = '/tmp/dfuse-test'
create_directory(self.hostlist_clients, mount_dir)
result = create_directory(self.log, self.hostlist_clients, mount_dir)
if not result.passed:
self.fail(f"Error creating {mount_dir} on {result.failed_hosts}")

cmocka_utils = CmockaUtils(
self.hostlist_clients, "dfuse", self.outputdir, self.test_dir, self.log)
Expand Down Expand Up @@ -118,7 +121,9 @@ def run_test(self, il_lib=None):
else:
# make D_IL_MOUNT_POINT different from mount_dir so it tests a non-DAOS filesystem
dummy_dir = '/tmp/dummy'
create_directory(self.hostlist_clients, dummy_dir)
result = create_directory(self.log, self.hostlist_clients, dummy_dir)
if not result.passed:
self.fail(f"Error creating {dummy_dir} on {result.failed_hosts}")
daos_test_env['D_IL_MOUNT_POINT'] = dummy_dir
if cache_mode != 'writeback':
command.append('--metadata')
Expand Down
17 changes: 14 additions & 3 deletions src/tests/ftest/launch.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@
from util.code_coverage_utils import CodeCoverage
from util.environment_utils import TestEnvironment, TestEnvironmentException, set_test_environment
from util.host_utils import get_local_host
from util.launch_utils import LaunchException, TestGroup, setup_fuse_config, summarize_run
from util.launch_utils import (LaunchException, TestGroup, setup_fuse_config, setup_systemctl,
summarize_run)
from util.logger_utils import LOG_FILE_FORMAT, get_console_handler, get_file_handler
from util.network_utils import PROVIDER_ALIAS, SUPPORTED_PROVIDERS
from util.package_utils import find_packages
Expand Down Expand Up @@ -270,7 +271,8 @@ def _run(self, args):
# pylint: disable=unsupported-binary-operation
all_hosts = args.test_servers | args.test_clients | self.local_host
self.details["installed packages"] = find_packages(
logger, all_hosts, "'^(daos|libfabric|mercury|ior|openmpi|mpifileutils)-'")
logger, all_hosts,
"'^(daos|libfabric|mercury|ior|openmpi|mpifileutils|mlnx-ofed-basic)-'")

# Setup the test environment
test_env = TestEnvironment()
Expand Down Expand Up @@ -325,6 +327,15 @@ def _run(self, args):
message = "Issue detected setting up the fuse configuration"
setup_result.warn_test(logger, "Setup", message, sys.exc_info())

# Setup override systemctl files
try:
clients = args.test_clients if args.test_clients else args.test_servers
cleanup_files = setup_systemctl(
logger, args.test_servers, clients | self.local_host, test_env)
except LaunchException:
message = "Issue detected setting up the systemctl configuration"
return self.get_exit_status(1, message, "Setup", sys.exc_info())

# Get the core file pattern information
core_files = {}
if args.process_cores:
Expand Down Expand Up @@ -370,7 +381,7 @@ def _run(self, args):
logger, self.result, self.repeat, self.slurm_setup, args.sparse, args.failfast,
not args.disable_stop_daos, args.archive, args.rename, args.jenkinslog, core_files,
args.logs_threshold, args.user_create, code_coverage, self.job_results_dir,
self.logdir, args.clear_mounts)
self.logdir, args.clear_mounts, cleanup_files)

# Convert the test status to a launch.py status
status |= summarize_run(logger, self.mode, test_status)
Expand Down
7 changes: 4 additions & 3 deletions src/tests/ftest/pool/destroy.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,10 +362,11 @@ def test_destroy_wrong_group(self):
server_group_b = self.server_group + "_b"

# Prepare and configure dmg config files for a and b.
dmg_config_file_a = get_default_config_file(name="control_a")
config_path = os.path.dirname(self.test_env.control_config)
dmg_config_file_a = get_default_config_file(name="control_a", path=config_path)
dmg_config_temp_a = self.get_config_file(
name=server_group_a, command="dmg", path=self.test_dir)
dmg_config_file_b = get_default_config_file(name="control_b")
dmg_config_file_b = get_default_config_file(name="control_b", path=config_path)
dmg_config_temp_b = self.get_config_file(
name=server_group_b, command="dmg", path=self.test_dir)

Expand Down Expand Up @@ -393,7 +394,7 @@ def test_destroy_wrong_group(self):

# Get dmg_c instance that uses daos_control_c.yml. Server group is b.
cert_dir = os.path.join(os.sep, "etc", "daos", "certs")
dmg_config_file_c = get_default_config_file(name="control_c")
dmg_config_file_c = get_default_config_file(name="control_c", path=config_path)
dmg_config_temp_c = self.get_config_file(
name=server_group_b, command="dmg", path=self.test_dir)
dmg_c = get_dmg_command(
Expand Down
17 changes: 7 additions & 10 deletions src/tests/ftest/recovery/ddb.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,12 @@
from ClusterShell.NodeSet import NodeSet
from ddb_utils import DdbCommand
from exception_utils import CommandFailure
from general_utils import (DaosTestError, create_string_buffer, distribute_files,
get_clush_command, get_random_string, report_errors, run_command)
from file_utils import distribute_files
from general_utils import (DaosTestError, create_string_buffer, get_random_string, report_errors,
run_command)
from pydaos.raw import DaosObjClass, IORequest
from recovery_test_base import RecoveryTestBase
from run_utils import get_clush_command


def insert_objects(context, container, object_count, dkey_count, akey_count, base_dkey,
Expand Down Expand Up @@ -507,14 +509,9 @@ def test_recovery_ddb_load(self):
file.write(new_data)

# Copy the created file to server node.
try:
distribute_files(
hosts=host, source=load_file_path, destination=load_file_path,
mkdir=False)
except DaosTestError as error:
raise CommandFailure(
"ERROR: Copying new_data.txt to {0}: {1}".format(host, error)) \
from error
result = distribute_files(self.log, host, load_file_path, load_file_path, False)
if not result.passed:
raise CommandFailure(f"ERROR: Copying new_data.txt to {result.failed_hosts}")

# The file with the new data is ready. Run ddb load.
ddb_command.value_load(component_path="[0]/[0]/[0]/[0]", load_file_path=load_file_path)
Expand Down
6 changes: 3 additions & 3 deletions src/tests/ftest/util/agent_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import os
import re
import socket
from getpass import getuser

from agent_utils_params import DaosAgentTransportCredentials, DaosAgentYamlParameters
from ClusterShell.NodeSet import NodeSet
Expand Down Expand Up @@ -289,7 +288,7 @@ def start(self):

# Verify the socket directory exists when using a non-systemctl manager
if self.verify_socket_dir:
self.verify_socket_directory(getuser())
self.verify_socket_directory(self.manager.job.certificate_owner)

super().start()

Expand Down Expand Up @@ -319,7 +318,8 @@ def support_collect_log(self, **kwargs):
"""
cmd = self.manager.job.copy()
cmd.debug.value = False
cmd.config.value = get_default_config_file("agent")
cmd.config.value = get_default_config_file(
"agent", os.path.dirname(self.manager.job.yaml.filename))
cmd.set_command(("support", "collect-log"), **kwargs)
self.log.info("Support collect-log on clients: %s", str(cmd))
return run_remote(self.log, self.hosts, cmd.with_exports)
Expand Down
21 changes: 12 additions & 9 deletions src/tests/ftest/util/apricot/apricot/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,7 @@
from exception_utils import CommandFailure
from fault_config_utils import FaultInjection
from general_utils import (dict_to_str, dump_engines_stacks, get_avocado_config_value,
get_default_config_file, get_file_listing, nodeset_append_suffix,
set_avocado_config_value)
nodeset_append_suffix, set_avocado_config_value)
from host_utils import HostException, HostInfo, HostRole, get_host_parameters, get_local_host
from logger_utils import TestLogger
from pydaos.raw import DaosApiError, DaosContext, DaosLog
Expand Down Expand Up @@ -762,13 +761,17 @@ def setUp(self):
self.fail(f"Error creating test-specific temporary directory on {result.failed_hosts}")

# Copy the fault injection files to the hosts.
self.fault_injection.copy_fault_files(self.host_info.all_hosts)
self.fault_injection.copy_fault_files(self.log, self.host_info.all_hosts)

# List common test directory contents before running the test
self.log.info("-" * 100)
self.log.debug("Common test directory (%s) contents:", self.test_dir)
self.log.debug("Common test directory (%s) contents:", os.path.dirname(self.test_dir))
all_hosts = include_local_host(self.host_info.all_hosts)
get_file_listing(all_hosts, self.test_dir, self.test_env.agent_user).log_output(self.log)
test_dir_parent = os.path.dirname(self.test_dir)
result = run_remote(self.log, all_hosts, f"df -h {test_dir_parent}")
if int(max(re.findall(r" ([\d+])% ", result.joined_stdout) + ["0"])) > 90:
run_remote(self.log, all_hosts, f"du -sh {test_dir_parent}/*")
self.log.info("-" * 100)

if not self.start_servers_once or self.name.uid == 1:
# Kill commands left running on the hosts (from a previous test)
Expand Down Expand Up @@ -1063,7 +1066,7 @@ def add_agent_manager(self, group=None, config_file=None, config_temp=None):
if group is None:
group = self.server_group
if config_file is None and self.agent_manager_class == "Systemctl":
config_file = get_default_config_file("agent")
config_file = self.test_env.agent_config
config_temp = self.get_config_file(group, "agent", self.test_dir)
elif config_file is None:
config_file = self.get_config_file(group, "agent")
Expand Down Expand Up @@ -1113,14 +1116,14 @@ def add_server_manager(self, group=None, svr_config_file=None,
if group is None:
group = self.server_group
if svr_config_file is None and self.server_manager_class == "Systemctl":
svr_config_file = get_default_config_file("server")
svr_config_file = self.test_env.server_config
svr_config_temp = self.get_config_file(
group, "server", self.test_dir)
elif svr_config_file is None:
svr_config_file = self.get_config_file(group, "server")
svr_config_temp = None
if dmg_config_file is None and self.server_manager_class == "Systemctl":
dmg_config_file = get_default_config_file("control")
dmg_config_file = self.test_env.control_config
dmg_config_temp = self.get_config_file(group, "dmg", self.test_dir)
elif dmg_config_file is None:
dmg_config_file = self.get_config_file(group, "dmg")
Expand Down Expand Up @@ -1668,7 +1671,7 @@ def get_dmg_command(self, index=0):
return self.server_managers[index].dmg

if self.server_manager_class == "Systemctl":
dmg_config_file = get_default_config_file("control")
dmg_config_file = self.test_env.control_config
dmg_config_temp = self.get_config_file("daos", "dmg", self.test_dir)
dmg_cert_dir = os.path.join(os.sep, "etc", "daos", "certs")
else:
Expand Down
109 changes: 10 additions & 99 deletions src/tests/ftest/util/collection_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from util.environment_utils import TestEnvironment
from util.host_utils import get_local_host
from util.run_utils import find_command, run_local, run_remote, stop_processes
from util.systemctl_utils import stop_service
from util.user_utils import get_chown_command
from util.yaml_utils import get_test_category

Expand Down Expand Up @@ -66,97 +67,6 @@ def stop_daos_server_service(logger, test):
return stop_service(logger, hosts, service)


def stop_service(logger, hosts, service):
    """Stop, disable, and reset the named systemd service on the given hosts.

    The service state is polled with get_service_status() and the matching
    'systemctl' action is issued for each state detected, for a bounded number
    of passes.

    Args:
        logger (Logger): logger for the messages produced by this method
        hosts (NodeSet): hosts on which to stop the service
        service (str): name of the service

    Returns:
        bool: True if the service was successfully stopped; False otherwise
    """
    if not hosts:
        logger.debug(" Skipping stopping %s service - no hosts", service)
        return True

    # Each systemctl action paired with the service state it is meant to clear,
    # listed in the order the actions are applied.
    remedies = (("reset-failed", "failed"), ("stop", "active"), ("disable", "enabled"))

    # Reduce 'max_loops' to 2 once https://jira.hpdd.intel.com/browse/DAOS-7809
    # has been resolved
    max_loops = 3

    stopped = True
    attempt = 1
    pending = NodeSet(hosts)
    while pending:
        # Check the status of the service on each host still being tracked
        state = get_service_status(logger, pending, service)
        stopped = state["status"]
        pending = NodeSet()
        for action, label in remedies:
            affected = state[action]
            if not affected:
                continue
            if attempt == max_loops:
                # Out of attempts: report the hosts and let the while loop end
                logger.error(" - Error %s still %s on %s", service, label, affected)
                stopped = False
                continue
            # Issue the appropriate systemctl command to remedy the detected
            # state, e.g. 'stop' for 'active'.
            run_remote(logger, affected, " ".join(["sudo", "-n", "systemctl", action, service]))
            # Run the status check again on this group of hosts
            pending.add(affected)
        attempt += 1

    return stopped


def get_service_status(logger, hosts, service):
    """Get the 'systemctl is-active' state of the service on each host.

    Args:
        logger (Logger): logger for the messages produced by this method
        hosts (NodeSet): hosts on which to get the service state
        service (str): name of the service

    Returns:
        dict: a dictionary with the following keys:
            - "status":       boolean set to True if status was obtained; False otherwise
            - "stop":         NodeSet where to stop the service
            - "disable":      NodeSet where to disable the service
            - "reset-failed": NodeSet where to reset the service
    """
    status = {
        "status": True,
        "stop": NodeSet(),
        "disable": NodeSet(),
        "reset-failed": NodeSet()}
    # Reported 'is-active' states that call for each systemctl action
    status_states = {
        "stop": ["active", "activating", "deactivating"],
        "disable": ["active", "activating", "deactivating"],
        "reset-failed": ["failed"]}
    command = ["systemctl", "is-active", service]
    result = run_remote(logger, hosts, " ".join(command))
    for data in result.output:
        if data.timeout:
            # The state of these hosts is unknown, so request every action for
            # them and flag the overall status as not obtained.
            status["status"] = False
            for key in status_states:
                status[key].add(data.hosts)
            logger.debug(" %s: TIMEOUT", data.hosts)
            # Keep going: the other host groups still reported a usable state
            # and must not be dropped because one group timed out.
            continue
        logger.debug(" %s: %s", data.hosts, "\n".join(data.stdout))
        for key, state_list in status_states.items():
            if any(line in state_list for line in data.stdout):
                status[key].add(data.hosts)
    return status


def reset_server_storage(logger, test):
"""Reset the server storage for the hosts that ran servers in the test.
Expand Down Expand Up @@ -981,14 +891,15 @@ def collect_test_result(logger, test, test_result, job_results_dir, stop_daos, a
"depth": 1,
"timeout": 300,
}
remote_files["remote configuration files"] = {
"source": os.path.join(os.sep, "etc", "daos"),
"destination": os.path.join(job_results_dir, "latest", TEST_RESULTS_DIRS[0]),
"pattern": "daos_*.yml",
"hosts": test.host_info.all_hosts,
"depth": 1,
"timeout": 300,
}
for index, source in enumerate(test_env.config_file_directories()):
remote_files[f"remote configuration files ({index})"] = {
"source": source,
"destination": os.path.join(job_results_dir, "latest", TEST_RESULTS_DIRS[0]),
"pattern": "daos_*.yml",
"hosts": test.host_info.all_hosts,
"depth": 1,
"timeout": 300,
}
remote_files["daos log files"] = {
"source": test_env.log_dir,
"destination": os.path.join(job_results_dir, "latest", TEST_RESULTS_DIRS[1]),
Expand Down
Loading

0 comments on commit cf7322a

Please sign in to comment.