From 0e541f7aa9fb13b27a86763f45d960951de94440 Mon Sep 17 00:00:00 2001 From: Cathy Zhang Date: Fri, 6 Sep 2024 00:35:03 -0700 Subject: [PATCH 1/3] stresscli/metrics: Add support to collect service metrics for Docker deployed workload Signed-off-by: Cathy Zhang --- evals/benchmark/benchmark.yaml | 5 +- .../benchmark/stresscli/commands/load_test.py | 55 ++++++++--- .../stresscli/commands/metrics_docker.py | 93 +++++++++++++++++++ evals/benchmark/stresscli/requirements.txt | 1 + 4 files changed, 137 insertions(+), 17 deletions(-) create mode 100644 evals/benchmark/stresscli/commands/metrics_docker.py diff --git a/evals/benchmark/benchmark.yaml b/evals/benchmark/benchmark.yaml index bf3576ae..abde3c06 100644 --- a/evals/benchmark/benchmark.yaml +++ b/evals/benchmark/benchmark.yaml @@ -56,8 +56,9 @@ test_cases: e2e: run_test: true service_name: "chatqna-backend-server-svc" # Replace with your service name - service_list: # Replace with your k8s service names for metrics collection, - # activate if deployment_type is k8s and collect_service_metric is true + service_list: # Replace with your k8s service names if deploy with k8s + # or container names if deploy with Docker for metrics collection, + # activate if collect_service_metric is true - "chatqna-tei" - "chatqna-teirerank" diff --git a/evals/benchmark/stresscli/commands/load_test.py b/evals/benchmark/stresscli/commands/load_test.py index 8af0c989..f845490a 100644 --- a/evals/benchmark/stresscli/commands/load_test.py +++ b/evals/benchmark/stresscli/commands/load_test.py @@ -66,13 +66,30 @@ def locust_runtests(kubeconfig, profile): click.echo(f"Load test results saved to {base_folder}") -def collect_metrics(collector, namespace, services, output_dir): - collector.start_collecting_data( - namespace=namespace, - services=services, - output_dir=output_dir, - restart_pods_flag=False, - ) +def collect_metrics(collector, services, output_dir, namespace=None): + """Collect metrics from the specified services and output 
directory. + + Args: + collector: The metrics collector object. + services (list): A list of services to collect metrics from. + output_dir (str): The directory where metrics will be saved. + namespace (str, optional): The namespace for collecting metrics. Defaults to None. + """ + if namespace: + # If namespace is provided, call with namespace + collector.start_collecting_data( + namespace=namespace, + services=services, + output_dir=output_dir, + restart_pods_flag=False, + + ) + else: + # If namespace is not provided, call without namespace + collector.start_collecting_data( + services=services, + output_dir=output_dir, + ) def run_locust_test(kubeconfig, global_settings, run_settings, output_folder, index): @@ -154,20 +171,28 @@ def run_locust_test(kubeconfig, global_settings, run_settings, output_folder, in print(f"Running test: {' '.join(cmd)}") namespace = runspec["namespace"] - if service_metric and runspec["deployment-type"] == "k8s": - from .metrics import MetricsCollector - from .metrics_util import export_metric - - collector = MetricsCollector() + if service_metric: services = global_settings.get("service-list") or [] - collect_metrics(collector, namespace, services, start_output_folder) + if runspec["deployment-type"] == "k8s": + from .metrics import MetricsCollector + collector = MetricsCollector() + collect_metrics(collector, services, start_output_folder, namespace) + elif runspec["deployment-type"] == "docker": + from .metrics_docker import DockerMetricsCollector + collector = DockerMetricsCollector() + collect_metrics(collector, services, start_output_folder) runspec["starttest_time"] = datetime.now().isoformat() result = subprocess.run(cmd, capture_output=True, text=True) runspec["endtest_time"] = datetime.now().isoformat() - if service_metric and runspec["deployment-type"] == "k8s": - collect_metrics(collector, namespace, services, end_output_folder) + if service_metric: + from .metrics_util import export_metric + services = 
global_settings.get("service-list") or [] + if runspec["deployment-type"] == "k8s": + collect_metrics(collector, services, end_output_folder, namespace) + elif runspec["deployment-type"] == "docker": + collect_metrics(collector, services, end_output_folder) export_metric(start_output_folder, end_output_folder, metrics_output_folder, metrics_output, services) with open(json_output, "w") as json_file: diff --git a/evals/benchmark/stresscli/commands/metrics_docker.py b/evals/benchmark/stresscli/commands/metrics_docker.py new file mode 100644 index 00000000..fd43eb2a --- /dev/null +++ b/evals/benchmark/stresscli/commands/metrics_docker.py @@ -0,0 +1,93 @@ +import json +import docker +import requests +import logging +import os +import time + +# Setup logs +log_level = os.getenv("LOG_LEVEL", "ERROR").upper() +logging.basicConfig(level=getattr(logging, log_level)) + +class DockerMetricsCollector: + def __init__(self): + self.docker_client = docker.from_env() + + def get_docker_container(self, container_name): + """Retrieve Docker container information""" + try: + container = self.docker_client.containers.get(container_name) + logging.info(f"Found Docker container {container_name}") + return container + except docker.errors.NotFound: + logging.error(f"Container {container_name} not found.") + return None + + def get_exposed_port(self, container): + """Get the port exposed to the external environment by the Docker container""" + try: + # Retrieve ports in JSON format + ports_json = container.attrs['NetworkSettings']['Ports'] + logging.debug(f"Container ports: {ports_json}") + + # Parse the ports to find the host port + for container_port, host_infos in ports_json.items(): + for host_info in host_infos: + host_ip = host_info['HostIp'] + host_port = host_info['HostPort'] + + # Use localhost if the port is mapped to 0.0.0.0 or empty + if host_ip in ['0.0.0.0', '']: + logging.debug(f"Found host port {host_port} for container port {container_port} (mapped to all interfaces)") + 
return ('localhost', host_port) + else: + logging.debug(f"Found host port {host_port} for container port {container_port} (mapped to {host_ip})") + return (host_ip, host_port) + + logging.error("No valid host port found.") + return (None, None) + except KeyError as e: + logging.error(f"Error retrieving ports: {e}") + return (None, None) + + def collect_metrics(self, container_name, metrics_path="/metrics"): + """Collect metrics from the Docker container""" + container = self.get_docker_container(container_name) + if container: + try: + host_ip, port = self.get_exposed_port(container) # Get the exposed port + if not port: + logging.error(f"Cannot determine port for container {container_name}") + return None + + # Construct the URL + service_url = f"http://{host_ip}:{port}{metrics_path}" + logging.debug(f"Collecting metrics from {service_url}") + response = requests.get(service_url) + response.raise_for_status() + return response.text + except requests.RequestException as e: + logging.error(f"Error collecting metrics from {container_name}: {e}") + return None + + def start_collecting_data(self, services, output_dir="/data"): + """Start collecting metrics from services""" + timestamp = int(time.time()) + for container_name in services: + metrics = self.collect_metrics(container_name) + if metrics: + output_path = os.path.join(output_dir, f"{container_name}_{timestamp}.txt") + logging.debug(f"Writing Docker metrics to {output_path}") + with open(output_path, "w") as f: + f.write(metrics) + else: + logging.error(f"No metrics collected for container {container_name}") + return {"status": "success"} + +if __name__ == "__main__": + docker_collector = DockerMetricsCollector() + result = docker_collector.start_collecting_data( + services=["llm-tgi-server", "retriever-redis-server", "embedding-tei-server", "tei-embedding-server", "tgi-service", "tei-reranking-server"], + output_dir="/path/to/data", + ) + print(result) diff --git a/evals/benchmark/stresscli/requirements.txt 
b/evals/benchmark/stresscli/requirements.txt index e77a6ca5..dac9a90a 100644 --- a/evals/benchmark/stresscli/requirements.txt +++ b/evals/benchmark/stresscli/requirements.txt @@ -7,3 +7,4 @@ numpy pytest pyyaml requests +docker From 311ddf9f94edc10a36ca28f1e0e2d795a4aab02e Mon Sep 17 00:00:00 2001 From: Cathy Zhang Date: Fri, 6 Sep 2024 00:40:14 -0700 Subject: [PATCH 2/3] Utilize symbol @ as a delimiter for metric file name Utilize symbol '@' as a delimiter between the service name and pod name within the generated metric filename to ensure that a given service name does not erroneously correspond to an excessive number of unrelated files. Signed-off-by: Cathy Zhang --- evals/benchmark/stresscli/commands/metrics.py | 2 +- evals/benchmark/stresscli/commands/metrics_docker.py | 2 +- evals/benchmark/stresscli/commands/metrics_util.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/evals/benchmark/stresscli/commands/metrics.py b/evals/benchmark/stresscli/commands/metrics.py index cfe1948f..08474491 100644 --- a/evals/benchmark/stresscli/commands/metrics.py +++ b/evals/benchmark/stresscli/commands/metrics.py @@ -85,7 +85,7 @@ def start_collecting_data(self, namespace, services, output_dir="/data", restart pod_port = self.get_pod_port(pod_info) metrics = self.collect_metrics(pod_ip, pod_port, metrics_path) if metrics: - pod_output_path = os.path.join(output_dir, f"{service_name}_{pod_name}_{timestamp}.txt") + pod_output_path = os.path.join(output_dir, f"{service_name}@{pod_name}_{timestamp}.txt") logging.debug(f"Writing metrics to {pod_output_path}") with open(pod_output_path, "w") as f: f.write(metrics) diff --git a/evals/benchmark/stresscli/commands/metrics_docker.py b/evals/benchmark/stresscli/commands/metrics_docker.py index fd43eb2a..052b964d 100644 --- a/evals/benchmark/stresscli/commands/metrics_docker.py +++ b/evals/benchmark/stresscli/commands/metrics_docker.py @@ -76,7 +76,7 @@ def start_collecting_data(self, services, 
output_dir="/data"): for container_name in services: metrics = self.collect_metrics(container_name) if metrics: - output_path = os.path.join(output_dir, f"{container_name}_{timestamp}.txt") + output_path = os.path.join(output_dir, f"{container_name}@{timestamp}.txt") logging.debug(f"Writing Docker metrics to {output_path}") with open(output_path, "w") as f: f.write(metrics) diff --git a/evals/benchmark/stresscli/commands/metrics_util.py b/evals/benchmark/stresscli/commands/metrics_util.py index 1763eea5..73b17a95 100644 --- a/evals/benchmark/stresscli/commands/metrics_util.py +++ b/evals/benchmark/stresscli/commands/metrics_util.py @@ -116,8 +116,8 @@ def calculate_diff(start_dir, end_dir, output_dir, services=None): services = [services] for service_name in services: - # Create a regex pattern to match files starting with the service_name followed by a non-alphanumeric character - pattern = rf"^{re.escape(service_name)}[^a-zA-Z].*\.txt$" + # Create a regex pattern to match files starting with the service_name followed by symbol @ + pattern = rf'^{re.escape(service_name)}@.*\.txt$' start_service_files = [f for f in start_files if re.match(pattern, f)] end_service_files = [f for f in end_files if re.match(pattern, f)] From 754bfeedb5e70050485a3a83bca0fc071653386b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 6 Sep 2024 09:04:56 +0000 Subject: [PATCH 3/3] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../benchmark/stresscli/commands/load_test.py | 4 +- .../stresscli/commands/metrics_docker.py | 46 +++++++++++++------ .../stresscli/commands/metrics_util.py | 2 +- evals/benchmark/stresscli/requirements.txt | 2 +- 4 files changed, 37 insertions(+), 17 deletions(-) diff --git a/evals/benchmark/stresscli/commands/load_test.py b/evals/benchmark/stresscli/commands/load_test.py index f845490a..fba1c5ee 100644 --- 
a/evals/benchmark/stresscli/commands/load_test.py +++ b/evals/benchmark/stresscli/commands/load_test.py @@ -82,7 +82,6 @@ def collect_metrics(collector, services, output_dir, namespace=None): services=services, output_dir=output_dir, restart_pods_flag=False, - ) else: # If namespace is not provided, call without namespace @@ -175,10 +174,12 @@ def run_locust_test(kubeconfig, global_settings, run_settings, output_folder, in services = global_settings.get("service-list") or [] if runspec["deployment-type"] == "k8s": from .metrics import MetricsCollector + collector = MetricsCollector() collect_metrics(collector, services, start_output_folder, namespace) elif runspec["deployment-type"] == "docker": from .metrics_docker import DockerMetricsCollector + collector = DockerMetricsCollector() collect_metrics(collector, services, start_output_folder) @@ -188,6 +189,7 @@ def run_locust_test(kubeconfig, global_settings, run_settings, output_folder, in if service_metric: from .metrics_util import export_metric + services = global_settings.get("service-list") or [] if runspec["deployment-type"] == "k8s": collect_metrics(collector, services, end_output_folder, namespace) diff --git a/evals/benchmark/stresscli/commands/metrics_docker.py b/evals/benchmark/stresscli/commands/metrics_docker.py index 052b964d..48ddfcdc 100644 --- a/evals/benchmark/stresscli/commands/metrics_docker.py +++ b/evals/benchmark/stresscli/commands/metrics_docker.py @@ -1,20 +1,26 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + import json -import docker -import requests import logging import os import time +import requests + +import docker + # Setup logs log_level = os.getenv("LOG_LEVEL", "ERROR").upper() logging.basicConfig(level=getattr(logging, log_level)) + class DockerMetricsCollector: def __init__(self): self.docker_client = docker.from_env() def get_docker_container(self, container_name): - """Retrieve Docker container information""" + """Retrieve Docker container 
information.""" try: container = self.docker_client.containers.get(container_name) logging.info(f"Found Docker container {container_name}") @@ -24,24 +30,28 @@ def get_docker_container(self, container_name): return None def get_exposed_port(self, container): - """Get the port exposed to the external environment by the Docker container""" + """Get the port exposed to the external environment by the Docker container.""" try: # Retrieve ports in JSON format - ports_json = container.attrs['NetworkSettings']['Ports'] + ports_json = container.attrs["NetworkSettings"]["Ports"] logging.debug(f"Container ports: {ports_json}") # Parse the ports to find the host port for container_port, host_infos in ports_json.items(): for host_info in host_infos: - host_ip = host_info['HostIp'] - host_port = host_info['HostPort'] + host_ip = host_info["HostIp"] + host_port = host_info["HostPort"] # Use localhost if the port is mapped to 0.0.0.0 or empty - if host_ip in ['0.0.0.0', '']: - logging.debug(f"Found host port {host_port} for container port {container_port} (mapped to all interfaces)") - return ('localhost', host_port) + if host_ip in ["0.0.0.0", ""]: + logging.debug( + f"Found host port {host_port} for container port {container_port} (mapped to all interfaces)" + ) + return ("localhost", host_port) else: - logging.debug(f"Found host port {host_port} for container port {container_port} (mapped to {host_ip})") + logging.debug( + f"Found host port {host_port} for container port {container_port} (mapped to {host_ip})" + ) return (host_ip, host_port) logging.error("No valid host port found.") @@ -51,7 +61,7 @@ def get_exposed_port(self, container): return (None, None) def collect_metrics(self, container_name, metrics_path="/metrics"): - """Collect metrics from the Docker container""" + """Collect metrics from the Docker container.""" container = self.get_docker_container(container_name) if container: try: @@ -71,7 +81,7 @@ def collect_metrics(self, container_name, 
metrics_path="/metrics"): return None def start_collecting_data(self, services, output_dir="/data"): - """Start collecting metrics from services""" + """Start collecting metrics from services.""" timestamp = int(time.time()) for container_name in services: metrics = self.collect_metrics(container_name) @@ -84,10 +94,18 @@ def start_collecting_data(self, services, output_dir="/data"): logging.error(f"No metrics collected for container {container_name}") return {"status": "success"} + if __name__ == "__main__": docker_collector = DockerMetricsCollector() result = docker_collector.start_collecting_data( - services=["llm-tgi-server", "retriever-redis-server", "embedding-tei-server", "tei-embedding-server", "tgi-service", "tei-reranking-server"], + services=[ + "llm-tgi-server", + "retriever-redis-server", + "embedding-tei-server", + "tei-embedding-server", + "tgi-service", + "tei-reranking-server", + ], output_dir="/path/to/data", ) print(result) diff --git a/evals/benchmark/stresscli/commands/metrics_util.py b/evals/benchmark/stresscli/commands/metrics_util.py index 73b17a95..270f153e 100644 --- a/evals/benchmark/stresscli/commands/metrics_util.py +++ b/evals/benchmark/stresscli/commands/metrics_util.py @@ -117,7 +117,7 @@ def calculate_diff(start_dir, end_dir, output_dir, services=None): for service_name in services: # Create a regex pattern to match files starting with the service_name followed by symbol @ - pattern = rf'^{re.escape(service_name)}@.*\.txt$' + pattern = rf"^{re.escape(service_name)}@.*\.txt$" start_service_files = [f for f in start_files if re.match(pattern, f)] end_service_files = [f for f in end_files if re.match(pattern, f)] diff --git a/evals/benchmark/stresscli/requirements.txt b/evals/benchmark/stresscli/requirements.txt index dac9a90a..ab52c2f8 100644 --- a/evals/benchmark/stresscli/requirements.txt +++ b/evals/benchmark/stresscli/requirements.txt @@ -1,5 +1,6 @@ click deepdiff +docker flask kubernetes locust @@ -7,4 +8,3 @@ numpy pytest 
pyyaml requests -docker