From 0e541f7aa9fb13b27a86763f45d960951de94440 Mon Sep 17 00:00:00 2001 From: Cathy Zhang Date: Fri, 6 Sep 2024 00:35:03 -0700 Subject: [PATCH 1/3] stresscli/metrics: Add support to collect service metrics for Docker deployed workload Signed-off-by: Cathy Zhang --- evals/benchmark/benchmark.yaml | 5 +- .../benchmark/stresscli/commands/load_test.py | 55 ++++++++--- .../stresscli/commands/metrics_docker.py | 93 +++++++++++++++++++ evals/benchmark/stresscli/requirements.txt | 1 + 4 files changed, 137 insertions(+), 17 deletions(-) create mode 100644 evals/benchmark/stresscli/commands/metrics_docker.py diff --git a/evals/benchmark/benchmark.yaml b/evals/benchmark/benchmark.yaml index bf3576ae..abde3c06 100644 --- a/evals/benchmark/benchmark.yaml +++ b/evals/benchmark/benchmark.yaml @@ -56,8 +56,9 @@ test_cases: e2e: run_test: true service_name: "chatqna-backend-server-svc" # Replace with your service name - service_list: # Replace with your k8s service names for metrics collection, - # activate if deployment_type is k8s and collect_service_metric is true + service_list: # Replace with your k8s service names if deploy with k8s + # or container names if deploy with Docker for metrics collection, + # activate if collect_service_metric is true - "chatqna-tei" - "chatqna-teirerank" diff --git a/evals/benchmark/stresscli/commands/load_test.py b/evals/benchmark/stresscli/commands/load_test.py index 8af0c989..f845490a 100644 --- a/evals/benchmark/stresscli/commands/load_test.py +++ b/evals/benchmark/stresscli/commands/load_test.py @@ -66,13 +66,30 @@ def locust_runtests(kubeconfig, profile): click.echo(f"Load test results saved to {base_folder}") -def collect_metrics(collector, namespace, services, output_dir): - collector.start_collecting_data( - namespace=namespace, - services=services, - output_dir=output_dir, - restart_pods_flag=False, - ) +def collect_metrics(collector, services, output_dir, namespace=None): + """Collect metrics from the specified services and output 
directory. + + Args: + collector: The metrics collector object. + services (list): A list of services to collect metrics from. + output_dir (str): The directory where metrics will be saved. + namespace (str, optional): The namespace for collecting metrics. Defaults to None. + """ + if namespace: + # If namespace is provided, call with namespace + collector.start_collecting_data( + namespace=namespace, + services=services, + output_dir=output_dir, + restart_pods_flag=False, + + ) + else: + # If namespace is not provided, call without namespace + collector.start_collecting_data( + services=services, + output_dir=output_dir, + ) def run_locust_test(kubeconfig, global_settings, run_settings, output_folder, index): @@ -154,20 +171,28 @@ def run_locust_test(kubeconfig, global_settings, run_settings, output_folder, in print(f"Running test: {' '.join(cmd)}") namespace = runspec["namespace"] - if service_metric and runspec["deployment-type"] == "k8s": - from .metrics import MetricsCollector - from .metrics_util import export_metric - - collector = MetricsCollector() + if service_metric: services = global_settings.get("service-list") or [] - collect_metrics(collector, namespace, services, start_output_folder) + if runspec["deployment-type"] == "k8s": + from .metrics import MetricsCollector + collector = MetricsCollector() + collect_metrics(collector, services, start_output_folder, namespace) + elif runspec["deployment-type"] == "docker": + from .metrics_docker import DockerMetricsCollector + collector = DockerMetricsCollector() + collect_metrics(collector, services, start_output_folder) runspec["starttest_time"] = datetime.now().isoformat() result = subprocess.run(cmd, capture_output=True, text=True) runspec["endtest_time"] = datetime.now().isoformat() - if service_metric and runspec["deployment-type"] == "k8s": - collect_metrics(collector, namespace, services, end_output_folder) + if service_metric: + from .metrics_util import export_metric + services = 
global_settings.get("service-list") or [] + if runspec["deployment-type"] == "k8s": + collect_metrics(collector, services, end_output_folder, namespace) + elif runspec["deployment-type"] == "docker": + collect_metrics(collector, services, end_output_folder) export_metric(start_output_folder, end_output_folder, metrics_output_folder, metrics_output, services) with open(json_output, "w") as json_file: diff --git a/evals/benchmark/stresscli/commands/metrics_docker.py b/evals/benchmark/stresscli/commands/metrics_docker.py new file mode 100644 index 00000000..fd43eb2a --- /dev/null +++ b/evals/benchmark/stresscli/commands/metrics_docker.py @@ -0,0 +1,93 @@ +import json +import docker +import requests +import logging +import os +import time + +# Setup logs +log_level = os.getenv("LOG_LEVEL", "ERROR").upper() +logging.basicConfig(level=getattr(logging, log_level)) + +class DockerMetricsCollector: + def __init__(self): + self.docker_client = docker.from_env() + + def get_docker_container(self, container_name): + """Retrieve Docker container information""" + try: + container = self.docker_client.containers.get(container_name) + logging.info(f"Found Docker container {container_name}") + return container + except docker.errors.NotFound: + logging.error(f"Container {container_name} not found.") + return None + + def get_exposed_port(self, container): + """Get the port exposed to the external environment by the Docker container""" + try: + # Retrieve ports in JSON format + ports_json = container.attrs['NetworkSettings']['Ports'] + logging.debug(f"Container ports: {ports_json}") + + # Parse the ports to find the host port + for container_port, host_infos in ports_json.items(): + for host_info in host_infos: + host_ip = host_info['HostIp'] + host_port = host_info['HostPort'] + + # Use localhost if the port is mapped to 0.0.0.0 or empty + if host_ip in ['0.0.0.0', '']: + logging.debug(f"Found host port {host_port} for container port {container_port} (mapped to all interfaces)") + 
return ('localhost', host_port) + else: + logging.debug(f"Found host port {host_port} for container port {container_port} (mapped to {host_ip})") + return (host_ip, host_port) + + logging.error("No valid host port found.") + return (None, None) + except KeyError as e: + logging.error(f"Error retrieving ports: {e}") + return (None, None) + + def collect_metrics(self, container_name, metrics_path="/metrics"): + """Collect metrics from the Docker container""" + container = self.get_docker_container(container_name) + if container: + try: + host_ip, port = self.get_exposed_port(container) # Get the exposed port + if not port: + logging.error(f"Cannot determine port for container {container_name}") + return None + + # Construct the URL + service_url = f"http://{host_ip}:{port}{metrics_path}" + logging.debug(f"Collecting metrics from {service_url}") + response = requests.get(service_url) + response.raise_for_status() + return response.text + except requests.RequestException as e: + logging.error(f"Error collecting metrics from {container_name}: {e}") + return None + + def start_collecting_data(self, services, output_dir="/data"): + """Start collecting metrics from services""" + timestamp = int(time.time()) + for container_name in services: + metrics = self.collect_metrics(container_name) + if metrics: + output_path = os.path.join(output_dir, f"{container_name}_{timestamp}.txt") + logging.debug(f"Writing Docker metrics to {output_path}") + with open(output_path, "w") as f: + f.write(metrics) + else: + logging.error(f"No metrics collected for container {container_name}") + return {"status": "success"} + +if __name__ == "__main__": + docker_collector = DockerMetricsCollector() + result = docker_collector.start_collecting_data( + services=["llm-tgi-server", "retriever-redis-server", "embedding-tei-server", "tei-embedding-server", "tgi-service", "tei-reranking-server"], + output_dir="/path/to/data", + ) + print(result) diff --git a/evals/benchmark/stresscli/requirements.txt 
b/evals/benchmark/stresscli/requirements.txt index e77a6ca5..dac9a90a 100644 --- a/evals/benchmark/stresscli/requirements.txt +++ b/evals/benchmark/stresscli/requirements.txt @@ -7,3 +7,4 @@ numpy pytest pyyaml requests +docker From 311ddf9f94edc10a36ca28f1e0e2d795a4aab02e Mon Sep 17 00:00:00 2001 From: Cathy Zhang Date: Fri, 6 Sep 2024 00:40:14 -0700 Subject: [PATCH 2/3] Utilize symbol @ as a delimiter for metric file name Utilize symbol '@' as a delimiter between the service name and pod name within the generated metric filename to ensure that a given service name does not erroneously correspond to an excessive number of unrelated files. Signed-off-by: Cathy Zhang --- evals/benchmark/stresscli/commands/metrics.py | 2 +- evals/benchmark/stresscli/commands/metrics_docker.py | 2 +- evals/benchmark/stresscli/commands/metrics_util.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/evals/benchmark/stresscli/commands/metrics.py b/evals/benchmark/stresscli/commands/metrics.py index cfe1948f..08474491 100644 --- a/evals/benchmark/stresscli/commands/metrics.py +++ b/evals/benchmark/stresscli/commands/metrics.py @@ -85,7 +85,7 @@ def start_collecting_data(self, namespace, services, output_dir="/data", restart pod_port = self.get_pod_port(pod_info) metrics = self.collect_metrics(pod_ip, pod_port, metrics_path) if metrics: - pod_output_path = os.path.join(output_dir, f"{service_name}_{pod_name}_{timestamp}.txt") + pod_output_path = os.path.join(output_dir, f"{service_name}@{pod_name}_{timestamp}.txt") logging.debug(f"Writing metrics to {pod_output_path}") with open(pod_output_path, "w") as f: f.write(metrics) diff --git a/evals/benchmark/stresscli/commands/metrics_docker.py b/evals/benchmark/stresscli/commands/metrics_docker.py index fd43eb2a..052b964d 100644 --- a/evals/benchmark/stresscli/commands/metrics_docker.py +++ b/evals/benchmark/stresscli/commands/metrics_docker.py @@ -76,7 +76,7 @@ def start_collecting_data(self, services, 
output_dir="/data"): for container_name in services: metrics = self.collect_metrics(container_name) if metrics: - output_path = os.path.join(output_dir, f"{container_name}_{timestamp}.txt") + output_path = os.path.join(output_dir, f"{container_name}@{timestamp}.txt") logging.debug(f"Writing Docker metrics to {output_path}") with open(output_path, "w") as f: f.write(metrics) diff --git a/evals/benchmark/stresscli/commands/metrics_util.py b/evals/benchmark/stresscli/commands/metrics_util.py index 1763eea5..73b17a95 100644 --- a/evals/benchmark/stresscli/commands/metrics_util.py +++ b/evals/benchmark/stresscli/commands/metrics_util.py @@ -116,8 +116,8 @@ def calculate_diff(start_dir, end_dir, output_dir, services=None): services = [services] for service_name in services: - # Create a regex pattern to match files starting with the service_name followed by a non-alphanumeric character - pattern = rf"^{re.escape(service_name)}[^a-zA-Z].*\.txt$" + # Create a regex pattern to match files starting with the service_name followed by symbol @ + pattern = rf'^{re.escape(service_name)}@.*\.txt$' start_service_files = [f for f in start_files if re.match(pattern, f)] end_service_files = [f for f in end_files if re.match(pattern, f)] From 754bfeedb5e70050485a3a83bca0fc071653386b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 6 Sep 2024 09:04:56 +0000 Subject: [PATCH 3/3] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../benchmark/stresscli/commands/load_test.py | 4 +- .../stresscli/commands/metrics_docker.py | 46 +++++++++++++------ .../stresscli/commands/metrics_util.py | 2 +- evals/benchmark/stresscli/requirements.txt | 2 +- 4 files changed, 37 insertions(+), 17 deletions(-) diff --git a/evals/benchmark/stresscli/commands/load_test.py b/evals/benchmark/stresscli/commands/load_test.py index f845490a..fba1c5ee 100644 --- 
a/evals/benchmark/stresscli/commands/load_test.py +++ b/evals/benchmark/stresscli/commands/load_test.py @@ -82,7 +82,6 @@ def collect_metrics(collector, services, output_dir, namespace=None): services=services, output_dir=output_dir, restart_pods_flag=False, - ) else: # If namespace is not provided, call without namespace @@ -175,10 +174,12 @@ def run_locust_test(kubeconfig, global_settings, run_settings, output_folder, in services = global_settings.get("service-list") or [] if runspec["deployment-type"] == "k8s": from .metrics import MetricsCollector + collector = MetricsCollector() collect_metrics(collector, services, start_output_folder, namespace) elif runspec["deployment-type"] == "docker": from .metrics_docker import DockerMetricsCollector + collector = DockerMetricsCollector() collect_metrics(collector, services, start_output_folder) @@ -188,6 +189,7 @@ def run_locust_test(kubeconfig, global_settings, run_settings, output_folder, in if service_metric: from .metrics_util import export_metric + services = global_settings.get("service-list") or [] if runspec["deployment-type"] == "k8s": collect_metrics(collector, services, end_output_folder, namespace) diff --git a/evals/benchmark/stresscli/commands/metrics_docker.py b/evals/benchmark/stresscli/commands/metrics_docker.py index 052b964d..48ddfcdc 100644 --- a/evals/benchmark/stresscli/commands/metrics_docker.py +++ b/evals/benchmark/stresscli/commands/metrics_docker.py @@ -1,20 +1,26 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + import json -import docker -import requests import logging import os import time +import requests + +import docker + # Setup logs log_level = os.getenv("LOG_LEVEL", "ERROR").upper() logging.basicConfig(level=getattr(logging, log_level)) + class DockerMetricsCollector: def __init__(self): self.docker_client = docker.from_env() def get_docker_container(self, container_name): - """Retrieve Docker container information""" + """Retrieve Docker container 
information.""" try: container = self.docker_client.containers.get(container_name) logging.info(f"Found Docker container {container_name}") @@ -24,24 +30,28 @@ def get_docker_container(self, container_name): return None def get_exposed_port(self, container): - """Get the port exposed to the external environment by the Docker container""" + """Get the port exposed to the external environment by the Docker container.""" try: # Retrieve ports in JSON format - ports_json = container.attrs['NetworkSettings']['Ports'] + ports_json = container.attrs["NetworkSettings"]["Ports"] logging.debug(f"Container ports: {ports_json}") # Parse the ports to find the host port for container_port, host_infos in ports_json.items(): for host_info in host_infos: - host_ip = host_info['HostIp'] - host_port = host_info['HostPort'] + host_ip = host_info["HostIp"] + host_port = host_info["HostPort"] # Use localhost if the port is mapped to 0.0.0.0 or empty - if host_ip in ['0.0.0.0', '']: - logging.debug(f"Found host port {host_port} for container port {container_port} (mapped to all interfaces)") - return ('localhost', host_port) + if host_ip in ["0.0.0.0", ""]: + logging.debug( + f"Found host port {host_port} for container port {container_port} (mapped to all interfaces)" + ) + return ("localhost", host_port) else: - logging.debug(f"Found host port {host_port} for container port {container_port} (mapped to {host_ip})") + logging.debug( + f"Found host port {host_port} for container port {container_port} (mapped to {host_ip})" + ) return (host_ip, host_port) logging.error("No valid host port found.") @@ -51,7 +61,7 @@ def get_exposed_port(self, container): return (None, None) def collect_metrics(self, container_name, metrics_path="/metrics"): - """Collect metrics from the Docker container""" + """Collect metrics from the Docker container.""" container = self.get_docker_container(container_name) if container: try: @@ -71,7 +81,7 @@ def collect_metrics(self, container_name, 
metrics_path="/metrics"): return None def start_collecting_data(self, services, output_dir="/data"): - """Start collecting metrics from services""" + """Start collecting metrics from services.""" timestamp = int(time.time()) for container_name in services: metrics = self.collect_metrics(container_name) @@ -84,10 +94,18 @@ def start_collecting_data(self, services, output_dir="/data"): logging.error(f"No metrics collected for container {container_name}") return {"status": "success"} + if __name__ == "__main__": docker_collector = DockerMetricsCollector() result = docker_collector.start_collecting_data( - services=["llm-tgi-server", "retriever-redis-server", "embedding-tei-server", "tei-embedding-server", "tgi-service", "tei-reranking-server"], + services=[ + "llm-tgi-server", + "retriever-redis-server", + "embedding-tei-server", + "tei-embedding-server", + "tgi-service", + "tei-reranking-server", + ], output_dir="/path/to/data", ) print(result) diff --git a/evals/benchmark/stresscli/commands/metrics_util.py b/evals/benchmark/stresscli/commands/metrics_util.py index 73b17a95..270f153e 100644 --- a/evals/benchmark/stresscli/commands/metrics_util.py +++ b/evals/benchmark/stresscli/commands/metrics_util.py @@ -117,7 +117,7 @@ def calculate_diff(start_dir, end_dir, output_dir, services=None): for service_name in services: # Create a regex pattern to match files starting with the service_name followed by symbol @ - pattern = rf'^{re.escape(service_name)}@.*\.txt$' + pattern = rf"^{re.escape(service_name)}@.*\.txt$" start_service_files = [f for f in start_files if re.match(pattern, f)] end_service_files = [f for f in end_files if re.match(pattern, f)] diff --git a/evals/benchmark/stresscli/requirements.txt b/evals/benchmark/stresscli/requirements.txt index dac9a90a..ab52c2f8 100644 --- a/evals/benchmark/stresscli/requirements.txt +++ b/evals/benchmark/stresscli/requirements.txt @@ -1,5 +1,6 @@ click deepdiff +docker flask kubernetes locust @@ -7,4 +8,3 @@ numpy pytest 
pyyaml requests -docker