Skip to content

Commit

Permalink
stresscli/metrics: Add support to collect service metrics for Docker …
Browse files Browse the repository at this point in the history
…deployed

workload

Signed-off-by: Cathy Zhang <[email protected]>
  • Loading branch information
bjzhjing committed Sep 6, 2024
1 parent 6abbe40 commit 0e541f7
Show file tree
Hide file tree
Showing 4 changed files with 137 additions and 17 deletions.
5 changes: 3 additions & 2 deletions evals/benchmark/benchmark.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,9 @@ test_cases:
e2e:
run_test: true
service_name: "chatqna-backend-server-svc" # Replace with your service name
service_list: # Replace with your k8s service names for metrics collection,
# activate if deployment_type is k8s and collect_service_metric is true
service_list: # Replace with your k8s service names if deploying with k8s,
# or container names if deploying with Docker, for metrics collection;
# active only when collect_service_metric is true
- "chatqna-tei"
- "chatqna-teirerank"

Expand Down
55 changes: 40 additions & 15 deletions evals/benchmark/stresscli/commands/load_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,13 +66,30 @@ def locust_runtests(kubeconfig, profile):
click.echo(f"Load test results saved to {base_folder}")


def collect_metrics(collector, namespace, services, output_dir):
collector.start_collecting_data(
namespace=namespace,
services=services,
output_dir=output_dir,
restart_pods_flag=False,
)
def collect_metrics(collector, services, output_dir, namespace=None):
    """Collect metrics from the specified services into *output_dir*.

    Works with both collector flavors: the k8s ``MetricsCollector`` (which
    requires a namespace and a pod-restart flag) and the
    ``DockerMetricsCollector`` (which takes only services and output dir).

    Args:
        collector: Metrics collector object exposing ``start_collecting_data``.
        services (list): Service (k8s) or container (Docker) names to scrape.
        output_dir (str): Directory where the collected metrics are saved.
        namespace (str, optional): Kubernetes namespace. When provided, the
            collector is invoked with the k8s-specific arguments. Defaults to
            None (Docker deployment).
    """
    # Build the argument set once instead of duplicating the call in each
    # branch; only the k8s path needs the extra keyword arguments.
    kwargs = {"services": services, "output_dir": output_dir}
    if namespace:
        # Pods are never restarted between the start/end collection passes.
        kwargs.update(namespace=namespace, restart_pods_flag=False)
    collector.start_collecting_data(**kwargs)


def run_locust_test(kubeconfig, global_settings, run_settings, output_folder, index):
Expand Down Expand Up @@ -154,20 +171,28 @@ def run_locust_test(kubeconfig, global_settings, run_settings, output_folder, in
print(f"Running test: {' '.join(cmd)}")
namespace = runspec["namespace"]

if service_metric and runspec["deployment-type"] == "k8s":
from .metrics import MetricsCollector
from .metrics_util import export_metric

collector = MetricsCollector()
if service_metric:
services = global_settings.get("service-list") or []
collect_metrics(collector, namespace, services, start_output_folder)
if runspec["deployment-type"] == "k8s":
from .metrics import MetricsCollector
collector = MetricsCollector()
collect_metrics(collector, services, start_output_folder, namespace)
elif runspec["deployment-type"] == "docker":
from .metrics_docker import DockerMetricsCollector
collector = DockerMetricsCollector()
collect_metrics(collector, services, start_output_folder)

runspec["starttest_time"] = datetime.now().isoformat()
result = subprocess.run(cmd, capture_output=True, text=True)
runspec["endtest_time"] = datetime.now().isoformat()

if service_metric and runspec["deployment-type"] == "k8s":
collect_metrics(collector, namespace, services, end_output_folder)
if service_metric:
from .metrics_util import export_metric
services = global_settings.get("service-list") or []
if runspec["deployment-type"] == "k8s":
collect_metrics(collector, services, end_output_folder, namespace)
elif runspec["deployment-type"] == "docker":
collect_metrics(collector, services, end_output_folder)
export_metric(start_output_folder, end_output_folder, metrics_output_folder, metrics_output, services)

with open(json_output, "w") as json_file:
Expand Down
93 changes: 93 additions & 0 deletions evals/benchmark/stresscli/commands/metrics_docker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
import json
import docker
import requests
import logging
import os
import time

# Setup logs
# Verbosity is driven by the LOG_LEVEL environment variable (e.g. DEBUG,
# INFO, ERROR); defaults to ERROR so routine runs stay quiet.
log_level = os.getenv("LOG_LEVEL", "ERROR").upper()
logging.basicConfig(level=getattr(logging, log_level))

class DockerMetricsCollector:
    """Collect Prometheus-style metrics from locally running Docker containers.

    Each service name is resolved to a Docker container; its first published
    host port is discovered, and the container's HTTP ``/metrics`` endpoint is
    scraped and written to a timestamped text file.
    """

    # Seconds to wait for a container's /metrics endpoint before giving up;
    # without a timeout, requests.get() could block a benchmark run forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # Connect to the local Docker daemon using environment configuration
        # (DOCKER_HOST etc.).
        self.docker_client = docker.from_env()

    def get_docker_container(self, container_name):
        """Return the Docker container object for *container_name*, or None if absent."""
        try:
            container = self.docker_client.containers.get(container_name)
            logging.info(f"Found Docker container {container_name}")
            return container
        except docker.errors.NotFound:
            logging.error(f"Container {container_name} not found.")
            return None

    def get_exposed_port(self, container):
        """Return ``(host, port)`` for the container's first published port.

        Ports bound to ``0.0.0.0`` (or an empty host IP) are reported as
        ``localhost``. Returns ``(None, None)`` when no mapping exists or the
        container's network attributes are missing.
        """
        try:
            # Published-port mappings as reported by the Docker daemon.
            ports_json = container.attrs['NetworkSettings']['Ports']
            logging.debug(f"Container ports: {ports_json}")

            # NOTE: only the first host mapping of the first container port is
            # used — metrics endpoints are assumed to expose a single port.
            for container_port, host_infos in ports_json.items():
                for host_info in host_infos:
                    host_ip = host_info['HostIp']
                    host_port = host_info['HostPort']

                    # Use localhost if the port is mapped to all interfaces.
                    if host_ip in ['0.0.0.0', '']:
                        logging.debug(
                            f"Found host port {host_port} for container port {container_port} (mapped to all interfaces)"
                        )
                        return ('localhost', host_port)
                    logging.debug(
                        f"Found host port {host_port} for container port {container_port} (mapped to {host_ip})"
                    )
                    return (host_ip, host_port)

            logging.error("No valid host port found.")
            return (None, None)
        except KeyError as e:
            logging.error(f"Error retrieving ports: {e}")
            return (None, None)

    def collect_metrics(self, container_name, metrics_path="/metrics"):
        """Scrape *metrics_path* from the named container over HTTP.

        Returns the raw response text, or None when the container is missing,
        no port can be determined, or the HTTP request fails.
        """
        container = self.get_docker_container(container_name)
        if container:
            try:
                host_ip, port = self.get_exposed_port(container)  # Get the exposed port
                if not port:
                    logging.error(f"Cannot determine port for container {container_name}")
                    return None

                # Construct the URL and scrape it; bounded by REQUEST_TIMEOUT
                # so an unresponsive endpoint cannot hang the whole run.
                service_url = f"http://{host_ip}:{port}{metrics_path}"
                logging.debug(f"Collecting metrics from {service_url}")
                response = requests.get(service_url, timeout=self.REQUEST_TIMEOUT)
                response.raise_for_status()
                return response.text
            except requests.RequestException as e:
                logging.error(f"Error collecting metrics from {container_name}: {e}")
        return None

    def start_collecting_data(self, services, output_dir="/data"):
        """Collect metrics for every container in *services*.

        One ``<container>_<timestamp>.txt`` file is written per container that
        yields metrics; failures are logged and skipped (best effort).
        Returns ``{"status": "success"}`` regardless of per-container failures.
        """
        # Ensure the destination exists before writing any files.
        os.makedirs(output_dir, exist_ok=True)
        timestamp = int(time.time())
        for container_name in services:
            metrics = self.collect_metrics(container_name)
            if metrics:
                output_path = os.path.join(output_dir, f"{container_name}_{timestamp}.txt")
                logging.debug(f"Writing Docker metrics to {output_path}")
                with open(output_path, "w") as f:
                    f.write(metrics)
            else:
                logging.error(f"No metrics collected for container {container_name}")
        return {"status": "success"}

if __name__ == "__main__":
    # Ad-hoc manual smoke test: scrape metrics from a default set of
    # ChatQnA-style containers and write them under a placeholder path.
    # Requires a running Docker daemon and the listed containers.
    docker_collector = DockerMetricsCollector()
    result = docker_collector.start_collecting_data(
        services=["llm-tgi-server", "retriever-redis-server", "embedding-tei-server", "tei-embedding-server", "tgi-service", "tei-reranking-server"],
        output_dir="/path/to/data",
    )
    print(result)
1 change: 1 addition & 0 deletions evals/benchmark/stresscli/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ numpy
pytest
pyyaml
requests
docker

0 comments on commit 0e541f7

Please sign in to comment.