Skip to content

Commit

Permalink
stresscli/metrics: Add support to collect service metrics for Docker …
Browse files Browse the repository at this point in the history
…deployed

workload

Signed-off-by: Cathy Zhang <[email protected]>
  • Loading branch information
bjzhjing committed Sep 6, 2024
1 parent 6abbe40 commit 0e541f7
Show file tree
Hide file tree
Showing 4 changed files with 137 additions and 17 deletions.
5 changes: 3 additions & 2 deletions evals/benchmark/benchmark.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,9 @@ test_cases:
e2e:
run_test: true
service_name: "chatqna-backend-server-svc" # Replace with your service name
service_list: # Replace with your k8s service names for metrics collection,
# activate if deployment_type is k8s and collect_service_metric is true
service_list: # Replace with your k8s service names if deploying with k8s,
# or container names if deploying with Docker, for metrics collection;
# active only when collect_service_metric is true
- "chatqna-tei"
- "chatqna-teirerank"

Expand Down
55 changes: 40 additions & 15 deletions evals/benchmark/stresscli/commands/load_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,13 +66,30 @@ def locust_runtests(kubeconfig, profile):
click.echo(f"Load test results saved to {base_folder}")


def collect_metrics(collector, namespace, services, output_dir):
collector.start_collecting_data(
namespace=namespace,
services=services,
output_dir=output_dir,
restart_pods_flag=False,
)
def collect_metrics(collector, services, output_dir, namespace=None):
    """Collect metrics from the specified services into *output_dir*.

    Works with both collector flavors: the k8s ``MetricsCollector`` (which
    requires a namespace and a pod-restart flag) and the
    ``DockerMetricsCollector`` (which takes only services and output dir).

    Args:
        collector: Metrics collector object exposing ``start_collecting_data``.
        services (list): Service (k8s) or container (Docker) names to scrape.
        output_dir (str): Directory where the collected metrics are saved.
        namespace (str, optional): Kubernetes namespace. When provided, the
            collector is invoked with the k8s-specific arguments. Defaults to
            None (Docker deployment).
    """
    # Build the argument set once instead of duplicating the call in each
    # branch; only the k8s path needs the extra keyword arguments.
    kwargs = {"services": services, "output_dir": output_dir}
    if namespace:
        # Pods are never restarted between the start/end collection passes.
        kwargs.update(namespace=namespace, restart_pods_flag=False)
    collector.start_collecting_data(**kwargs)


def run_locust_test(kubeconfig, global_settings, run_settings, output_folder, index):
Expand Down Expand Up @@ -154,20 +171,28 @@ def run_locust_test(kubeconfig, global_settings, run_settings, output_folder, in
print(f"Running test: {' '.join(cmd)}")
namespace = runspec["namespace"]

if service_metric and runspec["deployment-type"] == "k8s":
from .metrics import MetricsCollector
from .metrics_util import export_metric

collector = MetricsCollector()
if service_metric:
services = global_settings.get("service-list") or []
collect_metrics(collector, namespace, services, start_output_folder)
if runspec["deployment-type"] == "k8s":
from .metrics import MetricsCollector
collector = MetricsCollector()
collect_metrics(collector, services, start_output_folder, namespace)
elif runspec["deployment-type"] == "docker":
from .metrics_docker import DockerMetricsCollector
collector = DockerMetricsCollector()
collect_metrics(collector, services, start_output_folder)

runspec["starttest_time"] = datetime.now().isoformat()
result = subprocess.run(cmd, capture_output=True, text=True)
runspec["endtest_time"] = datetime.now().isoformat()

if service_metric and runspec["deployment-type"] == "k8s":
collect_metrics(collector, namespace, services, end_output_folder)
if service_metric:
from .metrics_util import export_metric
services = global_settings.get("service-list") or []
if runspec["deployment-type"] == "k8s":
collect_metrics(collector, services, end_output_folder, namespace)
elif runspec["deployment-type"] == "docker":
collect_metrics(collector, services, end_output_folder)
export_metric(start_output_folder, end_output_folder, metrics_output_folder, metrics_output, services)

with open(json_output, "w") as json_file:
Expand Down
93 changes: 93 additions & 0 deletions evals/benchmark/stresscli/commands/metrics_docker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
import json
import docker
import requests
import logging
import os
import time

# Setup logs
# Verbosity is driven by the LOG_LEVEL environment variable (e.g. DEBUG,
# INFO, ERROR); defaults to ERROR so routine runs stay quiet.
log_level = os.getenv("LOG_LEVEL", "ERROR").upper()
logging.basicConfig(level=getattr(logging, log_level))

class DockerMetricsCollector:
    """Collect Prometheus-style metrics from locally running Docker containers.

    Each service name is resolved to a Docker container; its first published
    host port is discovered, and the container's HTTP ``/metrics`` endpoint is
    scraped and written to a timestamped text file.
    """

    # Seconds to wait for a container's /metrics endpoint before giving up;
    # without a timeout, requests.get() could block a benchmark run forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # Connect to the local Docker daemon using environment configuration
        # (DOCKER_HOST etc.).
        self.docker_client = docker.from_env()

    def get_docker_container(self, container_name):
        """Return the Docker container object for *container_name*, or None if absent."""
        try:
            container = self.docker_client.containers.get(container_name)
            logging.info(f"Found Docker container {container_name}")
            return container
        except docker.errors.NotFound:
            logging.error(f"Container {container_name} not found.")
            return None

    def get_exposed_port(self, container):
        """Return ``(host, port)`` for the container's first published port.

        Ports bound to ``0.0.0.0`` (or an empty host IP) are reported as
        ``localhost``. Returns ``(None, None)`` when no mapping exists or the
        container's network attributes are missing.
        """
        try:
            # Published-port mappings as reported by the Docker daemon.
            ports_json = container.attrs['NetworkSettings']['Ports']
            logging.debug(f"Container ports: {ports_json}")

            # NOTE: only the first host mapping of the first container port is
            # used — metrics endpoints are assumed to expose a single port.
            for container_port, host_infos in ports_json.items():
                for host_info in host_infos:
                    host_ip = host_info['HostIp']
                    host_port = host_info['HostPort']

                    # Use localhost if the port is mapped to all interfaces.
                    if host_ip in ['0.0.0.0', '']:
                        logging.debug(
                            f"Found host port {host_port} for container port {container_port} (mapped to all interfaces)"
                        )
                        return ('localhost', host_port)
                    logging.debug(
                        f"Found host port {host_port} for container port {container_port} (mapped to {host_ip})"
                    )
                    return (host_ip, host_port)

            logging.error("No valid host port found.")
            return (None, None)
        except KeyError as e:
            logging.error(f"Error retrieving ports: {e}")
            return (None, None)

    def collect_metrics(self, container_name, metrics_path="/metrics"):
        """Scrape *metrics_path* from the named container over HTTP.

        Returns the raw response text, or None when the container is missing,
        no port can be determined, or the HTTP request fails.
        """
        container = self.get_docker_container(container_name)
        if container:
            try:
                host_ip, port = self.get_exposed_port(container)  # Get the exposed port
                if not port:
                    logging.error(f"Cannot determine port for container {container_name}")
                    return None

                # Construct the URL and scrape it; bounded by REQUEST_TIMEOUT
                # so an unresponsive endpoint cannot hang the whole run.
                service_url = f"http://{host_ip}:{port}{metrics_path}"
                logging.debug(f"Collecting metrics from {service_url}")
                response = requests.get(service_url, timeout=self.REQUEST_TIMEOUT)
                response.raise_for_status()
                return response.text
            except requests.RequestException as e:
                logging.error(f"Error collecting metrics from {container_name}: {e}")
        return None

    def start_collecting_data(self, services, output_dir="/data"):
        """Collect metrics for every container in *services*.

        One ``<container>_<timestamp>.txt`` file is written per container that
        yields metrics; failures are logged and skipped (best effort).
        Returns ``{"status": "success"}`` regardless of per-container failures.
        """
        # Ensure the destination exists before writing any files.
        os.makedirs(output_dir, exist_ok=True)
        timestamp = int(time.time())
        for container_name in services:
            metrics = self.collect_metrics(container_name)
            if metrics:
                output_path = os.path.join(output_dir, f"{container_name}_{timestamp}.txt")
                logging.debug(f"Writing Docker metrics to {output_path}")
                with open(output_path, "w") as f:
                    f.write(metrics)
            else:
                logging.error(f"No metrics collected for container {container_name}")
        return {"status": "success"}

if __name__ == "__main__":
    # Ad-hoc manual smoke test: scrape metrics from a default set of
    # ChatQnA-style containers and write them under a placeholder path.
    # Requires a running Docker daemon and the listed containers.
    docker_collector = DockerMetricsCollector()
    result = docker_collector.start_collecting_data(
        services=["llm-tgi-server", "retriever-redis-server", "embedding-tei-server", "tei-embedding-server", "tgi-service", "tei-reranking-server"],
        output_dir="/path/to/data",
    )
    print(result)
1 change: 1 addition & 0 deletions evals/benchmark/stresscli/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ numpy
pytest
pyyaml
requests
docker

0 comments on commit 0e541f7

Please sign in to comment.