Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[v1.0] Add docker metric support #113

Merged
merged 3 commits into from
Sep 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions evals/benchmark/benchmark.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,9 @@ test_cases:
e2e:
run_test: true
service_name: "chatqna-backend-server-svc" # Replace with your service name
service_list: # Replace with your k8s service names for metrics collection,
# activate if deployment_type is k8s and collect_service_metric is true
service_list: # Replace with your k8s service names if deploying with k8s,
# or your container names if deploying with Docker, for metrics collection;
# activate if collect_service_metric is true
- "chatqna-tei"
- "chatqna-teirerank"

Expand Down
57 changes: 42 additions & 15 deletions evals/benchmark/stresscli/commands/load_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,13 +66,29 @@ def locust_runtests(kubeconfig, profile):
click.echo(f"Load test results saved to {base_folder}")


def collect_metrics(collector, namespace, services, output_dir):
    """Run a one-shot metrics collection pass for the given services.

    Args:
        collector: Metrics collector exposing ``start_collecting_data``.
        namespace: Kubernetes namespace the services live in.
        services: List of service names to scrape.
        output_dir: Directory where the metric snapshots are written.
    """
    # Collect a snapshot only; never restart pods as a side effect.
    options = {
        "namespace": namespace,
        "services": services,
        "output_dir": output_dir,
        "restart_pods_flag": False,
    }
    collector.start_collecting_data(**options)
def collect_metrics(collector, services, output_dir, namespace=None):
    """Collect metrics for *services* and write them under *output_dir*.

    Args:
        collector: The metrics collector object.
        services (list): A list of services to collect metrics from.
        output_dir (str): The directory where metrics will be saved.
        namespace (str, optional): The namespace for collecting metrics.
            When given (k8s deployments), the collector is invoked with the
            namespace and pod restarts disabled. Defaults to None.
    """
    kwargs = {"services": services, "output_dir": output_dir}
    if namespace:
        # k8s-style collectors additionally need the namespace and a
        # restart flag; Docker-style collectors accept neither.
        kwargs["namespace"] = namespace
        kwargs["restart_pods_flag"] = False
    collector.start_collecting_data(**kwargs)


def run_locust_test(kubeconfig, global_settings, run_settings, output_folder, index):
Expand Down Expand Up @@ -154,20 +170,31 @@ def run_locust_test(kubeconfig, global_settings, run_settings, output_folder, in
print(f"Running test: {' '.join(cmd)}")
namespace = runspec["namespace"]

if service_metric and runspec["deployment-type"] == "k8s":
from .metrics import MetricsCollector
from .metrics_util import export_metric

collector = MetricsCollector()
if service_metric:
services = global_settings.get("service-list") or []
collect_metrics(collector, namespace, services, start_output_folder)
if runspec["deployment-type"] == "k8s":
from .metrics import MetricsCollector

collector = MetricsCollector()
collect_metrics(collector, services, start_output_folder, namespace)
elif runspec["deployment-type"] == "docker":
from .metrics_docker import DockerMetricsCollector

collector = DockerMetricsCollector()
collect_metrics(collector, services, start_output_folder)

runspec["starttest_time"] = datetime.now().isoformat()
result = subprocess.run(cmd, capture_output=True, text=True)
runspec["endtest_time"] = datetime.now().isoformat()

if service_metric and runspec["deployment-type"] == "k8s":
collect_metrics(collector, namespace, services, end_output_folder)
if service_metric:
from .metrics_util import export_metric

services = global_settings.get("service-list") or []
if runspec["deployment-type"] == "k8s":
collect_metrics(collector, services, end_output_folder, namespace)
elif runspec["deployment-type"] == "docker":
collect_metrics(collector, services, end_output_folder)
export_metric(start_output_folder, end_output_folder, metrics_output_folder, metrics_output, services)

with open(json_output, "w") as json_file:
Expand Down
2 changes: 1 addition & 1 deletion evals/benchmark/stresscli/commands/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def start_collecting_data(self, namespace, services, output_dir="/data", restart
pod_port = self.get_pod_port(pod_info)
metrics = self.collect_metrics(pod_ip, pod_port, metrics_path)
if metrics:
pod_output_path = os.path.join(output_dir, f"{service_name}_{pod_name}_{timestamp}.txt")
pod_output_path = os.path.join(output_dir, f"{service_name}@{pod_name}_{timestamp}.txt")
logging.debug(f"Writing metrics to {pod_output_path}")
with open(pod_output_path, "w") as f:
f.write(metrics)
Expand Down
111 changes: 111 additions & 0 deletions evals/benchmark/stresscli/commands/metrics_docker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import json
import logging
import os
import time

import requests

import docker

# Setup logs
log_level = os.getenv("LOG_LEVEL", "ERROR").upper()
logging.basicConfig(level=getattr(logging, log_level))


class DockerMetricsCollector:
    """Collects Prometheus-style metrics from locally running Docker containers.

    Each target container is expected to expose an HTTP ``/metrics`` endpoint
    on a port published to the host.
    """

    def __init__(self):
        # Client configured from the environment (DOCKER_HOST, etc.).
        self.docker_client = docker.from_env()

    def get_docker_container(self, container_name):
        """Return the container object for *container_name*, or None if absent."""
        try:
            container = self.docker_client.containers.get(container_name)
            logging.info(f"Found Docker container {container_name}")
            return container
        except docker.errors.NotFound:
            logging.error(f"Container {container_name} not found.")
            return None

    def get_exposed_port(self, container):
        """Return ``(host, port)`` for the first host-published port of *container*.

        Returns:
            tuple: ``(host, port)`` strings, or ``(None, None)`` when no port
            is published to the host.
        """
        try:
            # Retrieve the port mappings reported by the Docker daemon.
            ports_json = container.attrs["NetworkSettings"]["Ports"]
            logging.debug(f"Container ports: {ports_json}")

            for container_port, host_infos in ports_json.items():
                # BUG FIX: Docker maps exposed-but-unpublished ports to None
                # (not an empty list), which would raise TypeError when
                # iterated; treat None as "no host binding" and keep looking.
                for host_info in host_infos or []:
                    host_ip = host_info["HostIp"]
                    host_port = host_info["HostPort"]

                    # Wildcard bindings (0.0.0.0, IPv6 ::, or empty) are
                    # reachable via localhost.
                    if host_ip in ["0.0.0.0", "::", ""]:
                        logging.debug(
                            f"Found host port {host_port} for container port {container_port} (mapped to all interfaces)"
                        )
                        return ("localhost", host_port)
                    logging.debug(
                        f"Found host port {host_port} for container port {container_port} (mapped to {host_ip})"
                    )
                    return (host_ip, host_port)

            logging.error("No valid host port found.")
            return (None, None)
        except KeyError as e:
            logging.error(f"Error retrieving ports: {e}")
            return (None, None)

    def collect_metrics(self, container_name, metrics_path="/metrics"):
        """Fetch the metrics text from *container_name*'s published endpoint.

        Returns:
            str or None: The raw metrics payload, or None when the container,
            its published port, or the HTTP request fails.
        """
        container = self.get_docker_container(container_name)
        if container:
            try:
                host_ip, port = self.get_exposed_port(container)  # Get the exposed port
                if not port:
                    logging.error(f"Cannot determine port for container {container_name}")
                    return None

                # Construct the URL
                service_url = f"http://{host_ip}:{port}{metrics_path}"
                logging.debug(f"Collecting metrics from {service_url}")
                # Bounded timeout so a wedged endpoint cannot hang the run.
                response = requests.get(service_url, timeout=10)
                response.raise_for_status()
                return response.text
            except requests.RequestException as e:
                logging.error(f"Error collecting metrics from {container_name}: {e}")
        return None

    def start_collecting_data(self, services, output_dir="/data"):
        """Collect metrics from each container in *services* into *output_dir*.

        Files are named ``<container>@<timestamp>.txt`` — the ``@`` separator
        is what metrics_util's diff pattern matches on.

        Returns:
            dict: ``{"status": "success"}`` (failures are logged, not raised).
        """
        timestamp = int(time.time())
        for container_name in services:
            metrics = self.collect_metrics(container_name)
            if metrics:
                output_path = os.path.join(output_dir, f"{container_name}@{timestamp}.txt")
                logging.debug(f"Writing Docker metrics to {output_path}")
                with open(output_path, "w") as f:
                    f.write(metrics)
            else:
                logging.error(f"No metrics collected for container {container_name}")
        return {"status": "success"}


if __name__ == "__main__":
    # Ad-hoc smoke test: scrape the default ChatQnA service containers once
    # and dump the collector's status to stdout.
    target_containers = [
        "llm-tgi-server",
        "retriever-redis-server",
        "embedding-tei-server",
        "tei-embedding-server",
        "tgi-service",
        "tei-reranking-server",
    ]
    docker_collector = DockerMetricsCollector()
    outcome = docker_collector.start_collecting_data(
        services=target_containers,
        output_dir="/path/to/data",
    )
    print(outcome)
4 changes: 2 additions & 2 deletions evals/benchmark/stresscli/commands/metrics_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,8 @@ def calculate_diff(start_dir, end_dir, output_dir, services=None):
services = [services]

for service_name in services:
# Create a regex pattern to match files starting with the service_name followed by a non-alphanumeric character
pattern = rf"^{re.escape(service_name)}[^a-zA-Z].*\.txt$"
# Create a regex pattern to match files starting with the service_name followed by symbol @
pattern = rf"^{re.escape(service_name)}@.*\.txt$"

start_service_files = [f for f in start_files if re.match(pattern, f)]
end_service_files = [f for f in end_files if re.match(pattern, f)]
Expand Down
1 change: 1 addition & 0 deletions evals/benchmark/stresscli/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
click
deepdiff
docker
flask
kubernetes
locust
Expand Down