diff --git a/docs/my-website/docs/proxy/prometheus.md b/docs/my-website/docs/proxy/prometheus.md
index ca5152779195..91bf72e12e5c 100644
--- a/docs/my-website/docs/proxy/prometheus.md
+++ b/docs/my-website/docs/proxy/prometheus.md
@@ -64,9 +64,9 @@ Use this for tracking per [user, key, team, etc.](virtual_keys)
 | Metric Name          | Description                          |
 |----------------------|--------------------------------------|
 | `litellm_spend_metric` | Total Spend, per `"user", "key", "model", "team", "end-user"` |
-| `litellm_total_tokens` | input + output tokens per `"user", "key", "model", "team", "end-user"` |
-| `litellm_input_tokens` | input tokens per `"user", "key", "model", "team", "end-user"` |
-| `litellm_output_tokens` | output tokens per `"user", "key", "model", "team", "end-user"` |
+| `litellm_total_tokens` | input + output tokens per `"end_user", "hashed_api_key", "api_key_alias", "requested_model", "team", "team_alias", "user", "model"` |
+| `litellm_input_tokens` | input tokens per `"end_user", "hashed_api_key", "api_key_alias", "requested_model", "team", "team_alias", "user", "model"` |
+| `litellm_output_tokens` | output tokens per `"end_user", "hashed_api_key", "api_key_alias", "requested_model", "team", "team_alias", "user", "model"` |
 
 ## Proxy Level Tracking Metrics
 
@@ -134,8 +134,8 @@ Use this for LLM API Error monitoring and tracking remaining rate limits and tok
 | Metric Name          | Description                          |
 |----------------------|--------------------------------------|
-| `litellm_request_total_latency_metric` | Total latency (seconds) for a request to LiteLLM Proxy Server - tracked for labels `model`, `hashed_api_key`, `api_key_alias`, `team`, `team_alias` |
-| `litellm_llm_api_latency_metric` | Latency (seconds) for just the LLM API call - tracked for labels `model`, `hashed_api_key`, `api_key_alias`, `team`, `team_alias` |
+| `litellm_request_total_latency_metric` | Total latency (seconds) for a request to LiteLLM Proxy Server - tracked for labels `"end_user", "hashed_api_key", "api_key_alias", "requested_model", "team", "team_alias", "user", "model"` |
+| `litellm_llm_api_latency_metric` | Latency (seconds) for just the LLM API call - tracked for labels `"model", "hashed_api_key", "api_key_alias", "team", "team_alias", "requested_model", "end_user", "user"` |
 | `litellm_llm_api_time_to_first_token_metric` | Time to first token for LLM API call - tracked for labels `model`, `hashed_api_key`, `api_key_alias`, `team`, `team_alias` [Note: only emitted for streaming requests] |
 
 ## Virtual Key - Budget, Rate Limit Metrics
 
@@ -149,6 +149,55 @@ Metrics used to track LiteLLM Proxy Budgeting and Rate limiting logic
 | `litellm_remaining_api_key_requests_for_model` | Remaining Requests for a LiteLLM virtual API key, only if a model-specific rate limit (rpm) has been set for that virtual key. Labels: `"hashed_api_key", "api_key_alias", "model"`|
 | `litellm_remaining_api_key_tokens_for_model` | Remaining Tokens for a LiteLLM virtual API key, only if a model-specific token limit (tpm) has been set for that virtual key. Labels: `"hashed_api_key", "api_key_alias", "model"`|
 
+## [BETA] Custom Metrics
+
+Track custom metrics on Prometheus for all the events mentioned above.
+
+1. Define the custom metadata labels in the `config.yaml`
+
+```yaml
+model_list:
+  - model_name: openai/gpt-3.5-turbo
+    litellm_params:
+      model: openai/gpt-3.5-turbo
+      api_key: os.environ/OPENAI_API_KEY
+
+litellm_settings:
+  callbacks: ["prometheus"]
+  custom_prometheus_metadata_labels: ["metadata.foo", "metadata.bar"]
+```
+
+2.
Make a request with the custom metadata labels + +```bash +curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \ +-H 'Content-Type: application/json' \ +-H 'Authorization: Bearer ' \ +-d '{ + "model": "openai/gpt-3.5-turbo", + "messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What'\''s in this image?" + } + ] + } + ], + "max_tokens": 300, + "metadata": { + "foo": "hello world" + } +}' +``` + +3. Check your `/metrics` endpoint for the custom metrics + +``` +... "tag": "hello world" ... +``` ## Monitor System Health @@ -170,6 +219,7 @@ litellm_settings: | `litellm_redis_fails` | Number of failed redis calls | | `litellm_self_latency` | Histogram latency for successful litellm api call | + ## **🔥 LiteLLM Maintained Grafana Dashboards ** Link to Grafana Dashboards maintained by LiteLLM @@ -194,6 +244,7 @@ Here is a screenshot of the metrics you can monitor with the LiteLLM Grafana Das | `litellm_requests_metric` | **deprecated** use `litellm_proxy_total_requests_metric` | + ## FAQ ### What are `_created` vs. `_total` metrics? diff --git a/litellm/__init__.py b/litellm/__init__.py index 1e2c1c4e75c1..942e2948a956 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -308,6 +308,7 @@ max_end_user_budget: Optional[float] = None disable_end_user_cost_tracking: Optional[bool] = None disable_end_user_cost_tracking_prometheus_only: Optional[bool] = None +custom_prometheus_metadata_labels: Optional[List[str]] = None #### REQUEST PRIORITIZATION #### priority_reservation: Optional[Dict[str, float]] = None #### RELIABILITY #### diff --git a/litellm/integrations/prometheus.py b/litellm/integrations/prometheus.py index 89a9b48137c5..2629f1baf0f2 100644 --- a/litellm/integrations/prometheus.py +++ b/litellm/integrations/prometheus.py @@ -5,6 +5,7 @@ from datetime import datetime, timedelta from typing import List, Optional, cast +import litellm from litellm._logging import print_verbose, verbose_logger from litellm.integrations.custom_logger import CustomLogger from litellm.proxy._types import UserAPIKeyAuth @@ -139,15 +140,7 @@ def __init__( self.litellm_input_tokens_metric = Counter( "litellm_input_tokens", "Total number of input tokens from LLM requests", - labelnames=[ - "end_user", - "hashed_api_key", - "api_key_alias", - "model", - "team", - "team_alias", - "user", - ], + labelnames=PrometheusMetricLabels.litellm_input_tokens_metric.value, ) # Counter for input tokens by tag @@ -162,15 +155,7 @@ def __init__( self.litellm_output_tokens_metric = Counter( "litellm_output_tokens", "Total number of output tokens from LLM requests", - labelnames=[ - "end_user", - "hashed_api_key", - "api_key_alias", - "model", - "team", - "team_alias", - "user", - ], + labelnames=PrometheusMetricLabels.litellm_output_tokens_metric.value, ) # Counter for output tokens by tag @@ -408,6 +393,17 @@ async def async_log_success_event(self, kwargs, response_obj, start_time, end_ti output_tokens = standard_logging_payload["completion_tokens"] tokens_used = standard_logging_payload["total_tokens"] response_cost = standard_logging_payload["response_cost"] + _requester_metadata = standard_logging_payload["metadata"].get( + "requester_metadata" + ) + if standard_logging_payload is not None and isinstance( + standard_logging_payload, dict + ): + _tags = get_tags_from_standard_logging_payload( + cast(StandardLoggingPayload, standard_logging_payload) + ) + else: + _tags = [] print_verbose( f"inside track_prometheus_metrics, model {model}, response_cost {response_cost}, tokens_used 
{tokens_used}, end_user_id {end_user_id}, user_api_key {user_api_key}" @@ -417,11 +413,19 @@ async def async_log_success_event(self, kwargs, response_obj, start_time, end_ti end_user=end_user_id, hashed_api_key=user_api_key, api_key_alias=user_api_key_alias, - requested_model=model, + requested_model=standard_logging_payload["model_group"], team=user_api_team, team_alias=user_api_team_alias, user=user_id, status_code="200", + model=model, + litellm_model_name=model, + tags=_tags, + model_id=standard_logging_payload["model_id"], + api_base=standard_logging_payload["api_base"], + api_provider=standard_logging_payload["custom_llm_provider"], + exception_status=None, + exception_class=None, ) if ( @@ -459,6 +463,7 @@ async def async_log_success_event(self, kwargs, response_obj, start_time, end_ti user_api_team=user_api_team, user_api_team_alias=user_api_team_alias, user_id=user_id, + enum_values=enum_values, ) # remaining budget metrics @@ -490,6 +495,7 @@ async def async_log_success_event(self, kwargs, response_obj, start_time, end_ti # 1. We just checked if isinstance(standard_logging_payload, dict). Pyright complains. # 2. Pyright does not allow us to run isinstance(standard_logging_payload, StandardLoggingPayload) <- this would be ideal standard_logging_payload=standard_logging_payload, # type: ignore + enum_values=enum_values, ) # set x-ratelimit headers @@ -524,6 +530,7 @@ def _increment_token_metrics( user_api_team: Optional[str], user_api_team_alias: Optional[str], user_id: Optional[str], + enum_values: UserAPIKeyLabelValues, ): # token metrics self.litellm_tokens_metric.labels( @@ -536,23 +543,24 @@ def _increment_token_metrics( user_id, ).inc(standard_logging_payload["total_tokens"]) - _tags = standard_logging_payload["request_tags"] - for tag in _tags: - self.litellm_tokens_by_tag_metric.labels( - **{ - UserAPIKeyLabelNames.TAG.value: tag, - } - ).inc(standard_logging_payload["total_tokens"]) + if standard_logging_payload is not None and isinstance( + standard_logging_payload, dict + ): + _tags = get_tags_from_standard_logging_payload(standard_logging_payload) + for tag in _tags: + self.litellm_tokens_by_tag_metric.labels( + **{ + UserAPIKeyLabelNames.TAG.value: tag, + } + ).inc(standard_logging_payload["total_tokens"]) - self.litellm_input_tokens_metric.labels( - end_user_id, - user_api_key, - user_api_key_alias, - model, - user_api_team, - user_api_team_alias, - user_id, - ).inc(standard_logging_payload["prompt_tokens"]) + _labels = prometheus_label_factory( + supported_enum_labels=PrometheusMetricLabels.litellm_input_tokens_metric.value, + enum_values=enum_values, + ) + self.litellm_input_tokens_metric.labels(**_labels).inc( + standard_logging_payload["prompt_tokens"] + ) for tag in _tags: self.litellm_input_tokens_by_tag_metric.labels( @@ -561,15 +569,14 @@ def _increment_token_metrics( } ).inc(standard_logging_payload["prompt_tokens"]) - self.litellm_output_tokens_metric.labels( - end_user_id, - user_api_key, - user_api_key_alias, - model, - user_api_team, - user_api_team_alias, - user_id, - ).inc(standard_logging_payload["completion_tokens"]) + _labels = prometheus_label_factory( + supported_enum_labels=PrometheusMetricLabels.litellm_output_tokens_metric.value, + enum_values=enum_values, + ) + + self.litellm_output_tokens_metric.labels(**_labels).inc( + standard_logging_payload["completion_tokens"] + ) for tag in _tags: self.litellm_output_tokens_by_tag_metric.labels( @@ -685,6 +692,7 @@ def _set_latency_metrics( user_api_team: Optional[str], user_api_team_alias: 
Optional[str], standard_logging_payload: StandardLoggingPayload, + enum_values: UserAPIKeyLabelValues, ): # latency metrics model_parameters: dict = standard_logging_payload["model_parameters"] @@ -694,24 +702,6 @@ def _set_latency_metrics( completion_start_time = kwargs.get("completion_start_time", None) - enum_values = UserAPIKeyLabelValues( - end_user=standard_logging_payload["metadata"]["user_api_key_end_user_id"], - user=standard_logging_payload["metadata"]["user_api_key_user_id"], - hashed_api_key=user_api_key, - api_key_alias=user_api_key_alias, - team=user_api_team, - team_alias=user_api_team_alias, - requested_model=standard_logging_payload["model_group"], - model=model, - litellm_model_name=standard_logging_payload["model_group"], - tags=standard_logging_payload["request_tags"], - model_id=standard_logging_payload["model_id"], - api_base=standard_logging_payload["api_base"], - api_provider=standard_logging_payload["custom_llm_provider"], - exception_status=None, - exception_class=None, - ) - if ( completion_start_time is not None and isinstance(completion_start_time, datetime) @@ -841,6 +831,12 @@ async def async_post_call_failure_hook( """ try: _tags = cast(List[str], request_data.get("tags") or []) + request_metadata = request_data.get("metadata", {}) + metadata_tags: Optional[List[str]] = None + if request_metadata is not None and isinstance(request_metadata, dict): + metadata_tags = get_tag_from_metadata(metadata=request_metadata) + if metadata_tags is not None: + _tags.extend(metadata_tags) enum_values = UserAPIKeyLabelValues( end_user=user_api_key_dict.end_user_id, user=user_api_key_dict.user_id, @@ -980,22 +976,27 @@ def set_llm_deployment_failure_metrics(self, request_kwargs: dict): ).inc() # tag based tracking - _tags = standard_logging_payload["request_tags"] - for tag in _tags: - self.litellm_deployment_failure_by_tag_responses.labels( - **{ - UserAPIKeyLabelNames.REQUESTED_MODEL.value: model_group, - UserAPIKeyLabelNames.TAG.value: tag, - UserAPIKeyLabelNames.v2_LITELLM_MODEL_NAME.value: litellm_model_name, - UserAPIKeyLabelNames.MODEL_ID.value: model_id, - UserAPIKeyLabelNames.API_BASE.value: api_base, - UserAPIKeyLabelNames.API_PROVIDER.value: llm_provider, - UserAPIKeyLabelNames.EXCEPTION_CLASS.value: exception.__class__.__name__, - UserAPIKeyLabelNames.EXCEPTION_STATUS.value: str( - getattr(exception, "status_code", None) - ), - } - ).inc() + if standard_logging_payload is not None and isinstance( + standard_logging_payload, dict + ): + _tags = get_tags_from_standard_logging_payload( + cast(StandardLoggingPayload, standard_logging_payload) + ) + for tag in _tags: + self.litellm_deployment_failure_by_tag_responses.labels( + **{ + UserAPIKeyLabelNames.REQUESTED_MODEL.value: model_group, + UserAPIKeyLabelNames.TAG.value: tag, + UserAPIKeyLabelNames.v2_LITELLM_MODEL_NAME.value: litellm_model_name, + UserAPIKeyLabelNames.MODEL_ID.value: model_id, + UserAPIKeyLabelNames.API_BASE.value: api_base, + UserAPIKeyLabelNames.API_PROVIDER.value: llm_provider, + UserAPIKeyLabelNames.EXCEPTION_CLASS.value: exception.__class__.__name__, + UserAPIKeyLabelNames.EXCEPTION_STATUS.value: str( + getattr(exception, "status_code", None) + ), + } + ).inc() self.litellm_deployment_total_requests.labels( litellm_model_name=litellm_model_name, @@ -1201,6 +1202,7 @@ async def log_success_fallback_event( ) _new_model = kwargs.get("model") _tags = cast(List[str], kwargs.get("tags") or []) + enum_values = UserAPIKeyLabelValues( requested_model=original_model_group, fallback_model=_new_model, @@ 
-1392,3 +1394,45 @@ def prometheus_label_factory( ) return filtered_labels + + +def get_tags_from_standard_logging_payload( + standard_logging_payload: StandardLoggingPayload, +) -> List[str]: + _tags = standard_logging_payload["request_tags"] + _requester_metadata = standard_logging_payload["metadata"].get("requester_metadata") + metadata_tags: Optional[List[str]] = None + if _requester_metadata is not None: + metadata_tags = get_tag_from_metadata(metadata=_requester_metadata) + if metadata_tags is not None: + _tags.extend(metadata_tags) + + return _tags + + +def get_tag_from_metadata(metadata: dict) -> Optional[List[str]]: + """ + Get tag from metadata + """ + keys = litellm.custom_prometheus_metadata_labels + if keys is None or len(keys) == 0: + return None + + result: List[str] = [] + + for key in keys: + # Split the dot notation key into parts + key = key.replace("metadata.", "", 1) if key.startswith("metadata.") else key + + keys_parts = key.split(".") + # Traverse through the dictionary using the parts + value = metadata + for part in keys_parts: + value = value.get(part, None) # Get the value, return None if not found + if value is None: + break + + if value is not None and isinstance(value, str): + result.append(value) + + return result diff --git a/litellm/types/integrations/prometheus.py b/litellm/types/integrations/prometheus.py index 395d9303701b..6dc141a78bcc 100644 --- a/litellm/types/integrations/prometheus.py +++ b/litellm/types/integrations/prometheus.py @@ -164,6 +164,28 @@ class PrometheusMetricLabels(Enum): UserAPIKeyLabelNames.TAG.value, ] + litellm_input_tokens_metric = [ + UserAPIKeyLabelNames.END_USER.value, + UserAPIKeyLabelNames.API_KEY_HASH.value, + UserAPIKeyLabelNames.API_KEY_ALIAS.value, + UserAPIKeyLabelNames.v1_LITELLM_MODEL_NAME.value, + UserAPIKeyLabelNames.TEAM.value, + UserAPIKeyLabelNames.TEAM_ALIAS.value, + UserAPIKeyLabelNames.USER.value, + UserAPIKeyLabelNames.REQUESTED_MODEL.value, + ] + + litellm_output_tokens_metric = [ + UserAPIKeyLabelNames.END_USER.value, + UserAPIKeyLabelNames.API_KEY_HASH.value, + UserAPIKeyLabelNames.API_KEY_ALIAS.value, + UserAPIKeyLabelNames.v1_LITELLM_MODEL_NAME.value, + UserAPIKeyLabelNames.TEAM.value, + UserAPIKeyLabelNames.TEAM_ALIAS.value, + UserAPIKeyLabelNames.USER.value, + UserAPIKeyLabelNames.REQUESTED_MODEL.value, + ] + litellm_deployment_successful_fallbacks = [ UserAPIKeyLabelNames.REQUESTED_MODEL.value, UserAPIKeyLabelNames.FALLBACK_MODEL.value, diff --git a/tests/logging_callback_tests/test_prometheus_unit_tests.py b/tests/logging_callback_tests/test_prometheus_unit_tests.py index 407015067e76..bfd1b8d13c2f 100644 --- a/tests/logging_callback_tests/test_prometheus_unit_tests.py +++ b/tests/logging_callback_tests/test_prometheus_unit_tests.py @@ -14,7 +14,11 @@ import litellm from litellm import completion from litellm._logging import verbose_logger -from litellm.integrations.prometheus import PrometheusLogger, UserAPIKeyLabelValues +from litellm.integrations.prometheus import ( + PrometheusLogger, + UserAPIKeyLabelValues, + get_tag_from_metadata, +) from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler from litellm.types.utils import ( StandardLoggingPayload, @@ -190,6 +194,16 @@ def test_increment_token_metrics(prometheus_logger): standard_logging_payload["prompt_tokens"] = 50 standard_logging_payload["completion_tokens"] = 50 + enum_values = UserAPIKeyLabelValues( + litellm_model_name=standard_logging_payload["model"], + api_provider=standard_logging_payload["custom_llm_provider"], + 
hashed_api_key=standard_logging_payload["metadata"]["user_api_key_hash"], + api_key_alias=standard_logging_payload["metadata"]["user_api_key_alias"], + team=standard_logging_payload["metadata"]["user_api_key_team_id"], + team_alias=standard_logging_payload["metadata"]["user_api_key_team_alias"], + **standard_logging_payload, + ) + prometheus_logger._increment_token_metrics( standard_logging_payload, end_user_id="user1", @@ -199,6 +213,7 @@ def test_increment_token_metrics(prometheus_logger): user_api_team="team1", user_api_team_alias="team_alias1", user_id="user1", + enum_values=enum_values, ) prometheus_logger.litellm_tokens_metric.labels.assert_called_once_with( @@ -207,14 +222,28 @@ def test_increment_token_metrics(prometheus_logger): prometheus_logger.litellm_tokens_metric.labels().inc.assert_called_once_with(100) prometheus_logger.litellm_input_tokens_metric.labels.assert_called_once_with( - "user1", "key1", "alias1", "gpt-3.5-turbo", "team1", "team_alias1", "user1" + end_user=None, + user=None, + hashed_api_key="test_hash", + api_key_alias="test_alias", + team="test_team", + team_alias="test_team_alias", + requested_model=None, + model="gpt-3.5-turbo", ) prometheus_logger.litellm_input_tokens_metric.labels().inc.assert_called_once_with( 50 ) prometheus_logger.litellm_output_tokens_metric.labels.assert_called_once_with( - "user1", "key1", "alias1", "gpt-3.5-turbo", "team1", "team_alias1", "user1" + end_user=None, + user=None, + hashed_api_key="test_hash", + api_key_alias="test_alias", + team="test_team", + team_alias="test_team_alias", + requested_model=None, + model="gpt-3.5-turbo", ) prometheus_logger.litellm_output_tokens_metric.labels().inc.assert_called_once_with( 50 @@ -274,6 +303,18 @@ def test_set_latency_metrics(prometheus_logger): prometheus_logger.litellm_llm_api_latency_metric = MagicMock() prometheus_logger.litellm_request_total_latency_metric = MagicMock() + enum_values = UserAPIKeyLabelValues( + litellm_model_name=standard_logging_payload["model"], + api_provider=standard_logging_payload["custom_llm_provider"], + hashed_api_key=standard_logging_payload["metadata"]["user_api_key_hash"], + api_key_alias=standard_logging_payload["metadata"]["user_api_key_alias"], + team=standard_logging_payload["metadata"]["user_api_key_team_id"], + team_alias=standard_logging_payload["metadata"]["user_api_key_team_alias"], + requested_model=standard_logging_payload["model_group"], + user=standard_logging_payload["metadata"]["user_api_key_user_id"], + **standard_logging_payload, + ) + now = datetime.now() kwargs = { "end_time": now, # when the request ends @@ -291,6 +332,7 @@ def test_set_latency_metrics(prometheus_logger): user_api_team="team1", user_api_team_alias="team_alias1", standard_logging_payload=standard_logging_payload, + enum_values=enum_values, ) # completion_start_time - api_call_start_time @@ -303,14 +345,14 @@ def test_set_latency_metrics(prometheus_logger): # end_time - api_call_start_time prometheus_logger.litellm_llm_api_latency_metric.labels.assert_called_once_with( - model="gpt-3.5-turbo", - hashed_api_key="key1", - api_key_alias="alias1", - team="team1", - team_alias="team_alias1", + end_user=None, user="test_user", - end_user="test_end_user", + hashed_api_key="test_hash", + api_key_alias="test_alias", + team="test_team", + team_alias="test_team_alias", requested_model="openai-gpt", + model="gpt-3.5-turbo", ) prometheus_logger.litellm_llm_api_latency_metric.labels().observe.assert_called_once_with( 1.5 @@ -318,13 +360,13 @@ def 
test_set_latency_metrics(prometheus_logger): # total latency for the request prometheus_logger.litellm_request_total_latency_metric.labels.assert_called_once_with( - end_user="test_end_user", - hashed_api_key="key1", - api_key_alias="alias1", - requested_model="openai-gpt", - team="team1", - team_alias="team_alias1", + end_user=None, user="test_user", + hashed_api_key="test_hash", + api_key_alias="test_alias", + team="test_team", + team_alias="test_team_alias", + requested_model="openai-gpt", model="gpt-3.5-turbo", ) prometheus_logger.litellm_request_total_latency_metric.labels().observe.assert_called_once_with( @@ -849,3 +891,11 @@ def test_prometheus_factory(monkeypatch, disable_end_user_tracking): assert returned_dict["end_user"] == None else: assert returned_dict["end_user"] == "test_end_user" + + +def test_get_tag_from_metadata(monkeypatch): + monkeypatch.setattr( + "litellm.custom_prometheus_metadata_labels", ["metadata.foo", "metadata.bar"] + ) + metadata = {"foo": "bar", "bar": "baz", "taz": "qux"} + assert get_tag_from_metadata(metadata) == ["bar", "baz"]
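
To illustrate how the new `custom_prometheus_metadata_labels` setting feeds the `tag` label, here is a minimal sketch based on the `get_tag_from_metadata` helper added in this diff (assumes a LiteLLM build that includes this change; the nested `metadata.bar.baz` key and the sample values are illustrative only, not taken from the patch):

```python
import litellm
from litellm.integrations.prometheus import get_tag_from_metadata

# Mirror the config.yaml setting shown in the docs section above
# (hypothetical nested key "metadata.bar.baz" added for illustration).
litellm.custom_prometheus_metadata_labels = ["metadata.foo", "metadata.bar.baz"]

# Request metadata as it might arrive on /chat/completions.
metadata = {"foo": "hello world", "bar": {"baz": "nested value"}, "ignored": "x"}

# Dot-notation keys are resolved against the metadata dict; any string values
# found are returned and later emitted as extra `tag` label values on the
# token, spend, and latency metrics.
print(get_tag_from_metadata(metadata))  # ['hello world', 'nested value']
```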