diff --git a/l2mac/config.py b/l2mac/config.py
index 8d161ceb..169ae599 100644
--- a/l2mac/config.py
+++ b/l2mac/config.py
@@ -1,5 +1,6 @@
 from enum import Enum
 from pathlib import Path
+from typing import Optional

 import yaml
 from pydantic import BaseModel, ValidationError
@@ -14,11 +15,17 @@ class OpenAIRateLimitTier(str, Enum):
     tier5 = "tier5"


+class ApiType(str, Enum):
+    openai = "openai"
+    azure = "azure"
+
+
 class LLMCoreConfig(BaseModel):
-    api_type: str = "openai"
+    api_type: ApiType = ApiType.openai
     model: str = "gpt-4-1106-preview"
-    base_url: str = "https://api.openai.com/v1"
+    base_url: Optional[str] = "https://api.openai.com/v1"
     api_key: str
+    api_version: Optional[str] = None


 class LLMSettingsConfig(BaseModel):
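The stricter schema above is easiest to see in action. A minimal sketch, assuming only the models defined in this file; the endpoint, key, and version strings are illustrative placeholders:

# Illustrative only: exercises the new ApiType enum and optional fields.
from pydantic import ValidationError

from l2mac.config import ApiType, LLMCoreConfig

# An Azure-flavoured config now carries its api_version alongside base_url.
azure_cfg = LLMCoreConfig(
    api_type=ApiType.azure,
    base_url="https://my-resource.openai.azure.com",  # hypothetical endpoint
    api_key="placeholder-key",
    api_version="2024-02-01",  # hypothetical version string
)

# Unknown providers now fail fast instead of slipping through as free-form strings.
try:
    LLMCoreConfig(api_type="anthropic", api_key="placeholder-key")
except ValidationError as err:
    print(err)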
diff --git a/l2mac/llm_providers/general.py b/l2mac/llm_providers/general.py
index df97c847..5eba9c60 100644
--- a/l2mac/llm_providers/general.py
+++ b/l2mac/llm_providers/general.py
@@ -6,8 +6,13 @@
 from time import perf_counter, sleep

 import openai
-from openai import AsyncOpenAI, OpenAI
+from openai import AsyncAzureOpenAI, AsyncOpenAI, AzureOpenAI, OpenAI

+from l2mac.config import ApiType
+from l2mac.llm_providers.openai import (
+    openai_models,
+    openai_rate_limits_per_tier_per_model,
+)
 from l2mac.llm_providers.rate_limiter import ChatRateLimiter
 from l2mac.llm_providers.utils import find_best_match

@@ -22,16 +27,18 @@ def remove_nulls(d):


 def setup_chat_rate_limiter_internal(config: dict):
-    if config.llm.api_type == "azure":
-        model = config.run.model
-        model_details = AZURE_MODEL_DETAILS_MAP[model]
-        request_limit = model_details["properties"]["rate_limit_(Requests per minute)"]
-        token_limit = model_details["properties"]["rate_limit_(Tokens per minute)"]
-    elif config.llm.api_type == "openai":
-        from l2mac.llm_providers.openai import rate_limits_per_tier_per_model
-
+    if config.llm.api_type == ApiType.azure:
+        model = config.llm.model
+        model_details = find_best_match(
+            openai_rate_limits_per_tier_per_model[config.llm_settings.rate_limit_tier], model
+        )
+        request_limit = model_details["RPM"]
+        token_limit = model_details["TPM"]
+    elif config.llm.api_type == ApiType.openai:
         model = config.llm.model
-        model_details = find_best_match(rate_limits_per_tier_per_model[config.llm_settings.rate_limit_tier], model)
+        model_details = find_best_match(
+            openai_rate_limits_per_tier_per_model[config.llm_settings.rate_limit_tier], model
+        )
         request_limit = model_details["RPM"]
         token_limit = model_details["TPM"]
     else:
@@ -42,13 +49,11 @@ def setup_chat_rate_limiter_internal(config: dict):


 def get_model_max_tokens(config: dict):
-    if config.llm.api_type == "azure":
-        model = config.run.model
-        model_details = AZURE_MODEL_DETAILS_MAP[model]
-        max_tokens = model_details["properties"]["max_tokens"]
-    elif config.llm.api_type == "openai":
-        from l2mac.llm_providers.openai import openai_models
-
+    if config.llm.api_type == ApiType.azure:
+        model = config.llm.model
+        model_details = find_best_match(openai_models, model)
+        max_tokens = model_details["context_window"]
+    elif config.llm.api_type == ApiType.openai:
         model = config.llm.model
         model_details = find_best_match(openai_models, model)
         max_tokens = model_details["context_window"]
@@ -60,26 +65,28 @@


 def get_llm_config(config, logger, name, rate_limiter):
-    return deepcopy(
-        {
-            "model": config.llm.model,
-            "temperature": config.llm_settings.temperature,
-            "top_p": config.llm_settings.top_p,
-            "frequency_penalty": config.llm_settings.frequency_penalty,
-            "presence_penalty": config.llm_settings.presence_penalty,
-            "stop": config.llm_settings.stop,
-            "stream": config.llm_settings.api_stream,
-            "api_key": config.llm.api_key,
-            "_open_ai_rate_limit_requests_per_minute": config.llm_settings.rate_limit_requests_per_minute,
-            "_logger": logger,
-            "_name": name,
-            "_rate_limiter": rate_limiter,
-            "_retry_with_exponential_backoff__initial_delay": config.llm_settings.api_retry_with_exponential_backoff__initial_delay,
-            "_retry_with_exponential_backoff__exponential_base": config.llm_settings.api_retry_with_exponential_backoff__exponential_base,
-            "_retry_with_exponential_backoff__jitter": config.llm_settings.api_retry_with_exponential_backoff__jitter,
-            "_retry_with_exponential_backoff__max_retries": config.llm_settings.api_retry_with_exponential_backoff__max_retries,
-        }
-    )
+    llm_config_dict = {
+        "model": config.llm.model,
+        "api_type": config.llm.api_type,
+        "temperature": config.llm_settings.temperature,
+        "top_p": config.llm_settings.top_p,
+        "frequency_penalty": config.llm_settings.frequency_penalty,
+        "presence_penalty": config.llm_settings.presence_penalty,
+        "stop": config.llm_settings.stop,
+        "stream": config.llm_settings.api_stream,
+        "api_key": config.llm.api_key,
+        "_open_ai_rate_limit_requests_per_minute": config.llm_settings.rate_limit_requests_per_minute,
+        "_logger": logger,
+        "_name": name,
+        "_rate_limiter": rate_limiter,
+        "_retry_with_exponential_backoff__initial_delay": config.llm_settings.api_retry_with_exponential_backoff__initial_delay,
+        "_retry_with_exponential_backoff__exponential_base": config.llm_settings.api_retry_with_exponential_backoff__exponential_base,
+        "_retry_with_exponential_backoff__jitter": config.llm_settings.api_retry_with_exponential_backoff__jitter,
+        "_retry_with_exponential_backoff__max_retries": config.llm_settings.api_retry_with_exponential_backoff__max_retries,
+    }
+    if config.llm.api_type == ApiType.azure:
+        llm_config_dict.update({"azure_endpoint": config.llm.base_url, "api_version": config.llm.api_version})
+    return deepcopy(llm_config_dict)


 def chat_completion_rl(**kwargs):
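The conditional update is the only Azure-specific part of get_llm_config: for ApiType.azure the kwargs grow an azure_endpoint and an api_version entry, with base_url reused as the endpoint. A rough sketch of the resulting extra entries; the values are hypothetical placeholders, not real credentials:

# Hypothetical values; mirrors the ApiType.azure branch of get_llm_config.
from l2mac.config import ApiType

llm_config_dict = {"model": "gpt-4-1106-preview", "api_type": ApiType.azure, "api_key": "placeholder-key"}
if llm_config_dict["api_type"] == ApiType.azure:
    # In get_llm_config these come from config.llm.base_url / config.llm.api_version.
    llm_config_dict.update({"azure_endpoint": "https://my-resource.openai.azure.com", "api_version": "2024-02-01"})
assert {"azure_endpoint", "api_version"} <= llm_config_dict.keys()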
api_key=kwargs["api_key"], api_version=kwargs["api_version"], azure_endpoint=kwargs["azure_endpoint"] + ) + keys_to_remove = { + "_open_ai_rate_limit_requests_per_minute", + "_logger", + "_name", + "api_key", + "api_version", + "azure_endpoint", + "_rate_limiter", + "stream", + "api_type", + } + kwargs = {k: v for k, v in kwargs.items() if k not in keys_to_remove} perf_counter() # if logger: # logger.info(f"[{name}][OpenAI API Request] {kwargs}") @@ -239,38 +240,29 @@ def chat_completion_rl_inner(**kwargs): kwargs.get("_logger", None) kwargs.get("_name", None) rate_limiter = kwargs.get("_rate_limiter", None) - model = kwargs.get("model", "gpt-3.5-turbo") - if kwargs.get("config", None) and kwargs.get("config", None).llm.api_type == "azure": - model_details = AZURE_MODEL_DETAILS_MAP[model] - kwargs.pop("model", None) - kwargs["engine"] = model_details["engine"] - kwargs["api_key"] = model_details["api_key"] - kwargs["api_version"] = model_details["api_version"] - kwargs["api_base"] = model_details["api_base"] - kwargs["api_type"] = model_details["api_type"] - kwargs.pop("_open_ai_rate_limit_requests_per_minute", None) - kwargs["_open_ai_rate_limit_requests_per_minute"] = model_details["properties"][ - "rate_limit_(Requests per minute)" - ] - - # requests_per_minute = kwargs.get('_open_ai_rate_limit_requests_per_minute', 3000) - # delay_in_seconds = 60.0 / requests_per_minute - # time.sleep(delay_in_seconds) - - client = OpenAI(api_key=kwargs["api_key"]) - kwargs.pop("_open_ai_rate_limit_requests_per_minute", None) - kwargs.pop("_logger", None) - kwargs.pop("_name", None) - kwargs.pop("api_key", None) - kwargs.pop("_rate_limiter", None) - kwargs.pop("_rate_limiter", None) - kwargs.pop("stream", None) - + api_type = kwargs.get("api_type", ApiType.openai) + if api_type == ApiType.openai: + client = OpenAI(api_key=kwargs["api_key"]) + elif api_type == ApiType.azure: + client = AzureOpenAI( + api_key=kwargs["api_key"], api_version=kwargs["api_version"], azure_endpoint=kwargs["azure_endpoint"] + ) + keys_to_remove = { + "_open_ai_rate_limit_requests_per_minute", + "_logger", + "_name", + "api_key", + "api_version", + "azure_endpoint", + "_rate_limiter", + "stream", + "api_type", + } + kwargs = {k: v for k, v in kwargs.items() if k not in keys_to_remove} perf_counter() # if logger: # logger.info(f"[{name}][OpenAI API Request] {kwargs}") # pretty_print_chat_messages(kwargs['messages']) - if rate_limiter: rate_limiter.consume(**kwargs) response = client.chat.completions.create(**kwargs) diff --git a/l2mac/llm_providers/openai.py b/l2mac/llm_providers/openai.py index 425c44ce..39fa0d7c 100644 --- a/l2mac/llm_providers/openai.py +++ b/l2mac/llm_providers/openai.py @@ -101,7 +101,7 @@ }, } -rate_limits_per_tier_per_model = { # Updated from https://platform.openai.com/docs/guides/rate-limits/usage-tiers?context=tier-free on 16th April 2024 +openai_rate_limits_per_tier_per_model = { # Updated from https://platform.openai.com/docs/guides/rate-limits/usage-tiers?context=tier-free on 16th April 2024 "free": { "gpt-3.5-turbo": {"RPM": 3, "RPD": 200, "TPM": 40000, "Batch Queue Limit": 200000}, "text-embedding-3-small": {"RPM": 3, "RPD": 200, "TPM": 150000, "Batch Queue Limit": None}, @@ -122,10 +122,10 @@ "dall-e-3": {"RPM": "5 img/min", "RPD": None, "TPM": None, "Batch Queue Limit": None}, }, "tier2": { - "gpt-4-turbo": {"RPM": "5,000", "TPM": "450,000", "Batch Queue Limit": "1,350,000"}, - "gpt-4": {"RPM": "5,000", "TPM": "40,000", "Batch Queue Limit": "200,000"}, - "gpt-3.5-turbo": {"RPM": "3,500", "TPM": 
"80,000", "Batch Queue Limit": "400,000"}, - "text-embedding-3-large": {"RPM": "500", "TPM": "1,000,000", "Batch Queue Limit": None}, + "gpt-4-turbo": {"RPM": 5000, "TPM": 450000, "Batch Queue Limit": 1350000}, + "gpt-4": {"RPM": 5000, "TPM": 40000, "Batch Queue Limit": 200000}, + "gpt-3.5-turbo": {"RPM": 3500, "TPM": 80000, "Batch Queue Limit": 400000}, + "text-embedding-3-large": {"RPM": 500, "TPM": 1000000, "Batch Queue Limit": None}, "whisper-1": {"RPM": "50", "TPM": None, "Batch Queue Limit": None}, "tts-1": {"RPM": "50", "TPM": None, "Batch Queue Limit": None}, "tts-1-hd": {"RPM": "5", "TPM": None, "Batch Queue Limit": None}, @@ -155,13 +155,13 @@ "dall-e-3": {"RPM": "15 img/min", "TPM": None, "Batch Queue Limit": None}, }, "tier5": { - "gpt-4-turbo": {"RPM": "10,000", "TPM": "1,500,000", "Batch Queue Limit": "250,000,000"}, - "gpt-4": {"RPM": "10,000", "TPM": "300,000", "Batch Queue Limit": "45,000,000"}, - "gpt-3.5-turbo": {"RPM": "10,000", "TPM": "2,000,000", "Batch Queue Limit": "300,000,000"}, - "text-embedding-3-large": {"RPM": "10,000", "TPM": "10,000,000", "Batch Queue Limit": None}, - "whisper-1": {"RPM": "500", "TPM": None, "Batch Queue Limit": None}, - "tts-1": {"RPM": "500", "TPM": None, "Batch Queue Limit": None}, - "tts-1-hd": {"RPM": "20", "TPM": None, "Batch Queue Limit": None}, + "gpt-4-turbo": {"RPM": 10000, "TPM": 1500000, "Batch Queue Limit": 250000000}, + "gpt-4": {"RPM": 10000, "TPM": 300000, "Batch Queue Limit": 45000000}, + "gpt-3.5-turbo": {"RPM": 10000, "TPM": 2000000, "Batch Queue Limit": 300000000}, + "text-embedding-3-large": {"RPM": 10000, "TPM": 10000000, "Batch Queue Limit": None}, + "whisper-1": {"RPM": 500, "TPM": None, "Batch Queue Limit": None}, + "tts-1": {"RPM": 500, "TPM": None, "Batch Queue Limit": None}, + "tts-1-hd": {"RPM": 20, "TPM": None, "Batch Queue Limit": None}, "dall-e-2": {"RPM": "500 img/min", "TPM": None, "Batch Queue Limit": None}, "dall-e-3": {"RPM": "50 img/min", "TPM": None, "Batch Queue Limit": None}, },