From c9171a9d9cc8bab31093e3e3128a8958db5fd549 Mon Sep 17 00:00:00 2001
From: Siddharth Balyan <52913345+alt-glitch@users.noreply.github.com>
Date: Thu, 4 Jul 2024 09:46:44 +0530
Subject: [PATCH] support for `Hermes-2-Theta-Llama-3-8B` as default OSS model
 (#424)

* wip: local llm support

* wip: local llm support

* wip: function calling through render_templates

* wip: working fn calling

* fix: message content format

* model from julep hf

* updated .env.example

* switch from llm-embedder to bge/m3 everywhere

* minor additions

* tei update, .env update, type fix, ellipsis fix

* fix(agents-api): Minor type check fix

Signed-off-by: Diwank Tomer

---------

Signed-off-by: Diwank Tomer
Co-authored-by: Diwank Tomer
Co-authored-by: Diwank Singh Tomer
---
 .env.example                                 |  25 +-
 .../agents_api/activities/embed_docs.py      |   4 +-
 .../agents_api/activities/summarization.py   |   4 +-
 .../agents_api/embed_models_registry.py      |  10 +-
 agents-api/agents_api/env.py                 |  15 +-
 agents-api/agents_api/model_registry.py      | 105 ++++++-
 .../agents_api/prompt_assets/sys_prompt.yml  |  35 +++
 agents-api/agents_api/rec_sum/generate.py    |   4 +-
 .../agents_api/routers/sessions/session.py   |  42 +--
 agents-api/docker-compose.yml                |  23 +-
 model-serving/Dockerfile                     |  31 +-
 model-serving/artifacts/nous-llama-fix.ipynb | 278 ++++++++++++++++++
 model-serving/docker-compose.yml             |   4 +-
 13 files changed, 473 insertions(+), 107 deletions(-)
 create mode 100644 agents-api/agents_api/prompt_assets/sys_prompt.yml
 create mode 100644 model-serving/artifacts/nous-llama-fix.ipynb

diff --git a/.env.example b/.env.example
index fd0bece7d..ce2d956a4 100644
--- a/.env.example
+++ b/.env.example
@@ -5,24 +5,27 @@ COZO_AUTH_TOKEN=myauthkey
 COZO_HOST=http://memory-store:9070
 COZO_PORT=9070
 COZO_ROCKSDB_DIR=cozo.db
-DTYPE=bfloat16
+DTYPE=float16
 EMBEDDING_SERVICE_URL=http://text-embeddings-inference/embed
 GATEWAY_PORT=80
-OPENAI_API_KEY=""
-GPU_MEMORY_UTILIZATION=0.95
+GPU_MEMORY_UTILIZATION=0.90
+
 HF_TOKEN=""
 HUGGING_FACE_HUB_TOKEN=""
-JWT_SHARED_KEY=this_shared_key_is_32_48_or_64_bytes_long
-MAX_MODEL_LEN=1024
+JWT_SHARED_KEY=
+
+MAX_MODEL_LEN=8192
 MAX_NUM_SEQS=1
 MNT_DIR=/data
-GF_SECURITY_ADMIN_PASSWORD=changethis
 MODEL_API_KEY=myauthkey
 MODEL_API_KEY_HEADER_NAME=Authorization
 MODEL_API_URL=http://model-serving:8000
 MODEL_INFERENCE_URL=http://model-serving:8000/v1
-MODEL_ID=BAAI/llm-embedder
-MODEL_NAME = "julep-ai/samantha-1-turbo"
+MODEL_ID=BAAI/bge-m3
+
+# MODEL_NAME="OpenPipe/Hermes-2-Theta-Llama-3-8B-32k"
+MODEL_NAME="julep-ai/Hermes-2-Theta-Llama-3-8B"
+
 SKIP_CHECK_DEVELOPER_HEADERS=true
 SUMMARIZATION_TOKENS_THRESHOLD=2048
 TEMPERATURE_SCALING_FACTOR=0.9
@@ -33,4 +36,8 @@ TEMPORAL_WORKER_URL=temporal:7233
 TP_SIZE=1
 TRUNCATE_EMBED_TEXT=true
 TRAEFIK_LOG_LEVEL=DEBUG
-WORKER_URL=temporal:7233
\ No newline at end of file
+WORKER_URL=temporal:7233
+
+AGENTS_API_DEBUG=false
+OPENAI_API_KEY=
+ANTHROPIC_API_KEY=
\ No newline at end of file
diff --git a/agents-api/agents_api/activities/embed_docs.py b/agents-api/agents_api/activities/embed_docs.py
index 228cee258..633e4d4de 100644
--- a/agents-api/agents_api/activities/embed_docs.py
+++ b/agents-api/agents_api/activities/embed_docs.py
@@ -1,6 +1,6 @@
 from pydantic import UUID4
 from temporalio import activity
-from agents_api.env import docs_embedding_model_id
+from agents_api.env import embedding_model_id
 from agents_api.models.docs.embed_docs import (
     embed_docs_snippets_query,
 )
@@ -13,7 +13,7 @@
 @activity.defn
 async def embed_docs(doc_id: UUID4, title: str, content: list[str]) -> None:
     indices, snippets = 
list(zip(*enumerate(content))) - model = EmbeddingModel.from_model_name(docs_embedding_model_id) + model = EmbeddingModel.from_model_name(embedding_model_id) embeddings = await model.embed( [ { diff --git a/agents-api/agents_api/activities/summarization.py b/agents-api/agents_api/activities/summarization.py index e44fc455d..03a5f0d79 100644 --- a/agents-api/agents_api/activities/summarization.py +++ b/agents-api/agents_api/activities/summarization.py @@ -11,7 +11,7 @@ entries_summarization_query, ) from agents_api.common.protocol.entries import Entry -from ..model_registry import JULEP_MODELS +from ..model_registry import LOCAL_MODELS from ..env import model_inference_url, model_api_key, summarization_model_name from agents_api.rec_sum.entities import get_entities from agents_api.rec_sum.summarize import summarize_messages @@ -135,7 +135,7 @@ async def run_prompt( ) -> str: api_base = None api_key = None - if model in JULEP_MODELS: + if model in LOCAL_MODELS: api_base = model_inference_url api_key = model_api_key model = f"openai/{model}" diff --git a/agents-api/agents_api/embed_models_registry.py b/agents-api/agents_api/embed_models_registry.py index babf1be22..aa5f5ee5a 100644 --- a/agents-api/agents_api/embed_models_registry.py +++ b/agents-api/agents_api/embed_models_registry.py @@ -10,7 +10,7 @@ PromptTooBigError, UnknownTokenizerError, ) -from agents_api.env import docs_embedding_service_url +from agents_api.env import embedding_service_url def normalize_l2(x): @@ -83,7 +83,7 @@ async def embed( embeddings = await embed( input, embedding_service_url=self.embedding_service_url - or docs_embedding_service_url, + or embedding_service_url, embedding_model_name=self.embedding_model_name, ) elif self.embedding_provider == "openai": @@ -130,7 +130,7 @@ def normalize( tokenizer=tiktoken.encoding_for_model("text-embedding-3-large"), ), "Alibaba-NLP/gte-large-en-v1.5": EmbeddingModel( - embedding_service_url=docs_embedding_service_url, + embedding_service_url=embedding_service_url, embedding_provider="julep", embedding_model_name="Alibaba-NLP/gte-large-en-v1.5", original_embedding_dimensions=1024, @@ -139,7 +139,7 @@ def normalize( tokenizer=Tokenizer.from_pretrained("Alibaba-NLP/gte-large-en-v1.5"), ), "BAAI/bge-m3": EmbeddingModel( - embedding_service_url=docs_embedding_service_url, + embedding_service_url=embedding_service_url, embedding_provider="julep", embedding_model_name="BAAI/bge-m3", original_embedding_dimensions=1024, @@ -148,7 +148,7 @@ def normalize( tokenizer=Tokenizer.from_pretrained("BAAI/bge-m3"), ), "BAAI/llm-embedder": EmbeddingModel( - embedding_service_url=docs_embedding_service_url, + embedding_service_url=embedding_service_url, embedding_provider="julep", embedding_model_name="BAAI/llm-embedder", original_embedding_dimensions=1024, diff --git a/agents-api/agents_api/env.py b/agents-api/agents_api/env.py index 2825aba27..15b83d408 100644 --- a/agents-api/agents_api/env.py +++ b/agents-api/agents_api/env.py @@ -44,20 +44,12 @@ "SKIP_CHECK_DEVELOPER_HEADERS", default=False ) -# embedding service URL embedding_service_url: str = env.str( - "EMBEDDING_SERVICE_URL", default="http://0.0.0.0:8082/embed" + "EMBEDDING_SERVICE_URL", default="http://0.0.0.0:8083/embed" ) -docs_embedding_service_url: str = env.str( - "DOCS_EMBEDDING_SERVICE_URL", default="http://0.0.0.0:8083/embed" -) - -embedding_model_id: str = env.str( - "EMBEDDING_MODEL_ID", default="BAAI/bge-large-en-v1.5" -) -docs_embedding_model_id: str = env.str("DOCS_EMBEDDING_MODEL_ID", default="BAAI/bge-m3") 
+embedding_model_id: str = env.str("EMBEDDING_MODEL_ID", default="BAAI/bge-m3") truncate_embed_text: bool = env.bool("TRUNCATE_EMBED_TEXT", default=False) @@ -84,8 +76,7 @@ temporal_worker_url=temporal_worker_url, temporal_namespace=temporal_namespace, openai_api_key=openai_api_key, - docs_embedding_model_id=docs_embedding_model_id, - docs_embedding_service_url=docs_embedding_service_url, + docs_embedding_service_url=embedding_service_url, embedding_model_id=embedding_model_id, ) diff --git a/agents-api/agents_api/model_registry.py b/agents-api/agents_api/model_registry.py index 350b5559b..aa3cb6854 100644 --- a/agents-api/agents_api/model_registry.py +++ b/agents-api/agents_api/model_registry.py @@ -2,7 +2,8 @@ Model Registry maintains a list of supported models and their configs. """ -from typing import Dict +import ast +import json from agents_api.clients.worker.types import ChatML from agents_api.common.exceptions.agents import ( AgentModelNotValid, @@ -10,6 +11,10 @@ ) import litellm from litellm.utils import get_valid_models +from pydantic import BaseModel +from typing import Dict, Literal, Optional +import xml.etree.ElementTree as ET + GPT4_MODELS: Dict[str, int] = { # stable model names: @@ -93,16 +98,56 @@ OPENAI_MODELS = {**GPT4_MODELS, **TURBO_MODELS, **GPT3_5_MODELS, **GPT3_MODELS} -JULEP_MODELS = { +LOCAL_MODELS = { "julep-ai/samantha-1-turbo": 32768, "julep-ai/samantha-1-turbo-awq": 32768, + "TinyLlama/TinyLlama_v1.1": 2048, + "casperhansen/llama-3-8b-instruct-awq": 8192, + "julep-ai/Hermes-2-Theta-Llama-3-8B": 8192, + "OpenPipe/Hermes-2-Theta-Llama-3-8B-32k": 32768, +} + +LOCAL_MODELS_WITH_TOOL_CALLS = { + "OpenPipe/Hermes-2-Theta-Llama-3-8B-32k": 32768, + "julep-ai/Hermes-2-Theta-Llama-3-8B": 8192, } CHAT_MODELS = {**GPT4_MODELS, **TURBO_MODELS, **CLAUDE_MODELS} +ALL_AVAILABLE_MODELS = litellm.model_list + list(LOCAL_MODELS.keys()) +VALID_MODELS = get_valid_models() + list(LOCAL_MODELS.keys()) + + +class FunctionCall(BaseModel): + arguments: dict + """ + The arguments to call the function with, as generated by the model in JSON + format. Note that the model does not always generate valid JSON, and may + hallucinate parameters not defined by your function schema. Validate the + arguments in your code before calling your function. 
+    """
+
+    name: str
+    """The name of the function to call."""
 
-ALL_AVAILABLE_MODELS = litellm.model_list + list(JULEP_MODELS.keys())
-VALID_MODELS = get_valid_models() + list(JULEP_MODELS.keys())
+
+class FunctionDefinition(BaseModel):
+    name: str
+    description: Optional[str] = None
+    parameters: Optional[Dict[str, object]] = None
+
+
+class FunctionSignature(BaseModel):
+    function: FunctionDefinition
+    type: Literal["function"]
+
+
+class PromptSchema(BaseModel):
+    Role: str
+    Objective: str
+    Tools: str
+    Schema: str
+    Instructions: str
 
 
 def validate_configuration(model: str):
@@ -127,7 +172,7 @@ def load_context(init_context: list[ChatML], model: str):
             }
             for msg in init_context
         ]
-    elif model in JULEP_MODELS:
+    elif model in LOCAL_MODELS:
         init_context = [
             {"name": msg.name, "role": msg.role, "content": msg.content}
             for msg in init_context
@@ -137,6 +182,54 @@ def load_context(init_context: list[ChatML], model: str):
     return init_context
 
 
+def validate_and_extract_tool_calls(assistant_content):
+    validation_result = False
+    tool_calls = []
+    error_message = None
+
+    try:
+        # wrap content in root element
+        xml_root_element = f"<root>{assistant_content}</root>"
+        root = ET.fromstring(xml_root_element)
+
+        # extract JSON data
+        for element in root.findall(".//tool_call"):
+            json_data = None
+            try:
+                if element.text is None:
+                    continue
+
+                json_text = element.text.strip()
+
+                try:
+                    # Prioritize json.loads for better error handling
+                    json_data = json.loads(json_text)
+                except json.JSONDecodeError as json_err:
+                    try:
+                        # Fallback to ast.literal_eval if json.loads fails
+                        json_data = ast.literal_eval(json_text)
+                    except (SyntaxError, ValueError) as eval_err:
+                        error_message = (
+                            f"JSON parsing failed with both json.loads and ast.literal_eval:\n"
+                            f"- JSON Decode Error: {json_err}\n"
+                            f"- Fallback Syntax/Value Error: {eval_err}\n"
+                            f"- Problematic JSON text: {json_text}"
+                        )
+                        continue
+            except BaseException as e:
+                error_message = f"Cannot strip text: {e}"
+
+            if json_data is not None:
+                tool_calls.append(json_data)
+                validation_result = True
+
+    except ET.ParseError as err:
+        error_message = f"XML Parse Error: {err}"
+
+    # Return default values if no valid data is extracted
+    return validation_result, tool_calls, error_message
+
+
 def get_extra_settings(settings):
     extra_settings = (
         dict(
@@ -147,7 +240,7 @@ def get_extra_settings(settings):
             logit_bias=settings.logit_bias,
             preset=settings.preset.name if settings.preset else None,
         )
-        if settings.model in JULEP_MODELS
+        if settings.model in LOCAL_MODELS
         else {}
     )
 
diff --git a/agents-api/agents_api/prompt_assets/sys_prompt.yml b/agents-api/agents_api/prompt_assets/sys_prompt.yml
new file mode 100644
index 000000000..0aad05160
--- /dev/null
+++ b/agents-api/agents_api/prompt_assets/sys_prompt.yml
@@ -0,0 +1,35 @@
+Role: |
+  You are a function calling AI agent with self-recursion.
+  You can call only one function at a time and analyse the data you get from the function response.
+  You are provided with function signatures within XML tags.
+  The current date is: {date}.
+Objective: |
+  You may use agentic frameworks for reasoning and planning to help with the user query.
+  Please call a function and wait for function results to be provided to you in the next iteration.
+  Don't make assumptions about what values to plug into function arguments.
+  Once you have called a function, results will be fed back to you within XML tags.
+  Don't make assumptions about tool results if XML tags are not present since the function hasn't been executed yet.
+  Analyze the data once you get the results and call another function.
+  At each iteration please continue adding your analysis to the previous summary.
+  Your final response should directly answer the user query with an analysis or summary of the results of function calls.
+Tools: |
+  Here are the available tools:
+  {{agent.tools}}
+  If the provided function signatures don't have the function you must call, you may write executable Python code in markdown syntax and call the code_interpreter() function as follows:
+  <tool_call>
+  {{"arguments": {{"code_markdown": , "name": "code_interpreter"}}}}
+  </tool_call>
+  Make sure that the JSON object above, with the code markdown block, is parseable with json.loads() and the XML block with XML ElementTree.
+Schema: |
+  Use the following pydantic model json schema for each tool call you will make:
+  {schema}
+Instructions: |
+  At the very first turn you don't have any function results yet, so you shouldn't make up the results.
+  Please keep a running summary with analysis of previous function results and summaries from previous iterations.
+  Do not stop calling functions until the task has been accomplished or you've reached the maximum of 10 iterations.
+  Calling multiple functions at once can overload the system and increase cost, so please call one function at a time.
+  If you plan to continue with analysis, always call another function.
+  For each function call return a valid json object (using double quotes) with function name and arguments within XML tags as follows:
+  <tool_call>
+  {{"arguments": , "name": }}
+  </tool_call>
diff --git a/agents-api/agents_api/rec_sum/generate.py b/agents-api/agents_api/rec_sum/generate.py
index e05111faa..e1a2eb607 100644
--- a/agents-api/agents_api/rec_sum/generate.py
+++ b/agents-api/agents_api/rec_sum/generate.py
@@ -1,6 +1,6 @@
 from tenacity import retry, stop_after_attempt, wait_fixed
 from agents_api.env import model_inference_url, model_api_key
-from agents_api.model_registry import JULEP_MODELS
+from agents_api.model_registry import LOCAL_MODELS
 from litellm import acompletion
 
 
@@ -11,7 +11,7 @@ async def generate(
     **kwargs,
 ) -> dict:
     base_url, api_key = None, None
-    if model in JULEP_MODELS:
+    if model in LOCAL_MODELS:
         base_url, api_key = model_inference_url, model_api_key
         model = f"openai/{model}"
 
diff --git a/agents-api/agents_api/routers/sessions/session.py b/agents-api/agents_api/routers/sessions/session.py
index da9121366..fc6309083 100644
--- a/agents-api/agents_api/routers/sessions/session.py
+++ b/agents-api/agents_api/routers/sessions/session.py
@@ -25,13 +25,14 @@
 from ...common.utils.json import CustomJSONEncoder
 from ...common.utils.messages import stringify_content
 from ...env import (
-    docs_embedding_service_url,
-    docs_embedding_model_id,
+    embedding_service_url,
+    embedding_model_id,
 )
 from ...model_registry import (
-    JULEP_MODELS,
-    get_extra_settings,
+    LOCAL_MODELS,
+    LOCAL_MODELS_WITH_TOOL_CALLS,
     load_context,
+    validate_and_extract_tool_calls,
 )
 from ...models.entry.add_entries import add_entries_query
 from ...models.entry.proc_mem_context import proc_mem_context_query
@@ -261,8 +262,8 @@ async def forward(
                 ]
             ],
             join_inputs=False,
-            embedding_service_url=docs_embedding_service_url,
-            embedding_model_name=docs_embedding_model_id,
+            embedding_service_url=embedding_service_url,
+            embedding_model_name=embedding_model_id,
         )
 
         entries: list[Entry] = []
@@ -357,7 +358,10 @@ async def forward(
                 and message.content[0].type == "text"
             ):
                 message.content = message.content[0].text
-
+        # Add tools to settings
+        if tools:
+            settings.tools = settings.tools or []
+            
settings.tools.extend(tools) # If render_templates=True, render the templates if session_data is not None and session_data.render_templates: @@ -378,6 +382,7 @@ async def forward( "name": session_data.agent_name, "about": session_data.agent_about, "metadata": session_data.agent_metadata, + "tools": settings.tools, }, } @@ -392,11 +397,6 @@ async def forward( if session_data is not None: settings.model = session_data.model - # Add tools to settings - if tools: - settings.tools = settings.tools or [] - settings.tools.extend(tools) - return messages, settings, doc_ids @cache @@ -408,7 +408,7 @@ async def generate( api_base = None api_key = None model = settings.model - if model in JULEP_MODELS: + if model in LOCAL_MODELS: api_base = model_inference_url api_key = model_api_key model = f"openai/{model}" @@ -416,11 +416,8 @@ async def generate( if settings.tools: tools = [(tool.model_dump(exclude="id")) for tool in settings.tools] - extra_body = get_extra_settings(settings) - litellm.drop_params = True litellm.add_function_to_prompt = True - res = await acompletion( model=model, messages=init_context, @@ -435,9 +432,18 @@ async def generate( response_format=settings.response_format, api_base=api_base, api_key=api_key, - **extra_body, ) - + if model in LOCAL_MODELS_WITH_TOOL_CALLS: + validation, tool_call, error_msg = validate_and_extract_tool_calls( + res.choices[0].message.content + ) + if validation: + res.choices[0].message.role = ( + "function_call" if tool_call else "assistant" + ) + res.choices[0].finish_reason = "tool_calls" + res.choices[0].message.tool_calls = tool_call + res.choices[0].message.content = json.dumps(tool_call) return res async def backward( diff --git a/agents-api/docker-compose.yml b/agents-api/docker-compose.yml index ae21334d9..150473c3c 100644 --- a/agents-api/docker-compose.yml +++ b/agents-api/docker-compose.yml @@ -59,9 +59,9 @@ services: container_name: text-embeddings-inference environment: - DTYPE=float16 - - MODEL_ID=BAAI/llm-embedder + - MODEL_ID=BAAI/bge-m3 - image: ghcr.io/huggingface/text-embeddings-inference:1.0 + image: ghcr.io/huggingface/text-embeddings-inference:1.3 ports: - "8082:80" volumes: @@ -75,25 +75,6 @@ services: count: all capabilities: [gpu] - docs-text-embeddings-inference: - container_name: docs-text-embeddings-inference - environment: - - DTYPE=float16 - - MODEL_ID=BAAI/bge-m3 - - image: ghcr.io/huggingface/text-embeddings-inference:1.0 - ports: - - "8083:80" - volumes: - - ~/.cache/huggingface/hub:/data - shm_size: "2gb" - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: all - capabilities: [gpu] temporal: image: julepai/temporal:dev diff --git a/model-serving/Dockerfile b/model-serving/Dockerfile index 0b9f16a4d..b25c81eba 100644 --- a/model-serving/Dockerfile +++ b/model-serving/Dockerfile @@ -1,35 +1,12 @@ -# Use vllm/vllm-openai:0.3.2 as a base image -FROM vllm/vllm-openai:v0.3.3 as builder +FROM vllm/vllm-openai:v0.5.0 as base -# Set environment variables -ENV PYTHONUNBUFFERED True -ENV POETRY_CACHE_DIR=/tmp/poetry_cache - -# Set the working directory -WORKDIR /app - -# Install Poetry -# Disable Poetry's virtual environment as Docker provides isolation -RUN pip install --no-cache-dir --upgrade poetry \ - && poetry config virtualenvs.create false - -# Copy only the Poetry configuration files -COPY pyproject.toml poetry.lock ./ - -# Install dependencies only, excluding the application itself -RUN poetry install --no-dev --no-root - -# Copy the rest of your application code -COPY . . 
- -# Now, install the application with Poetry, which will not re-install the dependencies -RUN poetry install --no-dev # Define the entrypoint -ENV MODEL_NAME julep-ai/samantha-1-turbo +ENV MODEL_NAME julep-ai/Hermes-2-Theta-Llama-3-8B ENV TP_SIZE 1 ENV MAX_MODEL_LEN 8192 ENV MAX_NUM_SEQS 1 ENV GPU_MEMORY_UTILIZATION 0.95 ENV DTYPE bfloat16 -ENTRYPOINT python3 -m model_api --model $MODEL_NAME --tensor-parallel-size $TP_SIZE --enforce-eager --gpu-memory-utilization $GPU_MEMORY_UTILIZATION --max-model-len $MAX_MODEL_LEN --max-num-seqs $MAX_NUM_SEQS --dtype $DTYPE +ENV MODEL_API_KEY myauthkey +ENTRYPOINT python3 -m vllm.entrypoints.openai.api_server --model $MODEL_NAME --tensor-parallel-size $TP_SIZE --enforce-eager --gpu-memory-utilization $GPU_MEMORY_UTILIZATION --max-model-len $MAX_MODEL_LEN --max-num-seqs $MAX_NUM_SEQS --dtype $DTYPE --trust-remote-code --api_key=$MODEL_API_KEY diff --git a/model-serving/artifacts/nous-llama-fix.ipynb b/model-serving/artifacts/nous-llama-fix.ipynb new file mode 100644 index 000000000..0cfad097e --- /dev/null +++ b/model-serving/artifacts/nous-llama-fix.ipynb @@ -0,0 +1,278 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/sidbin/miniconda3/envs/julep/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b9c2b4fa7a02414581da3b7ec438472a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "tokenizer_config.json: 0%| | 0.00/50.7k [00:00', '')" + ] + }, + "execution_count": 89, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "old_token, new_token" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8fbb2345fcca46c5b4f8cdbf7fac4901", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Fetching 12 files: 0%| | 0/12 [00:00