Support for Hermes-2-Theta-Llama-3-8B as default OSS model #424

Merged: 12 commits, Jul 4, 2024
25 changes: 16 additions & 9 deletions .env.example
@@ -5,24 +5,27 @@ COZO_AUTH_TOKEN=myauthkey
COZO_HOST=http://memory-store:9070
COZO_PORT=9070
COZO_ROCKSDB_DIR=cozo.db
DTYPE=bfloat16
DTYPE=float16
EMBEDDING_SERVICE_URL=http://text-embeddings-inference/embed
GATEWAY_PORT=80
OPENAI_API_KEY=""
GPU_MEMORY_UTILIZATION=0.95
GPU_MEMORY_UTILIZATION=0.90

HF_TOKEN=""
HUGGING_FACE_HUB_TOKEN=""
JWT_SHARED_KEY=this_shared_key_is_32_48_or_64_bytes_long
MAX_MODEL_LEN=1024
JWT_SHARED_KEY=

MAX_MODEL_LEN=8192
MAX_NUM_SEQS=1
MNT_DIR=/data
GF_SECURITY_ADMIN_PASSWORD=changethis
MODEL_API_KEY=myauthkey
MODEL_API_KEY_HEADER_NAME=Authorization
MODEL_API_URL=http://model-serving:8000
MODEL_INFERENCE_URL=http://model-serving:8000/v1
MODEL_ID=BAAI/llm-embedder
MODEL_NAME = "julep-ai/samantha-1-turbo"
MODEL_ID=BAAI/bge-m3

# MODEL_NAME="OpenPipe/Hermes-2-Theta-Llama-3-8B-32k"
MODEL_NAME="julep-ai/Hermes-2-Theta-Llama-3-8B"

SKIP_CHECK_DEVELOPER_HEADERS=true
SUMMARIZATION_TOKENS_THRESHOLD=2048
TEMPERATURE_SCALING_FACTOR=0.9
@@ -33,4 +36,8 @@ TEMPORAL_WORKER_URL=temporal:7233
TP_SIZE=1
TRUNCATE_EMBED_TEXT=true
TRAEFIK_LOG_LEVEL=DEBUG
WORKER_URL=temporal:7233
WORKER_URL=temporal:7233

AGENTS_API_DEBUG=false
OPENAI_API_KEY=
ANTHROPIC_API_KEY=
4 changes: 2 additions & 2 deletions agents-api/agents_api/activities/embed_docs.py
@@ -1,6 +1,6 @@
from pydantic import UUID4
from temporalio import activity
from agents_api.env import docs_embedding_model_id
from agents_api.env import embedding_model_id
from agents_api.models.docs.embed_docs import (
embed_docs_snippets_query,
)
@@ -13,7 +13,7 @@
@activity.defn
async def embed_docs(doc_id: UUID4, title: str, content: list[str]) -> None:
indices, snippets = list(zip(*enumerate(content)))
model = EmbeddingModel.from_model_name(docs_embedding_model_id)
model = EmbeddingModel.from_model_name(embedding_model_id)
embeddings = await model.embed(
[
{
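A minimal sketch of the updated flow, assuming the registry interface used above (EmbeddingModel.from_model_name and .embed); the payload keys are illustrative, since the dict body is truncated in the diff:

# Sketch only: resolve the default embedding model and embed doc snippets.
# The payload keys ("instruction", "text") are assumptions; the actual dict
# body is elided above.
import asyncio

from agents_api.env import embedding_model_id
from agents_api.embed_models_registry import EmbeddingModel

async def embed_snippets(snippets: list[str]):
    # Defaults to "BAAI/bge-m3" after this PR's env.py change.
    model = EmbeddingModel.from_model_name(embedding_model_id)
    return await model.embed([{"instruction": "", "text": s} for s in snippets])

# asyncio.run(embed_snippets(["first snippet", "second snippet"]))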
4 changes: 2 additions & 2 deletions agents-api/agents_api/activities/summarization.py
@@ -11,7 +11,7 @@
entries_summarization_query,
)
from agents_api.common.protocol.entries import Entry
from ..model_registry import JULEP_MODELS
from ..model_registry import LOCAL_MODELS
from ..env import model_inference_url, model_api_key, summarization_model_name
from agents_api.rec_sum.entities import get_entities
from agents_api.rec_sum.summarize import summarize_messages
@@ -135,7 +135,7 @@ async def run_prompt(
) -> str:
api_base = None
api_key = None
if model in JULEP_MODELS:
if model in LOCAL_MODELS:
api_base = model_inference_url
api_key = model_api_key
model = f"openai/{model}"
2 changes: 1 addition & 1 deletion agents-api/agents_api/autogen/openapi_model.py
@@ -837,7 +837,7 @@ class ImageUrl(BaseModel):
"""
URL or base64 data url (e.g. `data:image/jpeg;base64,<the base64 encoded image>`)
"""
detail: Detail | None = "auto"
detail: Detail | None = "auto" # pytype: disable=annotation-type-mismatch
"""
image detail to feed into the model can be low | high | auto
"""
10 changes: 5 additions & 5 deletions agents-api/agents_api/embed_models_registry.py
@@ -10,7 +10,7 @@
PromptTooBigError,
UnknownTokenizerError,
)
from agents_api.env import docs_embedding_service_url
from agents_api.env import embedding_service_url


def normalize_l2(x):
@@ -83,7 +83,7 @@ async def embed(
embeddings = await embed(
input,
embedding_service_url=self.embedding_service_url
or docs_embedding_service_url,
or embedding_service_url,
embedding_model_name=self.embedding_model_name,
)
elif self.embedding_provider == "openai":
@@ -130,7 +130,7 @@ def normalize(
tokenizer=tiktoken.encoding_for_model("text-embedding-3-large"),
),
"Alibaba-NLP/gte-large-en-v1.5": EmbeddingModel(
embedding_service_url=docs_embedding_service_url,
embedding_service_url=embedding_service_url,
embedding_provider="julep",
embedding_model_name="Alibaba-NLP/gte-large-en-v1.5",
original_embedding_dimensions=1024,
@@ -139,7 +139,7 @@ def normalize(
tokenizer=Tokenizer.from_pretrained("Alibaba-NLP/gte-large-en-v1.5"),
),
"BAAI/bge-m3": EmbeddingModel(
embedding_service_url=docs_embedding_service_url,
embedding_service_url=embedding_service_url,
embedding_provider="julep",
embedding_model_name="BAAI/bge-m3",
original_embedding_dimensions=1024,
@@ -148,7 +148,7 @@ def normalize(
tokenizer=Tokenizer.from_pretrained("BAAI/bge-m3"),
),
"BAAI/llm-embedder": EmbeddingModel(
embedding_service_url=docs_embedding_service_url,
embedding_service_url=embedding_service_url,
embedding_provider="julep",
embedding_model_name="BAAI/llm-embedder",
original_embedding_dimensions=1024,
15 changes: 3 additions & 12 deletions agents-api/agents_api/env.py
@@ -44,20 +44,12 @@
"SKIP_CHECK_DEVELOPER_HEADERS", default=False
)

# embedding service URL
embedding_service_url: str = env.str(
"EMBEDDING_SERVICE_URL", default="http://0.0.0.0:8082/embed"
"EMBEDDING_SERVICE_URL", default="http://0.0.0.0:8083/embed"
)

docs_embedding_service_url: str = env.str(
"DOCS_EMBEDDING_SERVICE_URL", default="http://0.0.0.0:8083/embed"
)

embedding_model_id: str = env.str(
"EMBEDDING_MODEL_ID", default="BAAI/bge-large-en-v1.5"
)

docs_embedding_model_id: str = env.str("DOCS_EMBEDDING_MODEL_ID", default="BAAI/bge-m3")
embedding_model_id: str = env.str("EMBEDDING_MODEL_ID", default="BAAI/bge-m3")

truncate_embed_text: bool = env.bool("TRUNCATE_EMBED_TEXT", default=False)

@@ -84,8 +76,7 @@
temporal_worker_url=temporal_worker_url,
temporal_namespace=temporal_namespace,
openai_api_key=openai_api_key,
docs_embedding_model_id=docs_embedding_model_id,
docs_embedding_service_url=docs_embedding_service_url,
docs_embedding_service_url=embedding_service_url,
embedding_model_id=embedding_model_id,
)

105 changes: 99 additions & 6 deletions agents-api/agents_api/model_registry.py
@@ -2,14 +2,19 @@
Model Registry maintains a list of supported models and their configs.
"""

from typing import Dict
import ast
import json
from agents_api.clients.worker.types import ChatML
from agents_api.common.exceptions.agents import (
AgentModelNotValid,
MissingAgentModelAPIKeyError,
)
import litellm
from litellm.utils import get_valid_models
from pydantic import BaseModel
from typing import Dict, Literal, Optional
import xml.etree.ElementTree as ET


GPT4_MODELS: Dict[str, int] = {
# stable model names:
@@ -93,16 +98,56 @@

OPENAI_MODELS = {**GPT4_MODELS, **TURBO_MODELS, **GPT3_5_MODELS, **GPT3_MODELS}

JULEP_MODELS = {
LOCAL_MODELS = {
"julep-ai/samantha-1-turbo": 32768,
"julep-ai/samantha-1-turbo-awq": 32768,
"TinyLlama/TinyLlama_v1.1": 2048,
"casperhansen/llama-3-8b-instruct-awq": 8192,
"julep-ai/Hermes-2-Theta-Llama-3-8B": 8192,
"OpenPipe/Hermes-2-Theta-Llama-3-8B-32k": 32768,
}

LOCAL_MODELS_WITH_TOOL_CALLS = {
"OpenPipe/Hermes-2-Theta-Llama-3-8B-32k": 32768,
"julep-ai/Hermes-2-Theta-Llama-3-8B": 8192,
}

CHAT_MODELS = {**GPT4_MODELS, **TURBO_MODELS, **CLAUDE_MODELS}

ALL_AVAILABLE_MODELS = litellm.model_list + list(JULEP_MODELS.keys())
VALID_MODELS = get_valid_models() + list(JULEP_MODELS.keys())
ALL_AVAILABLE_MODELS = litellm.model_list + list(LOCAL_MODELS.keys())
VALID_MODELS = get_valid_models() + list(LOCAL_MODELS.keys())
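As an illustration (not part of the PR), the two new tables give a per-model context window and a capability check for tool calling:

# Illustration only: look up a local model's context window and whether its
# output should go through the XML <tool_call> extraction defined below.
def describe_local_model(model: str) -> dict:
    return {
        "context_window": LOCAL_MODELS.get(model),
        "supports_tool_calls": model in LOCAL_MODELS_WITH_TOOL_CALLS,
    }

# describe_local_model("julep-ai/Hermes-2-Theta-Llama-3-8B")
# -> {"context_window": 8192, "supports_tool_calls": True}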


class FunctionCall(BaseModel):
arguments: dict
"""
The arguments to call the function with, as generated by the model in JSON
format. Note that the model does not always generate valid JSON, and may
hallucinate parameters not defined by your function schema. Validate the
arguments in your code before calling your function.
"""

name: str
"""The name of the function to call."""


class FunctionDefinition(BaseModel):
name: str
description: Optional[str] = None
parameters: Optional[Dict[str, object]] = None


class FunctionSignature(BaseModel):
function: FunctionDefinition
type: Literal["function"]


class PromptSchema(BaseModel):
Role: str
Objective: str
Tools: str
Schema: str
Instructions: str
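
As a hypothetical instance (field values are illustrative, not from the PR), the code_interpreter tool referenced in sys_prompt.yml below could be expressed with these models as:

# Hypothetical example: the code_interpreter tool from sys_prompt.yml,
# described as a FunctionSignature. The parameter schema is an assumption.
code_interpreter = FunctionSignature(
    type="function",
    function=FunctionDefinition(
        name="code_interpreter",
        description="Run python code given as a markdown block.",
        parameters={
            "type": "object",
            "properties": {"code_markdown": {"type": "string"}},
            "required": ["code_markdown"],
        },
    ),
)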


def validate_configuration(model: str):
@@ -127,7 +172,7 @@ def load_context(init_context: list[ChatML], model: str):
}
for msg in init_context
]
elif model in JULEP_MODELS:
elif model in LOCAL_MODELS:
init_context = [
{"name": msg.name, "role": msg.role, "content": msg.content}
for msg in init_context
@@ -137,6 +182,54 @@ def load_context(init_context: list[ChatML], model: str):
return init_context


def validate_and_extract_tool_calls(assistant_content):
validation_result = False
tool_calls = []
error_message = None

try:
# wrap content in root element
xml_root_element = f"<root>{assistant_content}</root>"
root = ET.fromstring(xml_root_element)

# extract JSON data
for element in root.findall(".//tool_call"):
json_data = None
try:
if element.text is None:
continue

json_text = element.text.strip()

try:
# Prioritize json.loads for better error handling
json_data = json.loads(json_text)
except json.JSONDecodeError as json_err:
try:
# Fallback to ast.literal_eval if json.loads fails
json_data = ast.literal_eval(json_text)
except (SyntaxError, ValueError) as eval_err:
error_message = (
f"JSON parsing failed with both json.loads and ast.literal_eval:\n"
f"- JSON Decode Error: {json_err}\n"
f"- Fallback Syntax/Value Error: {eval_err}\n"
f"- Problematic JSON text: {json_text}"
)
continue
except BaseException as e:
error_message = f"Cannot strip text: {e}"

if json_data is not None:
tool_calls.append(json_data)
validation_result = True

except ET.ParseError as err:
error_message = f"XML Parse Error: {err}"

# Return default values if no valid data is extracted
return validation_result, tool_calls, error_message
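
A quick usage sketch, feeding the parser a reply in the format sys_prompt.yml asks the model for:

# Usage sketch: a model reply containing one well-formed <tool_call> block.
reply = (
    "Let me run that.\n"
    "<tool_call>\n"
    '{"arguments": {"code_markdown": "print(1 + 1)"}, "name": "code_interpreter"}\n'
    "</tool_call>"
)
ok, calls, err = validate_and_extract_tool_calls(reply)
# ok -> True
# calls -> [{"arguments": {"code_markdown": "print(1 + 1)"}, "name": "code_interpreter"}]
# err -> None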


def get_extra_settings(settings):
extra_settings = (
dict(
@@ -147,7 +240,7 @@ def get_extra_settings(settings):
logit_bias=settings.logit_bias,
preset=settings.preset.name if settings.preset else None,
)
if settings.model in JULEP_MODELS
if settings.model in LOCAL_MODELS
else {}
)

35 changes: 35 additions & 0 deletions agents-api/agents_api/prompt_assets/sys_prompt.yml
@@ -0,0 +1,35 @@
Role: |
  You are a function calling AI agent with self-recursion.
  You can call only one function at a time and analyse the data you get from the function response.
  You are provided with function signatures within <tools></tools> XML tags.
  The current date is: {date}.
Objective: |
  You may use agentic frameworks for reasoning and planning to help with the user query.
  Please call a function and wait for function results to be provided to you in the next iteration.
  Don't make assumptions about what values to plug into function arguments.
  Once you have called a function, results will be fed back to you within <tool_response></tool_response> XML tags.
  Don't make assumptions about tool results if <tool_response> XML tags are not present, since the function hasn't been executed yet.
  Analyze the data once you get the results and call another function.
  At each iteration please continue adding your analysis to the previous summary.
  Your final response should directly answer the user query with an analysis or summary of the results of the function calls.
Tools: |
  Here are the available tools:
  <tools> {{agent.tools}} </tools>
  If the provided function signatures don't include the function you need to call, you may write executable python code in markdown syntax and call the code_interpreter() function as follows:
  <tool_call>
  {{"arguments": {{"code_markdown": <python-code>, "name": "code_interpreter"}}}}
  </tool_call>
  Make sure that the JSON object above with the code markdown block is parseable with json.loads() and the XML block with xml.etree.ElementTree.
Schema: |
  Use the following pydantic model json schema for each tool call you will make:
  {schema}
Instructions: |
  At the very first turn you don't have <tool_results>, so you shouldn't make up the results.
  Please keep a running summary with analysis of previous function results and summaries from previous iterations.
  Do not stop calling functions until the task has been accomplished or you've reached the max iteration count of 10.
  Calling multiple functions at once can overload the system and increase cost, so please call one function at a time.
  If you plan to continue with analysis, always call another function.
  For each function call return a valid json object (using double quotes) with function name and arguments within <tool_call></tool_call> XML tags as follows:
  <tool_call>
  {{"arguments": <args-dict>, "name": <function-name>}}
  </tool_call>
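
A sketch of how this template could be rendered, assuming PyYAML and the PromptSchema model from model_registry.py; that the doubled braces around {{agent.tools}} are meant to survive str.format for a later templating pass is an assumption:

# Sketch only (not part of the PR): load sys_prompt.yml into PromptSchema and
# fill the single-brace placeholders. {{agent.tools}} is brace-escaped, so
# str.format leaves a literal {agent.tools} behind for a later pass (assumed).
import json
from datetime import date

import yaml  # PyYAML, assumed available

from agents_api.model_registry import FunctionCall, PromptSchema

with open("agents_api/prompt_assets/sys_prompt.yml") as f:
    prompt = PromptSchema(**yaml.safe_load(f))

role = prompt.Role.format(date=date.today().isoformat())
# Inject a tool-call schema; pydantic v2's model_json_schema() is assumed here.
schema = prompt.Schema.format(schema=json.dumps(FunctionCall.model_json_schema()))
tools = prompt.Tools.format()  # collapses doubled braces to literal ones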
4 changes: 2 additions & 2 deletions agents-api/agents_api/rec_sum/generate.py
@@ -1,6 +1,6 @@
from tenacity import retry, stop_after_attempt, wait_fixed
from agents_api.env import model_inference_url, model_api_key
from agents_api.model_registry import JULEP_MODELS
from agents_api.model_registry import LOCAL_MODELS
from litellm import acompletion


@@ -11,7 +11,7 @@ async def generate(
**kwargs,
) -> dict:
base_url, api_key = None, None
if model in JULEP_MODELS:
if model in LOCAL_MODELS:
base_url, api_key = model_inference_url, model_api_key
model = f"openai/{model}"

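For a model in LOCAL_MODELS, the effect is that litellm's acompletion talks to the vLLM server's OpenAI-compatible endpoint (MODEL_INFERENCE_URL) under the "openai/" prefix. A usage sketch; generate()'s leading parameters are elided in the diff, so this call shape is an assumption:

# Usage sketch: route a chat through the local Hermes model. generate() sets
# base_url to MODEL_INFERENCE_URL and prefixes the model name with "openai/",
# litellm's convention for OpenAI-compatible servers.
import asyncio

messages = [{"role": "user", "content": "Summarize our last session."}]
result = asyncio.run(generate(messages, model="julep-ai/Hermes-2-Theta-Llama-3-8B"))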