diff --git a/.env.example b/.env.example
index fd0bece7d..ce2d956a4 100644
--- a/.env.example
+++ b/.env.example
@@ -5,24 +5,27 @@ COZO_AUTH_TOKEN=myauthkey
COZO_HOST=http://memory-store:9070
COZO_PORT=9070
COZO_ROCKSDB_DIR=cozo.db
-DTYPE=bfloat16
+DTYPE=float16
EMBEDDING_SERVICE_URL=http://text-embeddings-inference/embed
GATEWAY_PORT=80
-OPENAI_API_KEY=""
-GPU_MEMORY_UTILIZATION=0.95
+GPU_MEMORY_UTILIZATION=0.90
+
HF_TOKEN=""
HUGGING_FACE_HUB_TOKEN=""
-JWT_SHARED_KEY=this_shared_key_is_32_48_or_64_bytes_long
-MAX_MODEL_LEN=1024
+JWT_SHARED_KEY=
+
+MAX_MODEL_LEN=8192
MAX_NUM_SEQS=1
MNT_DIR=/data
-GF_SECURITY_ADMIN_PASSWORD=changethis
MODEL_API_KEY=myauthkey
MODEL_API_KEY_HEADER_NAME=Authorization
MODEL_API_URL=http://model-serving:8000
MODEL_INFERENCE_URL=http://model-serving:8000/v1
-MODEL_ID=BAAI/llm-embedder
-MODEL_NAME = "julep-ai/samantha-1-turbo"
+MODEL_ID=BAAI/bge-m3
+
+# MODEL_NAME="OpenPipe/Hermes-2-Theta-Llama-3-8B-32k"
+MODEL_NAME="julep-ai/Hermes-2-Theta-Llama-3-8B"
+
SKIP_CHECK_DEVELOPER_HEADERS=true
SUMMARIZATION_TOKENS_THRESHOLD=2048
TEMPERATURE_SCALING_FACTOR=0.9
@@ -33,4 +36,8 @@ TEMPORAL_WORKER_URL=temporal:7233
TP_SIZE=1
TRUNCATE_EMBED_TEXT=true
TRAEFIK_LOG_LEVEL=DEBUG
-WORKER_URL=temporal:7233
\ No newline at end of file
+WORKER_URL=temporal:7233
+
+AGENTS_API_DEBUG=false
+OPENAI_API_KEY=
+ANTHROPIC_API_KEY=
\ No newline at end of file
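Since the secrets above now ship empty, a quick pre-flight check catches a half-filled `.env` before anything boots. A minimal sketch, assuming python-dotenv (any .env loader works the same way):

```python
# Hedged pre-flight check; the key list mirrors .env.example above.
from dotenv import dotenv_values

cfg = dotenv_values(".env")
for key in ("MODEL_NAME", "MODEL_API_KEY", "EMBEDDING_SERVICE_URL", "JWT_SHARED_KEY"):
    assert cfg.get(key), f"{key} is unset; copy .env.example to .env and fill it in"
```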
diff --git a/agents-api/agents_api/activities/embed_docs.py b/agents-api/agents_api/activities/embed_docs.py
index 228cee258..633e4d4de 100644
--- a/agents-api/agents_api/activities/embed_docs.py
+++ b/agents-api/agents_api/activities/embed_docs.py
@@ -1,6 +1,6 @@
from pydantic import UUID4
from temporalio import activity
-from agents_api.env import docs_embedding_model_id
+from agents_api.env import embedding_model_id
from agents_api.models.docs.embed_docs import (
embed_docs_snippets_query,
)
@@ -13,7 +13,7 @@
@activity.defn
async def embed_docs(doc_id: UUID4, title: str, content: list[str]) -> None:
indices, snippets = list(zip(*enumerate(content)))
- model = EmbeddingModel.from_model_name(docs_embedding_model_id)
+ model = EmbeddingModel.from_model_name(embedding_model_id)
embeddings = await model.embed(
[
{
diff --git a/agents-api/agents_api/activities/summarization.py b/agents-api/agents_api/activities/summarization.py
index e44fc455d..03a5f0d79 100644
--- a/agents-api/agents_api/activities/summarization.py
+++ b/agents-api/agents_api/activities/summarization.py
@@ -11,7 +11,7 @@
entries_summarization_query,
)
from agents_api.common.protocol.entries import Entry
-from ..model_registry import JULEP_MODELS
+from ..model_registry import LOCAL_MODELS
from ..env import model_inference_url, model_api_key, summarization_model_name
from agents_api.rec_sum.entities import get_entities
from agents_api.rec_sum.summarize import summarize_messages
@@ -135,7 +135,7 @@ async def run_prompt(
) -> str:
api_base = None
api_key = None
- if model in JULEP_MODELS:
+ if model in LOCAL_MODELS:
api_base = model_inference_url
api_key = model_api_key
model = f"openai/{model}"
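The JULEP_MODELS → LOCAL_MODELS rename keeps the same routing convention: self-hosted models sit behind an OpenAI-compatible endpoint, so the model name gets an `openai/` prefix and litellm is pointed at the local base URL. A sketch of the pattern, using names from this diff:

```python
from litellm import acompletion

from agents_api.env import model_api_key, model_inference_url
from agents_api.model_registry import LOCAL_MODELS


async def complete(model: str, messages: list[dict]) -> dict:
    # Local models are OpenAI-compatible: prefix the name and point litellm
    # at the self-hosted inference URL instead of a public provider.
    api_base = api_key = None
    if model in LOCAL_MODELS:
        api_base, api_key = model_inference_url, model_api_key
        model = f"openai/{model}"
    return await acompletion(model=model, messages=messages, api_base=api_base, api_key=api_key)
```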
diff --git a/agents-api/agents_api/embed_models_registry.py b/agents-api/agents_api/embed_models_registry.py
index babf1be22..aa5f5ee5a 100644
--- a/agents-api/agents_api/embed_models_registry.py
+++ b/agents-api/agents_api/embed_models_registry.py
@@ -10,7 +10,7 @@
PromptTooBigError,
UnknownTokenizerError,
)
-from agents_api.env import docs_embedding_service_url
+from agents_api.env import embedding_service_url
def normalize_l2(x):
@@ -83,7 +83,7 @@ async def embed(
embeddings = await embed(
input,
embedding_service_url=self.embedding_service_url
- or docs_embedding_service_url,
+ or embedding_service_url,
embedding_model_name=self.embedding_model_name,
)
elif self.embedding_provider == "openai":
@@ -130,7 +130,7 @@ def normalize(
tokenizer=tiktoken.encoding_for_model("text-embedding-3-large"),
),
"Alibaba-NLP/gte-large-en-v1.5": EmbeddingModel(
- embedding_service_url=docs_embedding_service_url,
+ embedding_service_url=embedding_service_url,
embedding_provider="julep",
embedding_model_name="Alibaba-NLP/gte-large-en-v1.5",
original_embedding_dimensions=1024,
@@ -139,7 +139,7 @@ def normalize(
tokenizer=Tokenizer.from_pretrained("Alibaba-NLP/gte-large-en-v1.5"),
),
"BAAI/bge-m3": EmbeddingModel(
- embedding_service_url=docs_embedding_service_url,
+ embedding_service_url=embedding_service_url,
embedding_provider="julep",
embedding_model_name="BAAI/bge-m3",
original_embedding_dimensions=1024,
@@ -148,7 +148,7 @@ def normalize(
tokenizer=Tokenizer.from_pretrained("BAAI/bge-m3"),
),
"BAAI/llm-embedder": EmbeddingModel(
- embedding_service_url=docs_embedding_service_url,
+ embedding_service_url=embedding_service_url,
embedding_provider="julep",
embedding_model_name="BAAI/llm-embedder",
original_embedding_dimensions=1024,
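For reference, registry entries are keyed by model id and resolved via `EmbeddingModel.from_model_name` (as used in embed_docs.py above). A hedged lookup sketch; the import path is an assumption:

```python
from agents_api.embed_models_registry import EmbeddingModel  # assumed import path

model = EmbeddingModel.from_model_name("BAAI/bge-m3")
assert model.original_embedding_dimensions == 1024  # per the registry entry above
```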
diff --git a/agents-api/agents_api/env.py b/agents-api/agents_api/env.py
index 2825aba27..15b83d408 100644
--- a/agents-api/agents_api/env.py
+++ b/agents-api/agents_api/env.py
@@ -44,20 +44,12 @@
"SKIP_CHECK_DEVELOPER_HEADERS", default=False
)
-# embedding service URL
embedding_service_url: str = env.str(
- "EMBEDDING_SERVICE_URL", default="http://0.0.0.0:8082/embed"
+ "EMBEDDING_SERVICE_URL", default="http://0.0.0.0:8083/embed"
)
-docs_embedding_service_url: str = env.str(
- "DOCS_EMBEDDING_SERVICE_URL", default="http://0.0.0.0:8083/embed"
-)
-
-embedding_model_id: str = env.str(
- "EMBEDDING_MODEL_ID", default="BAAI/bge-large-en-v1.5"
-)
-docs_embedding_model_id: str = env.str("DOCS_EMBEDDING_MODEL_ID", default="BAAI/bge-m3")
+embedding_model_id: str = env.str("EMBEDDING_MODEL_ID", default="BAAI/bge-m3")
truncate_embed_text: bool = env.bool("TRUNCATE_EMBED_TEXT", default=False)
@@ -84,8 +76,7 @@
temporal_worker_url=temporal_worker_url,
temporal_namespace=temporal_namespace,
openai_api_key=openai_api_key,
- docs_embedding_model_id=docs_embedding_model_id,
- docs_embedding_service_url=docs_embedding_service_url,
+ docs_embedding_service_url=embedding_service_url,
embedding_model_id=embedding_model_id,
)
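With the DOCS_* pair gone, one variable now drives both the session and docs paths. A small sketch of how the consolidated settings resolve, mirroring the `env.str` pattern above:

```python
from environs import Env

env = Env()
env.read_env()  # picks up a local .env if present

embedding_service_url = env.str("EMBEDDING_SERVICE_URL", default="http://0.0.0.0:8083/embed")
embedding_model_id = env.str("EMBEDDING_MODEL_ID", default="BAAI/bge-m3")
print(embedding_model_id, "->", embedding_service_url)
```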
diff --git a/agents-api/agents_api/model_registry.py b/agents-api/agents_api/model_registry.py
index 350b5559b..aa3cb6854 100644
--- a/agents-api/agents_api/model_registry.py
+++ b/agents-api/agents_api/model_registry.py
@@ -2,7 +2,8 @@
Model Registry maintains a list of supported models and their configs.
"""
-from typing import Dict
+import ast
+import json
from agents_api.clients.worker.types import ChatML
from agents_api.common.exceptions.agents import (
AgentModelNotValid,
@@ -10,6 +11,10 @@
)
import litellm
from litellm.utils import get_valid_models
+from pydantic import BaseModel
+from typing import Dict, Literal, Optional
+import xml.etree.ElementTree as ET
+
GPT4_MODELS: Dict[str, int] = {
# stable model names:
@@ -93,16 +98,56 @@
OPENAI_MODELS = {**GPT4_MODELS, **TURBO_MODELS, **GPT3_5_MODELS, **GPT3_MODELS}
-JULEP_MODELS = {
+LOCAL_MODELS = {
"julep-ai/samantha-1-turbo": 32768,
"julep-ai/samantha-1-turbo-awq": 32768,
+ "TinyLlama/TinyLlama_v1.1": 2048,
+ "casperhansen/llama-3-8b-instruct-awq": 8192,
+ "julep-ai/Hermes-2-Theta-Llama-3-8B": 8192,
+ "OpenPipe/Hermes-2-Theta-Llama-3-8B-32k": 32768,
+}
+
+LOCAL_MODELS_WITH_TOOL_CALLS = {
+ "OpenPipe/Hermes-2-Theta-Llama-3-8B-32k": 32768,
+ "julep-ai/Hermes-2-Theta-Llama-3-8B": 8192,
}
CHAT_MODELS = {**GPT4_MODELS, **TURBO_MODELS, **CLAUDE_MODELS}
+ALL_AVAILABLE_MODELS = litellm.model_list + list(LOCAL_MODELS.keys())
+VALID_MODELS = get_valid_models() + list(LOCAL_MODELS.keys())
+
+
+class FunctionCall(BaseModel):
+ arguments: dict
+ """
+ The arguments to call the function with, as generated by the model in JSON
+ format. Note that the model does not always generate valid JSON, and may
+ hallucinate parameters not defined by your function schema. Validate the
+ arguments in your code before calling your function.
+ """
+
+ name: str
+ """The name of the function to call."""
-ALL_AVAILABLE_MODELS = litellm.model_list + list(JULEP_MODELS.keys())
-VALID_MODELS = get_valid_models() + list(JULEP_MODELS.keys())
+
+class FunctionDefinition(BaseModel):
+ name: str
+ description: Optional[str] = None
+ parameters: Optional[Dict[str, object]] = None
+
+
+class FunctionSignature(BaseModel):
+ function: FunctionDefinition
+ type: Literal["function"]
+
+
+class PromptSchema(BaseModel):
+ Role: str
+ Objective: str
+ Tools: str
+ Schema: str
+ Instructions: str
def validate_configuration(model: str):
@@ -127,7 +172,7 @@ def load_context(init_context: list[ChatML], model: str):
}
for msg in init_context
]
- elif model in JULEP_MODELS:
+ elif model in LOCAL_MODELS:
init_context = [
{"name": msg.name, "role": msg.role, "content": msg.content}
for msg in init_context
@@ -137,6 +182,54 @@ def load_context(init_context: list[ChatML], model: str):
return init_context
+def validate_and_extract_tool_calls(assistant_content):
+ validation_result = False
+ tool_calls = []
+ error_message = None
+
+ try:
+ # wrap content in root element
+        xml_root_element = f"<root>{assistant_content}</root>"
+ root = ET.fromstring(xml_root_element)
+
+ # extract JSON data
+ for element in root.findall(".//tool_call"):
+ json_data = None
+ try:
+ if element.text is None:
+ continue
+
+ json_text = element.text.strip()
+
+ try:
+ # Prioritize json.loads for better error handling
+ json_data = json.loads(json_text)
+ except json.JSONDecodeError as json_err:
+ try:
+ # Fallback to ast.literal_eval if json.loads fails
+ json_data = ast.literal_eval(json_text)
+ except (SyntaxError, ValueError) as eval_err:
+ error_message = (
+ f"JSON parsing failed with both json.loads and ast.literal_eval:\n"
+ f"- JSON Decode Error: {json_err}\n"
+ f"- Fallback Syntax/Value Error: {eval_err}\n"
+ f"- Problematic JSON text: {json_text}"
+ )
+ continue
+ except BaseException as e:
+ error_message = f"Cannot strip text: {e}"
+
+ if json_data is not None:
+ tool_calls.append(json_data)
+ validation_result = True
+
+ except ET.ParseError as err:
+ error_message = f"XML Parse Error: {err}"
+
+ # Return default values if no valid data is extracted
+ return validation_result, tool_calls, error_message
+
+
def get_extra_settings(settings):
extra_settings = (
dict(
@@ -147,7 +240,7 @@ def get_extra_settings(settings):
logit_bias=settings.logit_bias,
preset=settings.preset.name if settings.preset else None,
)
- if settings.model in JULEP_MODELS
+ if settings.model in LOCAL_MODELS
else {}
)
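A quick usage sketch for `validate_and_extract_tool_calls`; the `<tool_call>` wrapper matches the tag the XPath above searches for:

```python
from agents_api.model_registry import validate_and_extract_tool_calls

content = '<tool_call>\n{"arguments": {"location": "Paris"}, "name": "get_weather"}\n</tool_call>'
ok, calls, err = validate_and_extract_tool_calls(content)
assert ok and err is None
assert calls == [{"arguments": {"location": "Paris"}, "name": "get_weather"}]
```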
diff --git a/agents-api/agents_api/prompt_assets/sys_prompt.yml b/agents-api/agents_api/prompt_assets/sys_prompt.yml
new file mode 100644
index 000000000..0aad05160
--- /dev/null
+++ b/agents-api/agents_api/prompt_assets/sys_prompt.yml
@@ -0,0 +1,35 @@
+Role: |
+ You are a function calling AI agent with self-recursion.
+  You can call only one function at a time and analyze the data you get from function responses.
+  You are provided with function signatures within <tools></tools> XML tags.
+ The current date is: {date}.
+Objective: |
+  You may use agentic frameworks for reasoning and planning to help with the user query.
+  Please call a function and wait for function results to be provided to you in the next iteration.
+  Don't make assumptions about what values to plug into function arguments.
+  Once you have called a function, results will be fed back to you within <tool_response></tool_response> XML tags.
+  Don't make assumptions about tool results if <tool_response> XML tags are not present, since the function hasn't been executed yet.
+  Analyze the data once you get the results and call another function.
+  At each iteration please continue adding your analysis to the previous summary.
+  Your final response should directly answer the user query with an analysis or summary of the results of the function calls.
+Tools: |
+ Here are the available tools:
+ {{agent.tools}}
+  If the provided function signatures don't include the function you need, you may write executable Python code in markdown syntax and call the code_interpreter() function as follows:
+  <tool_call>
+  {{"arguments": {{"code_markdown": <python-code>, "name": "code_interpreter"}}}}
+  </tool_call>
+  Make sure that the JSON object above, including the code markdown block, is parseable with json.loads() and that the XML block is parseable with xml.etree.ElementTree.
+Schema: |
+ Use the following pydantic model json schema for each tool call you will make:
+ {schema}
+Instructions: |
+  At the very first turn you don't have <tool_results> so you shouldn't make up results.
+  Please keep a running summary with analysis of previous function results and summaries from previous iterations.
+  Do not stop calling functions until the task has been accomplished or you've reached the maximum of 10 iterations.
+  Calling multiple functions at once can overload the system and increase cost, so please call one function at a time.
+  If you plan to continue with analysis, always call another function.
+  For each function call return a valid JSON object (using double quotes) with function name and arguments within <tool_call></tool_call> XML tags as follows:
+  <tool_call>
+  {{"arguments": <args-dict>, "name": <function-name>}}
+  </tool_call>
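A hedged sketch of how this file is likely assembled into a system prompt: `PromptSchema` comes from model_registry.py, while the file path, the join order, and passing `FunctionCall`'s schema are assumptions inferred from the `{date}`/`{schema}` placeholders:

```python
import yaml

from agents_api.model_registry import FunctionCall, PromptSchema

with open("agents_api/prompt_assets/sys_prompt.yml") as f:
    prompt = PromptSchema(**yaml.safe_load(f))

# The doubled braces in the YAML survive .format() as literal braces.
system_prompt = "\n".join(
    [prompt.Role, prompt.Objective, prompt.Tools, prompt.Schema, prompt.Instructions]
).format(date="2024-06-24", schema=FunctionCall.model_json_schema())
```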
diff --git a/agents-api/agents_api/rec_sum/generate.py b/agents-api/agents_api/rec_sum/generate.py
index e05111faa..e1a2eb607 100644
--- a/agents-api/agents_api/rec_sum/generate.py
+++ b/agents-api/agents_api/rec_sum/generate.py
@@ -1,6 +1,6 @@
from tenacity import retry, stop_after_attempt, wait_fixed
from agents_api.env import model_inference_url, model_api_key
-from agents_api.model_registry import JULEP_MODELS
+from agents_api.model_registry import LOCAL_MODELS
from litellm import acompletion
@@ -11,7 +11,7 @@ async def generate(
**kwargs,
) -> dict:
base_url, api_key = None, None
- if model in JULEP_MODELS:
+ if model in LOCAL_MODELS:
base_url, api_key = model_inference_url, model_api_key
model = f"openai/{model}"
diff --git a/agents-api/agents_api/routers/sessions/session.py b/agents-api/agents_api/routers/sessions/session.py
index da9121366..fc6309083 100644
--- a/agents-api/agents_api/routers/sessions/session.py
+++ b/agents-api/agents_api/routers/sessions/session.py
@@ -25,13 +25,14 @@
from ...common.utils.json import CustomJSONEncoder
from ...common.utils.messages import stringify_content
from ...env import (
- docs_embedding_service_url,
- docs_embedding_model_id,
+ embedding_service_url,
+ embedding_model_id,
)
from ...model_registry import (
- JULEP_MODELS,
- get_extra_settings,
+ LOCAL_MODELS,
+ LOCAL_MODELS_WITH_TOOL_CALLS,
load_context,
+ validate_and_extract_tool_calls,
)
from ...models.entry.add_entries import add_entries_query
from ...models.entry.proc_mem_context import proc_mem_context_query
@@ -261,8 +262,8 @@ async def forward(
]
],
join_inputs=False,
- embedding_service_url=docs_embedding_service_url,
- embedding_model_name=docs_embedding_model_id,
+ embedding_service_url=embedding_service_url,
+ embedding_model_name=embedding_model_id,
)
entries: list[Entry] = []
@@ -357,7 +358,10 @@ async def forward(
and message.content[0].type == "text"
):
message.content = message.content[0].text
-
+ # Add tools to settings
+ if tools:
+ settings.tools = settings.tools or []
+ settings.tools.extend(tools)
# If render_templates=True, render the templates
if session_data is not None and session_data.render_templates:
@@ -378,6 +382,7 @@ async def forward(
"name": session_data.agent_name,
"about": session_data.agent_about,
"metadata": session_data.agent_metadata,
+ "tools": settings.tools,
},
}
@@ -392,11 +397,6 @@ async def forward(
if session_data is not None:
settings.model = session_data.model
- # Add tools to settings
- if tools:
- settings.tools = settings.tools or []
- settings.tools.extend(tools)
-
return messages, settings, doc_ids
@cache
@@ -408,7 +408,7 @@ async def generate(
api_base = None
api_key = None
model = settings.model
- if model in JULEP_MODELS:
+ if model in LOCAL_MODELS:
api_base = model_inference_url
api_key = model_api_key
model = f"openai/{model}"
@@ -416,11 +416,8 @@ async def generate(
if settings.tools:
tools = [(tool.model_dump(exclude="id")) for tool in settings.tools]
- extra_body = get_extra_settings(settings)
-
litellm.drop_params = True
litellm.add_function_to_prompt = True
-
res = await acompletion(
model=model,
messages=init_context,
@@ -435,9 +432,18 @@ async def generate(
response_format=settings.response_format,
api_base=api_base,
api_key=api_key,
- **extra_body,
)
-
+ if model in LOCAL_MODELS_WITH_TOOL_CALLS:
+ validation, tool_call, error_msg = validate_and_extract_tool_calls(
+ res.choices[0].message.content
+ )
+ if validation:
+ res.choices[0].message.role = (
+ "function_call" if tool_call else "assistant"
+ )
+ res.choices[0].finish_reason = "tool_calls"
+ res.choices[0].message.tool_calls = tool_call
+ res.choices[0].message.content = json.dumps(tool_call)
return res
async def backward(
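A sketch of the response rewrite in `generate()` above for a local tool-calling model; note that session.py needs `import json` in scope for the `json.dumps` call:

```python
import json

from agents_api.model_registry import validate_and_extract_tool_calls

raw = '<tool_call>\n{"arguments": {"code_markdown": "print(1)", "name": "code_interpreter"}}\n</tool_call>'
ok, tool_calls, _ = validate_and_extract_tool_calls(raw)
if ok:
    message = {
        "role": "function_call" if tool_calls else "assistant",
        "tool_calls": tool_calls,
        "content": json.dumps(tool_calls),  # stringified copy, as in generate() above
    }
```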
diff --git a/agents-api/docker-compose.yml b/agents-api/docker-compose.yml
index ae21334d9..150473c3c 100644
--- a/agents-api/docker-compose.yml
+++ b/agents-api/docker-compose.yml
@@ -59,9 +59,9 @@ services:
container_name: text-embeddings-inference
environment:
- DTYPE=float16
- - MODEL_ID=BAAI/llm-embedder
+ - MODEL_ID=BAAI/bge-m3
- image: ghcr.io/huggingface/text-embeddings-inference:1.0
+ image: ghcr.io/huggingface/text-embeddings-inference:1.3
ports:
- "8082:80"
volumes:
@@ -75,25 +75,6 @@ services:
count: all
capabilities: [gpu]
- docs-text-embeddings-inference:
- container_name: docs-text-embeddings-inference
- environment:
- - DTYPE=float16
- - MODEL_ID=BAAI/bge-m3
-
- image: ghcr.io/huggingface/text-embeddings-inference:1.0
- ports:
- - "8083:80"
- volumes:
- - ~/.cache/huggingface/hub:/data
- shm_size: "2gb"
- deploy:
- resources:
- reservations:
- devices:
- - driver: nvidia
- count: all
- capabilities: [gpu]
temporal:
image: julepai/temporal:dev
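With the duplicate service removed, everything embeds through the single TEI container on 8082. A smoke test (TEI exposes POST /embed with an `{"inputs": ...}` body; httpx is an assumption, any HTTP client works):

```python
import httpx

resp = httpx.post("http://localhost:8082/embed", json={"inputs": "hello world"})
resp.raise_for_status()
print(len(resp.json()[0]))  # bge-m3 embeddings are 1024-dimensional
```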
diff --git a/model-serving/Dockerfile b/model-serving/Dockerfile
index 0b9f16a4d..b25c81eba 100644
--- a/model-serving/Dockerfile
+++ b/model-serving/Dockerfile
@@ -1,35 +1,12 @@
-# Use vllm/vllm-openai:0.3.2 as a base image
-FROM vllm/vllm-openai:v0.3.3 as builder
+FROM vllm/vllm-openai:v0.5.0 as base
-# Set environment variables
-ENV PYTHONUNBUFFERED True
-ENV POETRY_CACHE_DIR=/tmp/poetry_cache
-
-# Set the working directory
-WORKDIR /app
-
-# Install Poetry
-# Disable Poetry's virtual environment as Docker provides isolation
-RUN pip install --no-cache-dir --upgrade poetry \
- && poetry config virtualenvs.create false
-
-# Copy only the Poetry configuration files
-COPY pyproject.toml poetry.lock ./
-
-# Install dependencies only, excluding the application itself
-RUN poetry install --no-dev --no-root
-
-# Copy the rest of your application code
-COPY . .
-
-# Now, install the application with Poetry, which will not re-install the dependencies
-RUN poetry install --no-dev
# Define the entrypoint
-ENV MODEL_NAME julep-ai/samantha-1-turbo
+ENV MODEL_NAME julep-ai/Hermes-2-Theta-Llama-3-8B
ENV TP_SIZE 1
ENV MAX_MODEL_LEN 8192
ENV MAX_NUM_SEQS 1
ENV GPU_MEMORY_UTILIZATION 0.95
ENV DTYPE bfloat16
-ENTRYPOINT python3 -m model_api --model $MODEL_NAME --tensor-parallel-size $TP_SIZE --enforce-eager --gpu-memory-utilization $GPU_MEMORY_UTILIZATION --max-model-len $MAX_MODEL_LEN --max-num-seqs $MAX_NUM_SEQS --dtype $DTYPE
+ENV MODEL_API_KEY myauthkey
+ENTRYPOINT python3 -m vllm.entrypoints.openai.api_server --model $MODEL_NAME --tensor-parallel-size $TP_SIZE --enforce-eager --gpu-memory-utilization $GPU_MEMORY_UTILIZATION --max-model-len $MAX_MODEL_LEN --max-num-seqs $MAX_NUM_SEQS --dtype $DTYPE --trust-remote-code --api-key $MODEL_API_KEY
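A hedged sketch of talking to the server this ENTRYPOINT starts: vLLM's api_server speaks the OpenAI protocol, and port 8000 matches MODEL_API_URL in .env.example (the openai>=1.x client is an assumption):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="myauthkey")
chat = client.chat.completions.create(
    model="julep-ai/Hermes-2-Theta-Llama-3-8B",
    messages=[{"role": "user", "content": "ping"}],
)
print(chat.choices[0].message.content)
```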
diff --git a/model-serving/artifacts/nous-llama-fix.ipynb b/model-serving/artifacts/nous-llama-fix.ipynb
new file mode 100644
index 000000000..0cfad097e
--- /dev/null
+++ b/model-serving/artifacts/nous-llama-fix.ipynb
@@ -0,0 +1,278 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 86,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/sidbin/miniconda3/envs/julep/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
+ " warnings.warn(\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "b9c2b4fa7a02414581da3b7ec438472a",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+        "tokenizer_config.json: 0%|          | 0.00/50.7k [00:00<?, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "4ad1285872434e7994a2451f789c4b1d",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+        "tokenizer.json: 0%|          | 0.00/9.09M [00:00<?, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "8a55dcc3ed7245289cd56a9b897e9b41",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+        "special_tokens_map.json: 0%|          | 0.00/444 [00:00<?, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, AutoModel\n",
+ "\n",
+ "\n",
+ "main_tokenizer = AutoTokenizer.from_pretrained(\"NousResearch/Hermes-2-Theta-Llama-3-8B\")\n",
+ "stable_tokenizer = AutoTokenizer.from_pretrained(\n",
+ " \"NousResearch/Hermes-2-Theta-Llama-3-8B\",\n",
+ " revision=\"dd2bfa013380639acf9e8fa45ceacbc45fb44081\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 75,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "config = AutoConfig.from_pretrained(\n",
+ " \"NousResearch/Hermes-2-Theta-Llama-3-8B\",\n",
+ " revision=\"dd2bfa013380639acf9e8fa45ceacbc45fb44081\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "token_id_to_change = 128005\n",
+ "new_token = stable_tokenizer.convert_ids_to_tokens(token_id_to_change)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "old_token = main_tokenizer.convert_ids_to_tokens(token_id_to_change)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 89,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "('<|reserved_special_token_3|>', '')"
+ ]
+ },
+ "execution_count": 89,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "old_token, new_token"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 79,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "8fbb2345fcca46c5b4f8cdbf7fac4901",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+        "Fetching 12 files: 0%|          | 0/12 [00:00<?, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "from huggingface_hub import login, snapshot_download, create_repo, Repository\n",
+ "\n",
+ "# Step 3: Clone the Original Model Repository\n",
+ "model_name = \"NousResearch/Hermes-2-Theta-Llama-3-8B\"\n",
+ "snapshot = snapshot_download(\n",
+ " repo_id=model_name,\n",
+ " repo_type=\"model\",\n",
+ " revision=\"dd2bfa013380639acf9e8fa45ceacbc45fb44081\",\n",
+ " local_dir=model_name,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 82,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "new_repo_name = \"julep-ai/Hermes-2-Theta-Llama-3-8B\"\n",
+ "create_repo(repo_id=new_repo_name, repo_type=\"model\", private=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 85,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "c21ecaf4ac474112a0d7b4c2cc9db3fc",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+        "model-00003-of-00004.safetensors: 0%|          | 0.00/4.92G [00:00<?, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "8f7bcf4eb58a429b921f3a44f28b3da9",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+        "model-00001-of-00004.safetensors: 0%|          | 0.00/4.98G [00:00<?, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "7cf4db172c9e40f9920c7c59187bbce3",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+        "model-00002-of-00004.safetensors: 0%|          | 0.00/5.00G [00:00<?, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "40bc6816734444bbadb467938c772a78",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+        "model-00004-of-00004.safetensors: 0%|          | 0.00/1.17G [00:00<?, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "8c0901790c0a4d0393405b0c7f89cdeb",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+        "Upload 4 LFS files: 0%|          | 0/4 [00:00<?, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "CommitInfo(commit_url='https://huggingface.co/julep-ai/Hermes-2-Theta-Llama-3-8B/commit/2b3a264ffc8a3bfe6f127b324762a47eadf30289', commit_message='Upload folder using huggingface_hub', commit_description='', oid='2b3a264ffc8a3bfe6f127b324762a47eadf30289', pr_url=None, pr_revision=None, pr_num=None)"
+ ]
+ },
+ "execution_count": 85,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from huggingface_hub import upload_folder\n",
+ "\n",
+ "upload_folder(repo_id=new_repo_name, repo_type=\"model\", folder_path=snapshot)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "julep",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.14"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
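A hedged follow-up to the notebook: the re-uploaded julep-ai repo should keep the pinned revision's token at id 128005 (the repo is created private above, so this assumes an authenticated Hugging Face session):

```python
from transformers import AutoTokenizer

fixed = AutoTokenizer.from_pretrained("julep-ai/Hermes-2-Theta-Llama-3-8B")
stable = AutoTokenizer.from_pretrained(
    "NousResearch/Hermes-2-Theta-Llama-3-8B",
    revision="dd2bfa013380639acf9e8fa45ceacbc45fb44081",
)
assert fixed.convert_ids_to_tokens(128005) == stable.convert_ids_to_tokens(128005)
```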
diff --git a/model-serving/docker-compose.yml b/model-serving/docker-compose.yml
index 2342c03e0..4e2447cd6 100644
--- a/model-serving/docker-compose.yml
+++ b/model-serving/docker-compose.yml
@@ -6,10 +6,8 @@ services:
env_file: "../.env"
environment:
- - MODEL_API_KEY=${MODEL_API_KEY}
- - MODEL_API_KEY_HEADER_NAME=${MODEL_API_KEY_HEADER_NAME}
- - SKIP_CHECK_DEVELOPER_HEADERS=${SKIP_CHECK_DEVELOPER_HEADERS}
- MODEL_NAME=${MODEL_NAME}
+ - MODEL_API_KEY=${MODEL_API_KEY}
- HF_TOKEN=${HF_TOKEN}
- HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}
- SENTRY_DSN=${SENTRY_DSN}