diff --git a/.env.example b/.env.example
index fd0bece7d..ce2d956a4 100644
--- a/.env.example
+++ b/.env.example
@@ -5,24 +5,27 @@ COZO_AUTH_TOKEN=myauthkey
 COZO_HOST=http://memory-store:9070
 COZO_PORT=9070
 COZO_ROCKSDB_DIR=cozo.db
-DTYPE=bfloat16
+DTYPE=float16
 EMBEDDING_SERVICE_URL=http://text-embeddings-inference/embed
 GATEWAY_PORT=80
-OPENAI_API_KEY=""
-GPU_MEMORY_UTILIZATION=0.95
+GPU_MEMORY_UTILIZATION=0.90
+
 HF_TOKEN=""
 HUGGING_FACE_HUB_TOKEN=""
-JWT_SHARED_KEY=this_shared_key_is_32_48_or_64_bytes_long
-MAX_MODEL_LEN=1024
+JWT_SHARED_KEY=
+
+MAX_MODEL_LEN=8192
 MAX_NUM_SEQS=1
 MNT_DIR=/data
-GF_SECURITY_ADMIN_PASSWORD=changethis
 MODEL_API_KEY=myauthkey
 MODEL_API_KEY_HEADER_NAME=Authorization
 MODEL_API_URL=http://model-serving:8000
 MODEL_INFERENCE_URL=http://model-serving:8000/v1
-MODEL_ID=BAAI/llm-embedder
-MODEL_NAME = "julep-ai/samantha-1-turbo"
+MODEL_ID=BAAI/bge-m3
+
+# MODEL_NAME="OpenPipe/Hermes-2-Theta-Llama-3-8B-32k"
+MODEL_NAME="julep-ai/Hermes-2-Theta-Llama-3-8B"
+
 SKIP_CHECK_DEVELOPER_HEADERS=true
 SUMMARIZATION_TOKENS_THRESHOLD=2048
 TEMPERATURE_SCALING_FACTOR=0.9
@@ -33,4 +36,8 @@ TEMPORAL_WORKER_URL=temporal:7233
 TP_SIZE=1
 TRUNCATE_EMBED_TEXT=true
 TRAEFIK_LOG_LEVEL=DEBUG
-WORKER_URL=temporal:7233
\ No newline at end of file
+WORKER_URL=temporal:7233
+
+AGENTS_API_DEBUG=false
+OPENAI_API_KEY=
+ANTHROPIC_API_KEY=
\ No newline at end of file
diff --git a/agents-api/agents_api/activities/embed_docs.py b/agents-api/agents_api/activities/embed_docs.py
index 228cee258..633e4d4de 100644
--- a/agents-api/agents_api/activities/embed_docs.py
+++ b/agents-api/agents_api/activities/embed_docs.py
@@ -1,6 +1,6 @@
 from pydantic import UUID4
 from temporalio import activity
-from agents_api.env import docs_embedding_model_id
+from agents_api.env import embedding_model_id
 from agents_api.models.docs.embed_docs import (
     embed_docs_snippets_query,
 )
@@ -13,7 +13,7 @@
 @activity.defn
 async def embed_docs(doc_id: UUID4, title: str, content: list[str]) -> None:
     indices, snippets = list(zip(*enumerate(content)))
-    model = EmbeddingModel.from_model_name(docs_embedding_model_id)
+    model = EmbeddingModel.from_model_name(embedding_model_id)
     embeddings = await model.embed(
         [
             {
diff --git a/agents-api/agents_api/activities/summarization.py b/agents-api/agents_api/activities/summarization.py
index e44fc455d..03a5f0d79 100644
--- a/agents-api/agents_api/activities/summarization.py
+++ b/agents-api/agents_api/activities/summarization.py
@@ -11,7 +11,7 @@
     entries_summarization_query,
 )
 from agents_api.common.protocol.entries import Entry
-from ..model_registry import JULEP_MODELS
+from ..model_registry import LOCAL_MODELS
 from ..env import model_inference_url, model_api_key, summarization_model_name
 from agents_api.rec_sum.entities import get_entities
 from agents_api.rec_sum.summarize import summarize_messages
@@ -135,7 +135,7 @@ async def run_prompt(
 ) -> str:
     api_base = None
     api_key = None
-    if model in JULEP_MODELS:
+    if model in LOCAL_MODELS:
         api_base = model_inference_url
         api_key = model_api_key
         model = f"openai/{model}"
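The `JULEP_MODELS` to `LOCAL_MODELS` rename recurs throughout this PR; the check it gates routes requests for self-hosted models to the vLLM container, which speaks the OpenAI wire protocol. A minimal sketch of that shared routing pattern (kwargs are illustrative, not a verbatim copy of any one callsite):

```python
# Sketch of the routing that `model in LOCAL_MODELS` gates across this PR.
# model_inference_url / model_api_key mirror MODEL_INFERENCE_URL and
# MODEL_API_KEY above; the "openai/" prefix tells litellm to use its
# OpenAI-compatible client against the local vLLM server.
from litellm import acompletion

LOCAL_MODELS = {"julep-ai/Hermes-2-Theta-Llama-3-8B": 8192}

async def complete(model: str, messages: list[dict],
                   model_inference_url: str, model_api_key: str):
    api_base, api_key = None, None
    if model in LOCAL_MODELS:
        api_base, api_key = model_inference_url, model_api_key
        model = f"openai/{model}"
    return await acompletion(model=model, messages=messages,
                             api_base=api_base, api_key=api_key)
```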
diff --git a/agents-api/agents_api/embed_models_registry.py b/agents-api/agents_api/embed_models_registry.py
index babf1be22..aa5f5ee5a 100644
--- a/agents-api/agents_api/embed_models_registry.py
+++ b/agents-api/agents_api/embed_models_registry.py
@@ -10,7 +10,7 @@
     PromptTooBigError,
     UnknownTokenizerError,
 )
-from agents_api.env import docs_embedding_service_url
+from agents_api.env import embedding_service_url
 
 
 def normalize_l2(x):
@@ -83,7 +83,7 @@ async def embed(
         embeddings = await embed(
             input,
             embedding_service_url=self.embedding_service_url
-            or docs_embedding_service_url,
+            or embedding_service_url,
             embedding_model_name=self.embedding_model_name,
         )
     elif self.embedding_provider == "openai":
@@ -130,7 +130,7 @@ def normalize(
         tokenizer=tiktoken.encoding_for_model("text-embedding-3-large"),
     ),
     "Alibaba-NLP/gte-large-en-v1.5": EmbeddingModel(
-        embedding_service_url=docs_embedding_service_url,
+        embedding_service_url=embedding_service_url,
         embedding_provider="julep",
         embedding_model_name="Alibaba-NLP/gte-large-en-v1.5",
         original_embedding_dimensions=1024,
@@ -139,7 +139,7 @@ def normalize(
         tokenizer=Tokenizer.from_pretrained("Alibaba-NLP/gte-large-en-v1.5"),
     ),
     "BAAI/bge-m3": EmbeddingModel(
-        embedding_service_url=docs_embedding_service_url,
+        embedding_service_url=embedding_service_url,
         embedding_provider="julep",
         embedding_model_name="BAAI/bge-m3",
         original_embedding_dimensions=1024,
@@ -148,7 +148,7 @@ def normalize(
         tokenizer=Tokenizer.from_pretrained("BAAI/bge-m3"),
     ),
     "BAAI/llm-embedder": EmbeddingModel(
-        embedding_service_url=docs_embedding_service_url,
+        embedding_service_url=embedding_service_url,
         embedding_provider="julep",
         embedding_model_name="BAAI/llm-embedder",
         original_embedding_dimensions=1024,
diff --git a/agents-api/agents_api/env.py b/agents-api/agents_api/env.py
index 2825aba27..15b83d408 100644
--- a/agents-api/agents_api/env.py
+++ b/agents-api/agents_api/env.py
@@ -44,20 +44,12 @@
     "SKIP_CHECK_DEVELOPER_HEADERS", default=False
 )
 
-# embedding service URL
 embedding_service_url: str = env.str(
-    "EMBEDDING_SERVICE_URL", default="http://0.0.0.0:8082/embed"
+    "EMBEDDING_SERVICE_URL", default="http://0.0.0.0:8083/embed"
 )
 
-docs_embedding_service_url: str = env.str(
-    "DOCS_EMBEDDING_SERVICE_URL", default="http://0.0.0.0:8083/embed"
-)
-
-embedding_model_id: str = env.str(
-    "EMBEDDING_MODEL_ID", default="BAAI/bge-large-en-v1.5"
-)
-docs_embedding_model_id: str = env.str("DOCS_EMBEDDING_MODEL_ID", default="BAAI/bge-m3")
+embedding_model_id: str = env.str("EMBEDDING_MODEL_ID", default="BAAI/bge-m3")
 
 truncate_embed_text: bool = env.bool("TRUNCATE_EMBED_TEXT", default=False)
 
@@ -84,8 +76,7 @@
     temporal_worker_url=temporal_worker_url,
     temporal_namespace=temporal_namespace,
     openai_api_key=openai_api_key,
-    docs_embedding_model_id=docs_embedding_model_id,
-    docs_embedding_service_url=docs_embedding_service_url,
+    docs_embedding_service_url=embedding_service_url,
     embedding_model_id=embedding_model_id,
 )
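These two files collapse the separate docs/chat embedding services into a single BAAI/bge-m3 deployment behind one `EMBEDDING_SERVICE_URL`. A hypothetical smoke test for the remaining service (the `{"inputs": ..., "truncate": ...}` request shape follows text-embeddings-inference's `/embed` API; the default URL matches the new env.py default):

```python
# Minimal sketch, assuming a TEI container serving BAAI/bge-m3 at the
# EMBEDDING_SERVICE_URL default above. Not part of this PR.
import httpx

async def embed_snippets(
    snippets: list[str],
    service_url: str = "http://0.0.0.0:8083/embed",
) -> list[list[float]]:
    async with httpx.AsyncClient(timeout=30.0) as client:
        resp = await client.post(
            service_url, json={"inputs": snippets, "truncate": True}
        )
        resp.raise_for_status()
        # One 1024-dim vector per snippet for BAAI/bge-m3, matching the
        # original_embedding_dimensions entries in the registry above.
        return resp.json()
```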
""" -from typing import Dict +import ast +import json from agents_api.clients.worker.types import ChatML from agents_api.common.exceptions.agents import ( AgentModelNotValid, @@ -10,6 +11,10 @@ ) import litellm from litellm.utils import get_valid_models +from pydantic import BaseModel +from typing import Dict, Literal, Optional +import xml.etree.ElementTree as ET + GPT4_MODELS: Dict[str, int] = { # stable model names: @@ -93,16 +98,56 @@ OPENAI_MODELS = {**GPT4_MODELS, **TURBO_MODELS, **GPT3_5_MODELS, **GPT3_MODELS} -JULEP_MODELS = { +LOCAL_MODELS = { "julep-ai/samantha-1-turbo": 32768, "julep-ai/samantha-1-turbo-awq": 32768, + "TinyLlama/TinyLlama_v1.1": 2048, + "casperhansen/llama-3-8b-instruct-awq": 8192, + "julep-ai/Hermes-2-Theta-Llama-3-8B": 8192, + "OpenPipe/Hermes-2-Theta-Llama-3-8B-32k": 32768, +} + +LOCAL_MODELS_WITH_TOOL_CALLS = { + "OpenPipe/Hermes-2-Theta-Llama-3-8B-32k": 32768, + "julep-ai/Hermes-2-Theta-Llama-3-8B": 8192, } CHAT_MODELS = {**GPT4_MODELS, **TURBO_MODELS, **CLAUDE_MODELS} +ALL_AVAILABLE_MODELS = litellm.model_list + list(LOCAL_MODELS.keys()) +VALID_MODELS = get_valid_models() + list(LOCAL_MODELS.keys()) + + +class FunctionCall(BaseModel): + arguments: dict + """ + The arguments to call the function with, as generated by the model in JSON + format. Note that the model does not always generate valid JSON, and may + hallucinate parameters not defined by your function schema. Validate the + arguments in your code before calling your function. + """ + + name: str + """The name of the function to call.""" -ALL_AVAILABLE_MODELS = litellm.model_list + list(JULEP_MODELS.keys()) -VALID_MODELS = get_valid_models() + list(JULEP_MODELS.keys()) + +class FunctionDefinition(BaseModel): + name: str + description: Optional[str] = None + parameters: Optional[Dict[str, object]] = None + + +class FunctionSignature(BaseModel): + function: FunctionDefinition + type: Literal["function"] + + +class PromptSchema(BaseModel): + Role: str + Objective: str + Tools: str + Schema: str + Instructions: str def validate_configuration(model: str): @@ -127,7 +172,7 @@ def load_context(init_context: list[ChatML], model: str): } for msg in init_context ] - elif model in JULEP_MODELS: + elif model in LOCAL_MODELS: init_context = [ {"name": msg.name, "role": msg.role, "content": msg.content} for msg in init_context @@ -137,6 +182,54 @@ def load_context(init_context: list[ChatML], model: str): return init_context +def validate_and_extract_tool_calls(assistant_content): + validation_result = False + tool_calls = [] + error_message = None + + try: + # wrap content in root element + xml_root_element = f"{assistant_content}" + root = ET.fromstring(xml_root_element) + + # extract JSON data + for element in root.findall(".//tool_call"): + json_data = None + try: + if element.text is None: + continue + + json_text = element.text.strip() + + try: + # Prioritize json.loads for better error handling + json_data = json.loads(json_text) + except json.JSONDecodeError as json_err: + try: + # Fallback to ast.literal_eval if json.loads fails + json_data = ast.literal_eval(json_text) + except (SyntaxError, ValueError) as eval_err: + error_message = ( + f"JSON parsing failed with both json.loads and ast.literal_eval:\n" + f"- JSON Decode Error: {json_err}\n" + f"- Fallback Syntax/Value Error: {eval_err}\n" + f"- Problematic JSON text: {json_text}" + ) + continue + except BaseException as e: + error_message = f"Cannot strip text: {e}" + + if json_data is not None: + tool_calls.append(json_data) + validation_result = True 
+
+    except ET.ParseError as err:
+        error_message = f"XML Parse Error: {err}"
+
+    # Return default values if no valid data is extracted
+    return validation_result, tool_calls, error_message
+
+
 def get_extra_settings(settings):
     extra_settings = (
         dict(
@@ -147,7 +240,7 @@
             logit_bias=settings.logit_bias,
             preset=settings.preset.name if settings.preset else None,
         )
-        if settings.model in JULEP_MODELS
+        if settings.model in LOCAL_MODELS
         else {}
     )
diff --git a/agents-api/agents_api/prompt_assets/sys_prompt.yml b/agents-api/agents_api/prompt_assets/sys_prompt.yml
new file mode 100644
index 000000000..0aad05160
--- /dev/null
+++ b/agents-api/agents_api/prompt_assets/sys_prompt.yml
@@ -0,0 +1,35 @@
+Role: |
+  You are a function calling AI agent with self-recursion.
+  You can call only one function at a time and analyse the data you get from the function response.
+  You are provided with function signatures within <tools></tools> XML tags.
+  The current date is: {date}.
+Objective: |
+  You may use agentic frameworks for reasoning and planning to help with the user query.
+  Please call a function and wait for function results to be provided to you in the next iteration.
+  Don't make assumptions about what values to plug into function arguments.
+  Once you have called a function, results will be fed back to you within <tool_results></tool_results> XML tags.
+  Don't make assumptions about tool results if <tool_results> XML tags are not present since the function hasn't been executed yet.
+  Analyze the data once you get the results and call another function.
+  At each iteration please continue adding your analysis to the previous summary.
+  Your final response should directly answer the user query with an analysis or summary of the results of the function calls.
+Tools: |
+  Here are the available tools:
+  <tools> {{agent.tools}} </tools>
+  If the provided function signatures don't include the function you must call, you may write executable python code in markdown syntax and call the code_interpreter() function as follows:
+  <tool_call>
+  {{"arguments": {{"code_markdown": <python-code>, "name": "code_interpreter"}}}}
+  </tool_call>
+  Make sure that the json object above with the code markdown block is parseable with json.loads() and the XML block with XML ElementTree.
+Schema: |
+  Use the following pydantic model json schema for each tool call you will make:
+  {schema}
+Instructions: |
+  At the very first turn you don't have <tool_results> so you shouldn't make up the results.
+  Please keep a running summary with analysis of previous function results and summaries from previous iterations.
+  Do not stop calling functions until the task has been accomplished or you've reached the max iteration of 10.
+  Calling multiple functions at once can overload the system and increase cost, so please call one function at a time.
+  If you plan to continue with analysis, always call another function.
+  For each function call return a valid json object (using double quotes) with function name and arguments within <tool_call></tool_call> XML tags as follows:
+  <tool_call>
+  {{"arguments": <args-dict>, "name": <function-name>}}
+  </tool_call>
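The new `PromptSchema` model in model_registry.py mirrors this file's five keys, which suggests the intended assembly step looks roughly like the sketch below. The loader itself is not part of this diff, so treat the function name and the concatenation order as hypothetical:

```python
# Hypothetical loader, not in this PR. Assumes the five sections are joined
# into one system prompt and that {date}/{schema} are filled via str.format
# (the doubled {{...}} braces in the YAML survive formatting as literals).
from datetime import date
import yaml

from agents_api.model_registry import PromptSchema

def build_system_prompt(path: str, schema_json: str) -> str:
    with open(path) as f:
        prompt = PromptSchema(**yaml.safe_load(f))
    sections = [prompt.Role, prompt.Objective, prompt.Tools,
                prompt.Schema, prompt.Instructions]
    return "\n".join(sections).format(
        date=date.today().isoformat(), schema=schema_json
    )
```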
diff --git a/agents-api/agents_api/rec_sum/generate.py b/agents-api/agents_api/rec_sum/generate.py
index e05111faa..e1a2eb607 100644
--- a/agents-api/agents_api/rec_sum/generate.py
+++ b/agents-api/agents_api/rec_sum/generate.py
@@ -1,6 +1,6 @@
 from tenacity import retry, stop_after_attempt, wait_fixed
 from agents_api.env import model_inference_url, model_api_key
-from agents_api.model_registry import JULEP_MODELS
+from agents_api.model_registry import LOCAL_MODELS
 from litellm import acompletion
 
 
@@ -11,7 +11,7 @@ async def generate(
     **kwargs,
 ) -> dict:
     base_url, api_key = None, None
-    if model in JULEP_MODELS:
+    if model in LOCAL_MODELS:
         base_url, api_key = model_inference_url, model_api_key
         model = f"openai/{model}"
diff --git a/agents-api/agents_api/routers/sessions/session.py b/agents-api/agents_api/routers/sessions/session.py
index da9121366..fc6309083 100644
--- a/agents-api/agents_api/routers/sessions/session.py
+++ b/agents-api/agents_api/routers/sessions/session.py
@@ -25,13 +25,14 @@
 from ...common.utils.json import CustomJSONEncoder
 from ...common.utils.messages import stringify_content
 from ...env import (
-    docs_embedding_service_url,
-    docs_embedding_model_id,
+    embedding_service_url,
+    embedding_model_id,
 )
 from ...model_registry import (
-    JULEP_MODELS,
-    get_extra_settings,
+    LOCAL_MODELS,
+    LOCAL_MODELS_WITH_TOOL_CALLS,
     load_context,
+    validate_and_extract_tool_calls,
 )
 from ...models.entry.add_entries import add_entries_query
 from ...models.entry.proc_mem_context import proc_mem_context_query
@@ -261,8 +262,8 @@ async def forward(
                 ]
             ],
             join_inputs=False,
-            embedding_service_url=docs_embedding_service_url,
-            embedding_model_name=docs_embedding_model_id,
+            embedding_service_url=embedding_service_url,
+            embedding_model_name=embedding_model_id,
         )
 
         entries: list[Entry] = []
@@ -357,7 +358,10 @@ async def forward(
                 and message.content[0].type == "text"
             ):
                 message.content = message.content[0].text
-
+        # Add tools to settings
+        if tools:
+            settings.tools = settings.tools or []
+            settings.tools.extend(tools)
         # If render_templates=True, render the templates
         if session_data is not None and session_data.render_templates:
 
@@ -378,6 +382,7 @@ async def forward(
                     "name": session_data.agent_name,
                     "about": session_data.agent_about,
                     "metadata": session_data.agent_metadata,
+                    "tools": settings.tools,
                 },
             }
 
@@ -392,11 +397,6 @@ async def forward(
         if session_data is not None:
             settings.model = session_data.model
 
-        # Add tools to settings
-        if tools:
-            settings.tools = settings.tools or []
-            settings.tools.extend(tools)
-
         return messages, settings, doc_ids
 
     @cache
@@ -408,7 +408,7 @@ async def generate(
         api_base = None
         api_key = None
         model = settings.model
-        if model in JULEP_MODELS:
+        if model in LOCAL_MODELS:
            api_base = model_inference_url
            api_key = model_api_key
            model = f"openai/{model}"
@@ -416,11 +416,8 @@ async def generate(
         if settings.tools:
             tools = [(tool.model_dump(exclude="id")) for tool in settings.tools]
 
-        extra_body = get_extra_settings(settings)
-
         litellm.drop_params = True
         litellm.add_function_to_prompt = True
-
         res = await acompletion(
             model=model,
             messages=init_context,
@@ -435,9 +432,18 @@ async def generate(
             response_format=settings.response_format,
             api_base=api_base,
             api_key=api_key,
-            **extra_body,
         )
-
+        if model in LOCAL_MODELS_WITH_TOOL_CALLS:
+            validation, tool_call, error_msg = validate_and_extract_tool_calls(
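Two things are happening in session.py: the "Add tools to settings" block moves above the template-rendering step so that `{{agent.tools}}` in the new system prompt sees the merged tool list, and `generate` now rewrites the local model's raw Hermes output into an OpenAI-shaped tool call. A condensed sketch of that rewrite (the dict below stands in for the litellm response object the real code mutates in place, and the `function_call` role is this codebase's own convention):

```python
import json
from agents_api.model_registry import validate_and_extract_tool_calls

# Before post-processing: the local model emits the tool call as plain text.
raw = ('<tool_call>\n'
       '{"arguments": {"location": "Berlin"}, "name": "get_weather"}\n'
       '</tool_call>')

ok, tool_calls, _err = validate_and_extract_tool_calls(raw)
if ok:
    # After: the choice is marked as a tool call and the raw text is
    # replaced with the parsed JSON, mirroring the in-place edit above.
    choice = {
        "finish_reason": "tool_calls",
        "message": {
            "role": "function_call" if tool_calls else "assistant",
            "tool_calls": tool_calls,
            "content": json.dumps(tool_calls),
        },
    }
```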
                res.choices[0].message.content
+            )
+            if validation:
+                res.choices[0].message.role = (
+                    "function_call" if tool_call else "assistant"
+                )
+                res.choices[0].finish_reason = "tool_calls"
+                res.choices[0].message.tool_calls = tool_call
+                res.choices[0].message.content = json.dumps(tool_call)
         return res
 
     async def backward(
diff --git a/agents-api/docker-compose.yml b/agents-api/docker-compose.yml
index ae21334d9..150473c3c 100644
--- a/agents-api/docker-compose.yml
+++ b/agents-api/docker-compose.yml
@@ -59,9 +59,9 @@ services:
     container_name: text-embeddings-inference
     environment:
       - DTYPE=float16
-      - MODEL_ID=BAAI/llm-embedder
+      - MODEL_ID=BAAI/bge-m3
 
-    image: ghcr.io/huggingface/text-embeddings-inference:1.0
+    image: ghcr.io/huggingface/text-embeddings-inference:1.3
     ports:
       - "8082:80"
     volumes:
@@ -75,25 +75,6 @@ services:
           count: all
           capabilities: [gpu]
 
-  docs-text-embeddings-inference:
-    container_name: docs-text-embeddings-inference
-    environment:
-      - DTYPE=float16
-      - MODEL_ID=BAAI/bge-m3
-
-    image: ghcr.io/huggingface/text-embeddings-inference:1.0
-    ports:
-      - "8083:80"
-    volumes:
-      - ~/.cache/huggingface/hub:/data
-    shm_size: "2gb"
-    deploy:
-      resources:
-        reservations:
-          devices:
-            - driver: nvidia
-              count: all
-              capabilities: [gpu]
 
   temporal:
     image: julepai/temporal:dev
diff --git a/model-serving/Dockerfile b/model-serving/Dockerfile
index 0b9f16a4d..b25c81eba 100644
--- a/model-serving/Dockerfile
+++ b/model-serving/Dockerfile
@@ -1,35 +1,12 @@
-# Use vllm/vllm-openai:0.3.2 as a base image
-FROM vllm/vllm-openai:v0.3.3 as builder
+FROM vllm/vllm-openai:v0.5.0 as base
 
-# Set environment variables
-ENV PYTHONUNBUFFERED True
-ENV POETRY_CACHE_DIR=/tmp/poetry_cache
-
-# Set the working directory
-WORKDIR /app
-
-# Install Poetry
-# Disable Poetry's virtual environment as Docker provides isolation
-RUN pip install --no-cache-dir --upgrade poetry \
-    && poetry config virtualenvs.create false
-
-# Copy only the Poetry configuration files
-COPY pyproject.toml poetry.lock ./
-
-# Install dependencies only, excluding the application itself
-RUN poetry install --no-dev --no-root
-
-# Copy the rest of your application code
-COPY . .
-
-# Now, install the application with Poetry, which will not re-install the dependencies
-RUN poetry install --no-dev
 
 # Define the entrypoint
-ENV MODEL_NAME julep-ai/samantha-1-turbo
+ENV MODEL_NAME julep-ai/Hermes-2-Theta-Llama-3-8B
 ENV TP_SIZE 1
 ENV MAX_MODEL_LEN 8192
 ENV MAX_NUM_SEQS 1
 ENV GPU_MEMORY_UTILIZATION 0.95
 ENV DTYPE bfloat16
-ENTRYPOINT python3 -m model_api --model $MODEL_NAME --tensor-parallel-size $TP_SIZE --enforce-eager --gpu-memory-utilization $GPU_MEMORY_UTILIZATION --max-model-len $MAX_MODEL_LEN --max-num-seqs $MAX_NUM_SEQS --dtype $DTYPE
+ENV MODEL_API_KEY myauthkey
+ENTRYPOINT python3 -m vllm.entrypoints.openai.api_server --model $MODEL_NAME --tensor-parallel-size $TP_SIZE --enforce-eager --gpu-memory-utilization $GPU_MEMORY_UTILIZATION --max-model-len $MAX_MODEL_LEN --max-num-seqs $MAX_NUM_SEQS --dtype $DTYPE --trust-remote-code --api_key=$MODEL_API_KEY
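With the custom Poetry build stage removed, the image now runs stock vLLM, so any OpenAI-compatible client can exercise it directly. A hedged smoke test, assuming the model-serving container is reachable on localhost:8000 and was started with the `MODEL_API_KEY=myauthkey` default set above:

```python
# Minimal sketch against the vLLM OpenAI-compatible server; host, port, and
# key are the assumed local defaults from this Dockerfile, not fixed values.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="myauthkey")
resp = client.chat.completions.create(
    model="julep-ai/Hermes-2-Theta-Llama-3-8B",
    messages=[{"role": "user", "content": "Say hi in one word."}],
    max_tokens=8,
)
print(resp.choices[0].message.content)
```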
diff --git a/model-serving/artifacts/nous-llama-fix.ipynb b/model-serving/artifacts/nous-llama-fix.ipynb
new file mode 100644
index 000000000..0cfad097e
--- /dev/null
+++ b/model-serving/artifacts/nous-llama-fix.ipynb
@@ -0,0 +1,278 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 86,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/sidbin/miniconda3/envs/julep/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
+      "  warnings.warn(\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "b9c2b4fa7a02414581da3b7ec438472a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "tokenizer_config.json:   0%|          | 0.00/50.7k [00:00', '')"
+      ]
+     },
+     "execution_count": 89,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "old_token, new_token"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 79,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "8fbb2345fcca46c5b4f8cdbf7fac4901",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Fetching 12 files:   0%|          | 0/12 [00:00