From 646daa69d15f99bdc7739a8c345e56c17979a728 Mon Sep 17 00:00:00 2001 From: Juan Calderon-Perez <835733+gaby@users.noreply.github.com> Date: Tue, 19 Sep 2023 23:10:12 -0400 Subject: [PATCH] Add support for using dynamic threads (#756) --- api/src/serge/models/chat.py | 1 - api/src/serge/routers/chat.py | 5 +++-- api/src/serge/utils/llm.py | 1 - web/src/routes/+layout.svelte | 2 +- web/src/routes/+page.svelte | 15 --------------- web/src/routes/chat/[id]/+page.svelte | 23 +---------------------- web/src/routes/chat/[id]/+page.ts | 1 - 7 files changed, 5 insertions(+), 43 deletions(-) diff --git a/api/src/serge/models/chat.py b/api/src/serge/models/chat.py index e5cc7b1a0ec..9b535b3697e 100644 --- a/api/src/serge/models/chat.py +++ b/api/src/serge/models/chat.py @@ -14,7 +14,6 @@ class ChatParameters(BaseModel): # logits_all: bool # vocab_only: bool # use_mlock: bool - n_threads: int # n_batch: int last_n_tokens_size: int max_tokens: int diff --git a/api/src/serge/routers/chat.py b/api/src/serge/routers/chat.py index d026743e4b4..47b6b866913 100644 --- a/api/src/serge/routers/chat.py +++ b/api/src/serge/routers/chat.py @@ -1,3 +1,5 @@ +import os + from typing import Optional from fastapi import APIRouter from langchain.memory import RedisChatMessageHistory @@ -28,7 +30,6 @@ async def create_new_chat( repeat_last_n: int = 64, repeat_penalty: float = 1.3, init_prompt: str = "Below is an instruction that describes a task. Write a response that appropriately completes the request.", - n_threads: int = 4, ): try: client = Llama( @@ -51,7 +52,7 @@ async def create_new_chat( n_gpu_layers=gpu_layers, last_n_tokens_size=repeat_last_n, repeat_penalty=repeat_penalty, - n_threads=n_threads, + n_threads=len(os.sched_getaffinity(0)), init_prompt=init_prompt, ) # create the chat diff --git a/api/src/serge/utils/llm.py b/api/src/serge/utils/llm.py index 749b22f62e0..45422c7204a 100644 --- a/api/src/serge/utils/llm.py +++ b/api/src/serge/utils/llm.py @@ -119,7 +119,6 @@ def _default_params(self) -> dict[str, Any]: "stop_sequences": self.stop_sequences, "repeat_penalty": self.repeat_penalty, "top_k": self.top_k, - "n_threads": self.n_threads, "n_ctx": self.n_ctx, "n_gpu_layers": self.n_gpu_layers, "n_parts": self.n_parts, diff --git a/web/src/routes/+layout.svelte b/web/src/routes/+layout.svelte index b5289ed0c90..98e2a358bd6 100644 --- a/web/src/routes/+layout.svelte +++ b/web/src/routes/+layout.svelte @@ -104,7 +104,7 @@ `/api/chat/?model=${dataCht.params.model_path}&temperature=${dataCht.params.temperature}&top_k=${dataCht.params.top_k}` + `&top_p=${dataCht.params.top_p}&max_length=${dataCht.params.max_tokens}&context_window=${dataCht.params.n_ctx}` + `&repeat_last_n=${dataCht.params.last_n_tokens_size}&repeat_penalty=${dataCht.params.repeat_penalty}` + - `&n_threads=${dataCht.params.n_threads}&init_prompt=${dataCht.history[0].data.content}` + + `&init_prompt=${dataCht.history[0].data.content}` + `&gpu_layers=${dataCht.params.n_gpu_layers}`, { diff --git a/web/src/routes/+page.svelte b/web/src/routes/+page.svelte index 83175868981..91388f8c5b2 100644 --- a/web/src/routes/+page.svelte +++ b/web/src/routes/+page.svelte @@ -23,7 +23,6 @@ let init_prompt = "Below is an instruction that describes a task. Write a response that appropriately completes the request."; - let n_threads = 4; let context_window = 2048; let gpu_layers = 0; @@ -226,20 +225,6 @@ {/each} -