From 3a31295ef876f64bd70379acebcf084541c931e7 Mon Sep 17 00:00:00 2001 From: XinyaoWa Date: Wed, 18 Sep 2024 20:39:23 +0800 Subject: [PATCH] Align parameters for "max_token, repetition_penalty,presence_penalty,frequency_penalty" (#608) * align max_tokens Signed-off-by: Xinyao Wang * aligin repetition_penalty Signed-off-by: Xinyao Wang * fix bug Signed-off-by: Xinyao Wang * align penalty parameters Signed-off-by: Xinyao Wang * fix bug Signed-off-by: Xinyao Wang * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix bug Signed-off-by: Xinyao Wang * fix bug Signed-off-by: Xinyao Wang * fix bug Signed-off-by: Xinyao Wang * fix bug Signed-off-by: Xinyao Wang * align max_tokens Signed-off-by: Xinyao Wang * fix bug Signed-off-by: Xinyao Wang * fix bug Signed-off-by: Xinyao Wang * debug Signed-off-by: Xinyao Wang * debug Signed-off-by: Xinyao Wang * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix langchain version bug Signed-off-by: Xinyao Wang * fix langchain version bug Signed-off-by: Xinyao Wang * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Xinyao Wang Co-authored-by: kevinintel Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: lvliang-intel --- comps/cores/mega/gateway.py | 48 +++++++++++++------ comps/cores/proto/api_protocol.py | 4 +- comps/cores/proto/docarray.py | 6 +++ .../llms/faq-generation/tgi/langchain/llm.py | 2 +- .../tgi/langchain/requirements.txt | 3 ++ comps/llms/summarization/tgi/langchain/llm.py | 2 +- comps/llms/text-generation/README.md | 6 +-- .../ollama/langchain/README.md | 2 +- .../text-generation/ollama/langchain/llm.py | 2 +- .../text-generation/predictionguard/README.md | 4 +- .../predictionguard/llm_predictionguard.py | 4 +- comps/llms/text-generation/ray_serve/llm.py | 2 +- 
comps/llms/text-generation/tgi/README.md | 20 +++++--- comps/llms/text-generation/tgi/llm.py | 4 +- .../text-generation/vllm/langchain/README.md | 10 ++-- .../text-generation/vllm/langchain/llm.py | 8 +++- .../text-generation/vllm/langchain/query.sh | 2 +- .../vllm/langchain/requirements.txt | 6 ++- .../vllm/llama_index/README.md | 2 +- .../text-generation/vllm/llama_index/llm.py | 2 +- .../text-generation/vllm/llama_index/query.sh | 2 +- comps/llms/text-generation/vllm/ray/README.md | 4 +- comps/llms/text-generation/vllm/ray/llm.py | 5 +- comps/llms/text-generation/vllm/ray/query.sh | 2 +- comps/llms/utils/lm-eval/self_hosted_hf.py | 2 +- ...st_llms_text-generation_predictionguard.sh | 2 +- tests/llms/test_llms_text-generation_tgi.sh | 2 +- ...-generation_vllm_langchain_on_intel_hpu.sh | 2 +- ...generation_vllm_llamaindex_on_intel_hpu.sh | 2 +- ...s_text-generation_vllm_ray_on_intel_hpu.sh | 2 +- 30 files changed, 107 insertions(+), 57 deletions(-) diff --git a/comps/cores/mega/gateway.py b/comps/cores/mega/gateway.py index 2636a0f83..115658e31 100644 --- a/comps/cores/mega/gateway.py +++ b/comps/cores/mega/gateway.py @@ -160,11 +160,13 @@ async def handle_request(self, request: Request): chat_request = ChatCompletionRequest.parse_obj(data) prompt = self._handle_message(chat_request.messages) parameters = LLMParams( - max_new_tokens=chat_request.max_tokens if chat_request.max_tokens else 1024, + max_tokens=chat_request.max_tokens if chat_request.max_tokens else 1024, top_k=chat_request.top_k if chat_request.top_k else 10, top_p=chat_request.top_p if chat_request.top_p else 0.95, temperature=chat_request.temperature if chat_request.temperature else 0.01, - repetition_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 1.03, + frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0, + presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0, + 
repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03, streaming=stream_opt, chat_template=chat_request.chat_template if chat_request.chat_template else None, ) @@ -214,11 +216,13 @@ async def handle_request(self, request: Request): chat_request = ChatCompletionRequest.parse_obj(data) prompt = self._handle_message(chat_request.messages) parameters = LLMParams( - max_new_tokens=chat_request.max_tokens if chat_request.max_tokens else 1024, + max_tokens=chat_request.max_tokens if chat_request.max_tokens else 1024, top_k=chat_request.top_k if chat_request.top_k else 10, top_p=chat_request.top_p if chat_request.top_p else 0.95, temperature=chat_request.temperature if chat_request.temperature else 0.01, - repetition_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 1.03, + frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0, + presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0, + repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03, streaming=stream_opt, ) result_dict, runtime_graph = await self.megaservice.schedule( @@ -350,11 +354,13 @@ async def handle_request(self, request: Request): chat_request = ChatCompletionRequest.parse_obj(data) prompt = self._handle_message(chat_request.messages) parameters = LLMParams( - max_new_tokens=chat_request.max_tokens if chat_request.max_tokens else 1024, + max_tokens=chat_request.max_tokens if chat_request.max_tokens else 1024, top_k=chat_request.top_k if chat_request.top_k else 10, top_p=chat_request.top_p if chat_request.top_p else 0.95, temperature=chat_request.temperature if chat_request.temperature else 0.01, - repetition_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 1.03, + frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0, + 
presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0, + repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03, streaming=stream_opt, ) result_dict, runtime_graph = await self.megaservice.schedule( @@ -399,11 +405,13 @@ async def handle_request(self, request: Request): chat_request = AudioChatCompletionRequest.parse_obj(data) parameters = LLMParams( # relatively lower max_tokens for audio conversation - max_new_tokens=chat_request.max_tokens if chat_request.max_tokens else 128, + max_tokens=chat_request.max_tokens if chat_request.max_tokens else 128, top_k=chat_request.top_k if chat_request.top_k else 10, top_p=chat_request.top_p if chat_request.top_p else 0.95, temperature=chat_request.temperature if chat_request.temperature else 0.01, - repetition_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 1.03, + frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0, + presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0, + repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03, streaming=False, # TODO add streaming LLM output as input to TTS ) result_dict, runtime_graph = await self.megaservice.schedule( @@ -428,11 +436,13 @@ async def handle_request(self, request: Request): chat_request = ChatCompletionRequest.parse_obj(data) prompt = self._handle_message(chat_request.messages) parameters = LLMParams( - max_new_tokens=chat_request.max_tokens if chat_request.max_tokens else 1024, + max_tokens=chat_request.max_tokens if chat_request.max_tokens else 1024, top_k=chat_request.top_k if chat_request.top_k else 10, top_p=chat_request.top_p if chat_request.top_p else 0.95, temperature=chat_request.temperature if chat_request.temperature else 0.01, - repetition_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 1.03, + 
frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0, + presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0, + repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03, streaming=stream_opt, ) result_dict, runtime_graph = await self.megaservice.schedule( @@ -472,11 +482,13 @@ async def handle_request(self, request: Request): chat_request = ChatCompletionRequest.parse_obj(data) prompt = self._handle_message(chat_request.messages) parameters = LLMParams( - max_new_tokens=chat_request.max_tokens if chat_request.max_tokens else 1024, + max_tokens=chat_request.max_tokens if chat_request.max_tokens else 1024, top_k=chat_request.top_k if chat_request.top_k else 10, top_p=chat_request.top_p if chat_request.top_p else 0.95, temperature=chat_request.temperature if chat_request.temperature else 0.01, - repetition_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 1.03, + frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0, + presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0, + repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03, streaming=stream_opt, ) result_dict, runtime_graph = await self.megaservice.schedule( @@ -520,7 +532,9 @@ async def handle_request(self, request: Request): top_k=chat_request.top_k if chat_request.top_k else 10, top_p=chat_request.top_p if chat_request.top_p else 0.95, temperature=chat_request.temperature if chat_request.temperature else 0.01, - repetition_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 1.03, + frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0, + presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0, + repetition_penalty=chat_request.repetition_penalty if 
chat_request.repetition_penalty else 1.03, streaming=stream_opt, ) result_dict, runtime_graph = await self.megaservice.schedule( @@ -569,7 +583,9 @@ async def handle_request(self, request: Request): top_k=chat_request.top_k if chat_request.top_k else 10, top_p=chat_request.top_p if chat_request.top_p else 0.95, temperature=chat_request.temperature if chat_request.temperature else 0.01, - repetition_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 1.03, + frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0, + presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0, + repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03, streaming=stream_opt, ) result_dict, runtime_graph = await self.megaservice.schedule( @@ -758,7 +774,9 @@ async def handle_request(self, request: Request): top_k=chat_request.top_k if chat_request.top_k else 10, top_p=chat_request.top_p if chat_request.top_p else 0.95, temperature=chat_request.temperature if chat_request.temperature else 0.01, - repetition_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 1.03, + frequency_penalty=chat_request.frequency_penalty if chat_request.frequency_penalty else 0.0, + presence_penalty=chat_request.presence_penalty if chat_request.presence_penalty else 0.0, + repetition_penalty=chat_request.repetition_penalty if chat_request.repetition_penalty else 1.03, streaming=stream_opt, chat_template=chat_request.chat_template if chat_request.chat_template else None, ) diff --git a/comps/cores/proto/api_protocol.py b/comps/cores/proto/api_protocol.py index 2596fb17a..d2fb0adb1 100644 --- a/comps/cores/proto/api_protocol.py +++ b/comps/cores/proto/api_protocol.py @@ -285,8 +285,9 @@ class AudioChatCompletionRequest(BaseModel): max_tokens: Optional[int] = 1024 stop: Optional[Union[str, List[str]]] = None stream: Optional[bool] = False - presence_penalty: 
Optional[float] = 1.03 + presence_penalty: Optional[float] = 0.0 frequency_penalty: Optional[float] = 0.0 + repetition_penalty: Optional[float] = 1.03 user: Optional[str] = None @@ -345,6 +346,7 @@ class CompletionRequest(BaseModel): echo: Optional[bool] = False presence_penalty: Optional[float] = 0.0 frequency_penalty: Optional[float] = 0.0 + repetition_penalty: Optional[float] = 1.03 user: Optional[str] = None use_beam_search: Optional[bool] = False best_of: Optional[int] = None diff --git a/comps/cores/proto/docarray.py b/comps/cores/proto/docarray.py index af62f5104..94edba694 100644 --- a/comps/cores/proto/docarray.py +++ b/comps/cores/proto/docarray.py @@ -145,11 +145,14 @@ class RerankedDoc(BaseDoc): class LLMParamsDoc(BaseDoc): model: Optional[str] = None # for openai and ollama query: str + max_tokens: int = 1024 max_new_tokens: int = 1024 top_k: int = 10 top_p: float = 0.95 typical_p: float = 0.95 temperature: float = 0.01 + frequency_penalty: float = 0.0 + presence_penalty: float = 0.0 repetition_penalty: float = 1.03 streaming: bool = True @@ -179,11 +182,14 @@ def chat_template_must_contain_variables(cls, v): class LLMParams(BaseDoc): + max_tokens: int = 1024 max_new_tokens: int = 1024 top_k: int = 10 top_p: float = 0.95 typical_p: float = 0.95 temperature: float = 0.01 + frequency_penalty: float = 0.0 + presence_penalty: float = 0.0 repetition_penalty: float = 1.03 streaming: bool = True diff --git a/comps/llms/faq-generation/tgi/langchain/llm.py b/comps/llms/faq-generation/tgi/langchain/llm.py index 0b4d70e85..2b6a96060 100644 --- a/comps/llms/faq-generation/tgi/langchain/llm.py +++ b/comps/llms/faq-generation/tgi/langchain/llm.py @@ -40,7 +40,7 @@ def llm_generate(input: LLMParamsDoc): llm_endpoint = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080") llm = HuggingFaceEndpoint( endpoint_url=llm_endpoint, - max_new_tokens=input.max_new_tokens, + max_new_tokens=input.max_tokens, top_k=input.top_k, top_p=input.top_p, typical_p=input.typical_p, diff 
--git a/comps/llms/faq-generation/tgi/langchain/requirements.txt b/comps/llms/faq-generation/tgi/langchain/requirements.txt index fa1548d7c..36257d393 100644 --- a/comps/llms/faq-generation/tgi/langchain/requirements.txt +++ b/comps/llms/faq-generation/tgi/langchain/requirements.txt @@ -2,7 +2,10 @@ docarray[full] fastapi huggingface_hub langchain +langchain-huggingface +langchain-openai langchain_community +langchainhub opentelemetry-api opentelemetry-exporter-otlp opentelemetry-sdk diff --git a/comps/llms/summarization/tgi/langchain/llm.py b/comps/llms/summarization/tgi/langchain/llm.py index 80c5d3924..40150ff81 100644 --- a/comps/llms/summarization/tgi/langchain/llm.py +++ b/comps/llms/summarization/tgi/langchain/llm.py @@ -39,7 +39,7 @@ def llm_generate(input: LLMParamsDoc): llm_endpoint = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080") llm = HuggingFaceEndpoint( endpoint_url=llm_endpoint, - max_new_tokens=input.max_new_tokens, + max_new_tokens=input.max_tokens, top_k=input.top_k, top_p=input.top_p, typical_p=input.typical_p, diff --git a/comps/llms/text-generation/README.md b/comps/llms/text-generation/README.md index 9c4af98c1..b31c571a5 100644 --- a/comps/llms/text-generation/README.md +++ b/comps/llms/text-generation/README.md @@ -374,7 +374,7 @@ curl http://${your_ip}:8008/v1/chat/completions \ ### 3.3 Consume LLM Service -You can set the following model parameters according to your actual needs, such as `max_new_tokens`, `streaming`. +You can set the following model parameters according to your actual needs, such as `max_tokens`, `streaming`. The `streaming` parameter determines the format of the data returned by the API. It will return text string with `streaming=false`, return text streaming flow with `streaming=true`. 
@@ -385,7 +385,7 @@ curl http://${your_ip}:9000/v1/chat/completions \ -H 'Content-Type: application/json' \ -d '{ "query":"What is Deep Learning?", - "max_new_tokens":17, + "max_tokens":17, "top_k":10, "top_p":0.95, "typical_p":0.95, @@ -401,7 +401,7 @@ curl http://${your_ip}:9000/v1/chat/completions \ -H 'Content-Type: application/json' \ -d '{ "query":"What is Deep Learning?", - "max_new_tokens":17, + "max_tokens":17, "top_k":10, "top_p":0.95, "typical_p":0.95, diff --git a/comps/llms/text-generation/ollama/langchain/README.md b/comps/llms/text-generation/ollama/langchain/README.md index ec9a293eb..65285bb11 100644 --- a/comps/llms/text-generation/ollama/langchain/README.md +++ b/comps/llms/text-generation/ollama/langchain/README.md @@ -70,5 +70,5 @@ docker run --network host -e http_proxy=$http_proxy -e https_proxy=$https_proxy ## Consume the Ollama Microservice ```bash -curl http://127.0.0.1:9000/v1/chat/completions -X POST -d '{"model": "llama3", "query":"What is Deep Learning?","max_new_tokens":32,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' -H 'Content-Type: application/json' +curl http://127.0.0.1:9000/v1/chat/completions -X POST -d '{"model": "llama3", "query":"What is Deep Learning?","max_tokens":32,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' -H 'Content-Type: application/json' ``` diff --git a/comps/llms/text-generation/ollama/langchain/llm.py b/comps/llms/text-generation/ollama/langchain/llm.py index 06d02461c..9830cca15 100644 --- a/comps/llms/text-generation/ollama/langchain/llm.py +++ b/comps/llms/text-generation/ollama/langchain/llm.py @@ -25,7 +25,7 @@ def llm_generate(input: LLMParamsDoc): ollama = Ollama( base_url=ollama_endpoint, model=input.model if input.model else model_name, - num_predict=input.max_new_tokens, + num_predict=input.max_tokens, top_k=input.top_k, top_p=input.top_p, temperature=input.temperature, diff --git 
a/comps/llms/text-generation/predictionguard/README.md b/comps/llms/text-generation/predictionguard/README.md index e506793d9..1045d361d 100644 --- a/comps/llms/text-generation/predictionguard/README.md +++ b/comps/llms/text-generation/predictionguard/README.md @@ -29,7 +29,7 @@ curl -X POST http://localhost:9000/v1/chat/completions \ -d '{ "model": "Hermes-2-Pro-Llama-3-8B", "query": "Tell me a joke.", - "max_new_tokens": 100, + "max_tokens": 100, "temperature": 0.7, "top_p": 0.9, "top_k": 50, @@ -45,7 +45,7 @@ curl -N -X POST http://localhost:9000/v1/chat/completions \ -d '{ "model": "Hermes-2-Pro-Llama-3-8B", "query": "Tell me a joke.", - "max_new_tokens": 100, + "max_tokens": 100, "temperature": 0.7, "top_p": 0.9, "top_k": 50, diff --git a/comps/llms/text-generation/predictionguard/llm_predictionguard.py b/comps/llms/text-generation/predictionguard/llm_predictionguard.py index ea70c11bc..d6c9398ce 100644 --- a/comps/llms/text-generation/predictionguard/llm_predictionguard.py +++ b/comps/llms/text-generation/predictionguard/llm_predictionguard.py @@ -49,7 +49,7 @@ async def stream_generator(): for res in client.chat.completions.create( model=input.model, messages=messages, - max_tokens=input.max_new_tokens, + max_tokens=input.max_tokens, temperature=input.temperature, top_p=input.top_p, top_k=input.top_k, @@ -69,7 +69,7 @@ async def stream_generator(): response = client.chat.completions.create( model=input.model, messages=messages, - max_tokens=input.max_new_tokens, + max_tokens=input.max_tokens, temperature=input.temperature, top_p=input.top_p, top_k=input.top_k, diff --git a/comps/llms/text-generation/ray_serve/llm.py b/comps/llms/text-generation/ray_serve/llm.py index c86025625..1203794cd 100644 --- a/comps/llms/text-generation/ray_serve/llm.py +++ b/comps/llms/text-generation/ray_serve/llm.py @@ -47,7 +47,7 @@ def llm_generate(input: LLMParamsDoc): openai_api_base=llm_endpoint + "/v1", model_name=llm_model, openai_api_key=os.getenv("OPENAI_API_KEY", 
"not_needed"), - max_tokens=input.max_new_tokens, + max_tokens=input.max_tokens, temperature=input.temperature, streaming=input.streaming, request_timeout=600, diff --git a/comps/llms/text-generation/tgi/README.md b/comps/llms/text-generation/tgi/README.md index c6843df4e..37428f3f1 100644 --- a/comps/llms/text-generation/tgi/README.md +++ b/comps/llms/text-generation/tgi/README.md @@ -88,7 +88,7 @@ curl http://${your_ip}:9000/v1/health_check\ ### 3.2 Consume LLM Service -You can set the following model parameters according to your actual needs, such as `max_new_tokens`, `streaming`. +You can set the following model parameters according to your actual needs, such as `max_tokens`, `streaming`. The `streaming` parameter determines the format of the data returned by the API. It will return text string with `streaming=false`, return text streaming flow with `streaming=true`. @@ -96,28 +96,34 @@ The `streaming` parameter determines the format of the data returned by the API. # non-streaming mode curl http://${your_ip}:9000/v1/chat/completions \ -X POST \ - -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":false}' \ + -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":false}' \ -H 'Content-Type: application/json' # streaming mode curl http://${your_ip}:9000/v1/chat/completions \ -X POST \ - -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \ + -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \ -H 'Content-Type: application/json' -# custom chat template +# consume with SearchedDoc curl http://${your_ip}:9000/v1/chat/completions \ -X POST \ - -d 
'{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true, "chat_template":"### You are a helpful, respectful and honest assistant to help the user with questions.\n### Context: {context}\n### Question: {question}\n### Answer:"}' \ -H 'Content-Type: application/json' +``` -# consume with SearchedDoc +For parameters in the above modes, please refer to [HuggingFace InferenceClient API](https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.text_generation) (except that 'max_new_tokens' is renamed to 'max_tokens') + +```bash +# custom chat template curl http://${your_ip}:9000/v1/chat/completions \ -X POST \ - -d '{"initial_query":"What is Deep Learning?","retrieved_docs":[{"text":"Deep Learning is a ..."},{"text":"Deep Learning is b ..."}]}' \ + -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"presence_penalty":1.03, "frequency_penalty":0.0, "streaming":true, "chat_template":"### You are a helpful, respectful and honest assistant to help the user with questions.\n### Context: {context}\n### Question: {question}\n### Answer:"}' \ -H 'Content-Type: application/json' ``` +For parameters in Chat mode, please refer to [OpenAI API](https://platform.openai.com/docs/api-reference/chat/create) + ### 4. 
Validated Model | Model | TGI | diff --git a/comps/llms/text-generation/tgi/llm.py b/comps/llms/text-generation/tgi/llm.py index d0ad2dbf1..d96518296 100644 --- a/comps/llms/text-generation/tgi/llm.py +++ b/comps/llms/text-generation/tgi/llm.py @@ -69,7 +69,7 @@ async def llm_generate(input: Union[LLMParamsDoc, ChatCompletionRequest, Searche text_generation = await llm.text_generation( prompt=prompt, stream=new_input.streaming, - max_new_tokens=new_input.max_new_tokens, + max_new_tokens=new_input.max_tokens, repetition_penalty=new_input.repetition_penalty, temperature=new_input.temperature, top_k=new_input.top_k, @@ -119,7 +119,7 @@ async def stream_generator(): text_generation = await llm.text_generation( prompt=prompt, stream=input.streaming, - max_new_tokens=input.max_new_tokens, + max_new_tokens=input.max_tokens, repetition_penalty=input.repetition_penalty, temperature=input.temperature, top_k=input.top_k, diff --git a/comps/llms/text-generation/vllm/langchain/README.md b/comps/llms/text-generation/vllm/langchain/README.md index 6db006535..6f41b9fe0 100644 --- a/comps/llms/text-generation/vllm/langchain/README.md +++ b/comps/llms/text-generation/vllm/langchain/README.md @@ -196,26 +196,26 @@ curl http://${your_ip}:9000/v1/health_check\ User can set the following model parameters according to needs: -- max_new_tokens: Total output token +- max_tokens: Total output token - streaming(true/false): return text response in streaming mode or non-streaming mode ```bash # 1. Non-streaming mode curl http://${your_ip}:9000/v1/chat/completions \ -X POST \ - -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_p":0.95,"temperature":0.01,"streaming":false}' \ + -d '{"query":"What is Deep Learning?","max_tokens":17,"top_p":1,"temperature":0.7,"frequency_penalty":0,"presence_penalty":0, "streaming":false}' \ -H 'Content-Type: application/json' # 2. 
Streaming mode curl http://${your_ip}:9000/v1/chat/completions \ -X POST \ - -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \ + -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \ -H 'Content-Type: application/json' # 3. Custom chat template with streaming mode curl http://${your_ip}:9000/v1/chat/completions \ -X POST \ - -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true, "chat_template":"### You are a helpful, respectful and honest assistant to help the user with questions.\n### Context: {context}\n### Question: {question}\n### Answer:"}' \ + -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true, "chat_template":"### You are a helpful, respectful and honest assistant to help the user with questions.\n### Context: {context}\n### Question: {question}\n### Answer:"}' \ -H 'Content-Type: application/json' 4. 
# Chat with SearchedDoc (Retrieval context) @@ -224,3 +224,5 @@ curl http://${your_ip}:9000/v1/chat/completions \ -d '{"initial_query":"What is Deep Learning?","retrieved_docs":[{"text":"Deep Learning is a ..."},{"text":"Deep Learning is b ..."}]}' \ -H 'Content-Type: application/json' ``` + +For parameter details, refer to [LangChain VLLMOpenAI API](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.vllm.VLLMOpenAI.html) diff --git a/comps/llms/text-generation/vllm/langchain/llm.py b/comps/llms/text-generation/vllm/langchain/llm.py index 9c6f1a047..fdb245320 100644 --- a/comps/llms/text-generation/vllm/langchain/llm.py +++ b/comps/llms/text-generation/vllm/langchain/llm.py @@ -79,10 +79,12 @@ def llm_generate(input: Union[LLMParamsDoc, ChatCompletionRequest, SearchedDoc]) llm = VLLMOpenAI( openai_api_key="EMPTY", openai_api_base=llm_endpoint + "/v1", - max_tokens=new_input.max_new_tokens, + max_tokens=new_input.max_tokens, model_name=model_name, top_p=new_input.top_p, temperature=new_input.temperature, + frequency_penalty=new_input.frequency_penalty, + presence_penalty=new_input.presence_penalty, streaming=new_input.streaming, ) @@ -132,10 +134,12 @@ def stream_generator(): llm = VLLMOpenAI( openai_api_key="EMPTY", openai_api_base=llm_endpoint + "/v1", - max_tokens=input.max_new_tokens, + max_tokens=input.max_tokens, model_name=model_name, top_p=input.top_p, temperature=input.temperature, + frequency_penalty=input.frequency_penalty, + presence_penalty=input.presence_penalty, streaming=input.streaming, ) diff --git a/comps/llms/text-generation/vllm/langchain/query.sh b/comps/llms/text-generation/vllm/langchain/query.sh index 5784b13a6..13b63511b 100644 --- a/comps/llms/text-generation/vllm/langchain/query.sh +++ b/comps/llms/text-generation/vllm/langchain/query.sh @@ -15,5 +15,5 @@ curl http://${your_ip}:8008/v1/completions \ ##query microservice curl http://${your_ip}:9000/v1/chat/completions \ -X POST \ -d '{"query":"What is Deep 
Learning?","max_new_tokens":17,"top_p":0.95,"temperature":0.01,"streaming":false}' \ + -d '{"query":"What is Deep Learning?","max_tokens":17,"top_p":1,"temperature":0.7,"frequency_penalty":0,"presence_penalty":0, "streaming":false}' \ -H 'Content-Type: application/json' diff --git a/comps/llms/text-generation/vllm/langchain/requirements.txt b/comps/llms/text-generation/vllm/langchain/requirements.txt index d096a69ca..9bea1261f 100644 --- a/comps/llms/text-generation/vllm/langchain/requirements.txt +++ b/comps/llms/text-generation/vllm/langchain/requirements.txt @@ -1,7 +1,11 @@ docarray[full] fastapi huggingface_hub -langchain==0.1.16 +langchain #==0.1.12 +langchain-huggingface +langchain-openai +langchain_community +langchainhub opentelemetry-api opentelemetry-exporter-otlp opentelemetry-sdk diff --git a/comps/llms/text-generation/vllm/llama_index/README.md b/comps/llms/text-generation/vllm/llama_index/README.md index 4bd51c812..bf30abdf7 100644 --- a/comps/llms/text-generation/vllm/llama_index/README.md +++ b/comps/llms/text-generation/vllm/llama_index/README.md @@ -184,6 +184,6 @@ bash launch_microservice.sh ```bash curl http://${your_ip}:9000/v1/chat/completions \ -X POST \ - -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_p":0.95,"temperature":0.01,"streaming":false}' \ + -d '{"query":"What is Deep Learning?","max_tokens":17,"top_p":0.95,"temperature":0.01,"streaming":false}' \ -H 'Content-Type: application/json' ``` diff --git a/comps/llms/text-generation/vllm/llama_index/llm.py b/comps/llms/text-generation/vllm/llama_index/llm.py index b66348bf3..4c3957bae 100644 --- a/comps/llms/text-generation/vllm/llama_index/llm.py +++ b/comps/llms/text-generation/vllm/llama_index/llm.py @@ -47,7 +47,7 @@ def llm_generate(input: LLMParamsDoc): llm = OpenAILike( api_key="fake", api_base=llm_endpoint + "/v1", - max_tokens=input.max_new_tokens, + max_tokens=input.max_tokens, model=model_name, top_p=input.top_p, temperature=input.temperature, diff --git 
a/comps/llms/text-generation/vllm/llama_index/query.sh b/comps/llms/text-generation/vllm/llama_index/query.sh index 5784b13a6..68beefc4d 100644 --- a/comps/llms/text-generation/vllm/llama_index/query.sh +++ b/comps/llms/text-generation/vllm/llama_index/query.sh @@ -15,5 +15,5 @@ curl http://${your_ip}:8008/v1/completions \ ##query microservice curl http://${your_ip}:9000/v1/chat/completions \ -X POST \ - -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_p":0.95,"temperature":0.01,"streaming":false}' \ + -d '{"query":"What is Deep Learning?","max_tokens":17,"top_p":0.95,"temperature":0.01,"streaming":false}' \ -H 'Content-Type: application/json' diff --git a/comps/llms/text-generation/vllm/ray/README.md b/comps/llms/text-generation/vllm/ray/README.md index 0b9386d4f..f08aa8d24 100644 --- a/comps/llms/text-generation/vllm/ray/README.md +++ b/comps/llms/text-generation/vllm/ray/README.md @@ -82,6 +82,8 @@ bash ./launch_microservice.sh ```bash curl http://${your_ip}:9000/v1/chat/completions \ -X POST \ - -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":false}' \ + -d '{"query":"What is Deep Learning?","max_tokens":17,"top_p":1,"temperature":0.7,"frequency_penalty":0,"presence_penalty":0, "streaming":false}' \ -H 'Content-Type: application/json' ``` + +For parameter details, refer to [LangChain ChatOpenAI API](https://python.langchain.com/v0.2/api_reference/openai/chat_models/langchain_openai.chat_models.base.ChatOpenAI.html) diff --git a/comps/llms/text-generation/vllm/ray/llm.py b/comps/llms/text-generation/vllm/ray/llm.py index e7efe6527..b11b45fb7 100644 --- a/comps/llms/text-generation/vllm/ray/llm.py +++ b/comps/llms/text-generation/vllm/ray/llm.py @@ -39,8 +39,11 @@ def llm_generate(input: LLMParamsDoc): openai_api_base=llm_endpoint + "/v1", model_name=llm_model, openai_api_key=os.getenv("OPENAI_API_KEY", "not_needed"), - 
max_tokens=input.max_new_tokens, + max_tokens=input.max_tokens, + top_p=input.top_p, temperature=input.temperature, + frequency_penalty=input.frequency_penalty, + presence_penalty=input.presence_penalty, streaming=input.streaming, request_timeout=600, ) diff --git a/comps/llms/text-generation/vllm/ray/query.sh b/comps/llms/text-generation/vllm/ray/query.sh index 3555751d1..87c3ce4f6 100644 --- a/comps/llms/text-generation/vllm/ray/query.sh +++ b/comps/llms/text-generation/vllm/ray/query.sh @@ -11,5 +11,5 @@ curl http://${your_ip}:8006/v1/chat/completions \ ##query microservice curl http://${your_ip}:9000/v1/chat/completions \ -X POST \ - -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":false}' \ + -d '{"query":"What is Deep Learning?","max_tokens":17,"top_p":1,"temperature":0.7,"frequency_penalty":0,"presence_penalty":0, "streaming":false}' \ -H 'Content-Type: application/json' diff --git a/comps/llms/utils/lm-eval/self_hosted_hf.py b/comps/llms/utils/lm-eval/self_hosted_hf.py index 441605be0..5ea4accc0 100644 --- a/comps/llms/utils/lm-eval/self_hosted_hf.py +++ b/comps/llms/utils/lm-eval/self_hosted_hf.py @@ -22,7 +22,7 @@ class LLMCompletionDoc(BaseDoc): batched_inputs: List logprobs: int = 10 - max_tokens: int = 0 + max_new_tokens: int = 0 temperature: float = 0.0 diff --git a/tests/llms/test_llms_text-generation_predictionguard.sh b/tests/llms/test_llms_text-generation_predictionguard.sh index 39a66bcf4..0faad3ae4 100644 --- a/tests/llms/test_llms_text-generation_predictionguard.sh +++ b/tests/llms/test_llms_text-generation_predictionguard.sh @@ -36,7 +36,7 @@ function validate_microservice() { llm_service_port=9000 result=$(http_proxy="" curl http://${ip_address}:${llm_service_port}/v1/chat/completions \ -X POST \ - -d '{"model": "Hermes-2-Pro-Llama-3-8B", "query": "What is AI?", "streaming": false, "max_new_tokens": 100, "temperature": 0.7, "top_p": 1.0, 
"top_k": 50}' \ + -d '{"model": "Hermes-2-Pro-Llama-3-8B", "query": "What is AI?", "streaming": false, "max_tokens": 100, "temperature": 0.7, "top_p": 1.0, "top_k": 50}' \ -H 'Content-Type: application/json') if [[ $result == *"text"* ]]; then diff --git a/tests/llms/test_llms_text-generation_tgi.sh b/tests/llms/test_llms_text-generation_tgi.sh index db01b60e2..383535efc 100644 --- a/tests/llms/test_llms_text-generation_tgi.sh +++ b/tests/llms/test_llms_text-generation_tgi.sh @@ -48,7 +48,7 @@ function validate_microservice() { llm_port=5005 result=$(http_proxy="" curl http://${ip_address}:${llm_port}/v1/chat/completions \ -X POST \ - -d '{"query":"What is Deep Learning?", "max_new_tokens": 128}' \ + -d '{"query":"What is Deep Learning?", "max_tokens": 128}' \ -H 'Content-Type: application/json') if [[ $result == *"DONE"* ]]; then echo "Result correct." diff --git a/tests/llms/test_llms_text-generation_vllm_langchain_on_intel_hpu.sh b/tests/llms/test_llms_text-generation_vllm_langchain_on_intel_hpu.sh index 291e729a5..6ecf5d2d6 100644 --- a/tests/llms/test_llms_text-generation_vllm_langchain_on_intel_hpu.sh +++ b/tests/llms/test_llms_text-generation_vllm_langchain_on_intel_hpu.sh @@ -92,7 +92,7 @@ function validate_microservice() { fi result=$(http_proxy="" curl http://${ip_address}:5030/v1/chat/completions \ -X POST \ - -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_p":0.95,"temperature":0.01,"streaming":false}' \ + -d '{"query":"What is Deep Learning?","max_tokens":17,"top_p":1,"temperature":0.7,"frequency_penalty":0,"presence_penalty":0, "streaming":false}' \ -H 'Content-Type: application/json') if [[ $result == *"text"* ]]; then echo "Result correct." 
diff --git a/tests/llms/test_llms_text-generation_vllm_llamaindex_on_intel_hpu.sh b/tests/llms/test_llms_text-generation_vllm_llamaindex_on_intel_hpu.sh index 43fa4b8dc..ca67a00f4 100644 --- a/tests/llms/test_llms_text-generation_vllm_llamaindex_on_intel_hpu.sh +++ b/tests/llms/test_llms_text-generation_vllm_llamaindex_on_intel_hpu.sh @@ -92,7 +92,7 @@ function validate_microservice() { fi result=$(http_proxy="" curl http://${ip_address}:5030/v1/chat/completions \ -X POST \ - -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_p":0.95,"temperature":0.01,"streaming":false}' \ + -d '{"query":"What is Deep Learning?","max_tokens":17,"top_p":0.95,"temperature":0.01,"streaming":false}' \ -H 'Content-Type: application/json') if [[ $result == *"text"* ]]; then echo "Result correct." diff --git a/tests/llms/test_llms_text-generation_vllm_ray_on_intel_hpu.sh b/tests/llms/test_llms_text-generation_vllm_ray_on_intel_hpu.sh index e20c1e537..8f9dbec64 100644 --- a/tests/llms/test_llms_text-generation_vllm_ray_on_intel_hpu.sh +++ b/tests/llms/test_llms_text-generation_vllm_ray_on_intel_hpu.sh @@ -91,7 +91,7 @@ function validate_microservice() { service_port=5032 result=$(http_proxy="" curl http://${ip_address}:$service_port/v1/chat/completions \ -X POST \ - -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":false}' \ + -d '{"query":"What is Deep Learning?","max_tokens":17,"top_p":1,"temperature":0.7,"frequency_penalty":0,"presence_penalty":0, "streaming":false}' \ -H 'Content-Type: application/json') if [[ $result == *"text"* ]]; then echo "Result correct."