Fix LLM special token issue (#895)
* Fix LLM special token issue

Signed-off-by: lvliang-intel <[email protected]>

* update code

Signed-off-by: lvliang-intel <[email protected]>

* update logic

Signed-off-by: lvliang-intel <[email protected]>

* update vllm llm

Signed-off-by: lvliang-intel <[email protected]>

---------

Signed-off-by: lvliang-intel <[email protected]>
Co-authored-by: ZePan110 <[email protected]>
lvliang-intel and ZePan110 authored Nov 14, 2024
1 parent 32bcde4 commit 517a5b0
Showing 3 changed files with 30 additions and 23 deletions.
26 changes: 15 additions & 11 deletions comps/llms/text-generation/tgi/llm.py
@@ -112,11 +112,12 @@ async def stream_generator():
     chat_response = ""
     async for text in text_generation:
         stream_gen_time.append(time.time() - start)
-        chat_response += text
-        chunk_repr = repr(text.encode("utf-8"))
-        if logflag:
-            logger.info(f"[ SearchedDoc ] chunk:{chunk_repr}")
-        yield f"data: {chunk_repr}\n\n"
+        if text not in ["<|im_end|>", "<|endoftext|>"]:
+            chat_response += text
+            chunk_repr = repr(text.encode("utf-8"))
+            if logflag:
+                logger.info(f"[ SearchedDoc ] chunk:{chunk_repr}")
+            yield f"data: {chunk_repr}\n\n"
     if logflag:
         logger.info(f"[ SearchedDoc ] stream response: {chat_response}")
     statistics_dict["opea_service@llm_tgi"].append_latency(stream_gen_time[-1], stream_gen_time[0])
@@ -162,11 +163,12 @@ async def stream_generator():
     chat_response = ""
     async for text in text_generation:
         stream_gen_time.append(time.time() - start)
-        chat_response += text
-        chunk_repr = repr(text.encode("utf-8"))
-        if logflag:
-            logger.info(f"[ LLMParamsDoc ] chunk:{chunk_repr}")
-        yield f"data: {chunk_repr}\n\n"
+        if text not in ["<|im_end|>", "<|endoftext|>"]:
+            chat_response += text
+            chunk_repr = repr(text.encode("utf-8"))
+            if logflag:
+                logger.info(f"[ LLMParamsDoc ] chunk:{chunk_repr}")
+            yield f"data: {chunk_repr}\n\n"
     if logflag:
         logger.info(f"[ LLMParamsDoc ] stream response: {chat_response}")
     statistics_dict["opea_service@llm_tgi"].append_latency(stream_gen_time[-1], stream_gen_time[0])
@@ -271,7 +273,9 @@ def stream_generator():
     for c in chat_completion:
         if logflag:
             logger.info(c)
-        yield f"data: {c.model_dump_json()}\n\n"
+        chunk = c.model_dump_json()
+        if chunk not in ["<|im_end|>", "<|endoftext|>"]:
+            yield f"data: {chunk}\n\n"
     yield "data: [DONE]\n\n"

 return StreamingResponse(stream_generator(), media_type="text/event-stream")
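
For illustration, here is a minimal runnable sketch of the filtering pattern these hunks apply: chunks that are exactly a special token are skipped before being accumulated or emitted. SPECIAL_TOKENS, fake_stream, and main are stand-ins invented for the example, not code from this repository.

import asyncio

SPECIAL_TOKENS = ["<|im_end|>", "<|endoftext|>"]

async def fake_stream():
    # Illustrative stand-in for the TGI async text stream.
    for text in ["Hello", ", world", "<|endoftext|>"]:
        yield text

async def stream_generator():
    chat_response = ""
    async for text in fake_stream():
        if text not in SPECIAL_TOKENS:  # drop chunks that are exactly a special token
            chat_response += text
            yield f"data: {repr(text.encode('utf-8'))}\n\n"
    yield "data: [DONE]\n\n"

async def main():
    async for chunk in stream_generator():
        print(chunk, end="")

asyncio.run(main())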
22 changes: 12 additions & 10 deletions comps/llms/text-generation/vllm/langchain/llm.py
@@ -124,11 +124,12 @@ async def llm_generate(input: Union[LLMParamsDoc, ChatCompletionRequest, SearchedDoc]):
 async def stream_generator():
     chat_response = ""
     async for text in llm.astream(new_input.query, **parameters):
-        chat_response += text
-        chunk_repr = repr(text.encode("utf-8"))
-        if logflag:
-            logger.info(f"[ SearchedDoc ] chunk: {chunk_repr}")
-        yield f"data: {chunk_repr}\n\n"
+        if text not in ["<|im_end|>", "<|endoftext|>"]:
+            chat_response += text
+            chunk_repr = repr(text.encode("utf-8"))
+            if logflag:
+                logger.info(f"[ SearchedDoc ] chunk: {chunk_repr}")
+            yield f"data: {chunk_repr}\n\n"
     if logflag:
         logger.info(f"[ SearchedDoc ] stream response: {chat_response}")
     yield "data: [DONE]\n\n"
@@ -175,11 +176,12 @@ async def stream_generator():
 async def stream_generator():
     chat_response = ""
     async for text in llm.astream(prompt, **parameters):
-        chat_response += text
-        chunk_repr = repr(text.encode("utf-8"))
-        if logflag:
-            logger.info(f"[ LLMParamsDoc ] chunk: {chunk_repr}")
-        yield f"data: {chunk_repr}\n\n"
+        if text not in ["<|im_end|>", "<|endoftext|>"]:
+            chat_response += text
+            chunk_repr = repr(text.encode("utf-8"))
+            if logflag:
+                logger.info(f"[ LLMParamsDoc ] chunk: {chunk_repr}")
+            yield f"data: {chunk_repr}\n\n"
     if logflag:
         logger.info(f"[ LLMParamsDoc ] stream response: {chat_response}")
     yield "data: [DONE]\n\n"
5 changes: 3 additions & 2 deletions comps/llms/text-generation/vllm/llama_index/llm.py
@@ -58,8 +58,9 @@ async def llm_generate(input: LLMParamsDoc):

 async def stream_generator():
     async for text in llm.astream_complete(input.query):
-        output = text.text
-        yield f"data: {output}\n\n"
+        if text.text not in ["<|im_end|>", "<|endoftext|>"]:
+            output = text.text
+            yield f"data: {output}\n\n"
     if logflag:
         logger.info(f"[llm - chat_stream] stream response: {output}")
     yield "data: [DONE]\n\n"
