Define embedding/ranking/llm request/response format #289

Status: Draft. Wants to merge 26 commits into base: main.

Commits (26)
2a5f0f1
define embedding/ranking/llm request/response by referring to openai …
lkk12014402 Jul 10, 2024
7e49cf4
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 10, 2024
e3ce9b9
update embedding request/response.
lkk12014402 Jul 11, 2024
8c46fa9
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 11, 2024
7e369c3
update chat completion/embedding request/response format.
lkk12014402 Jul 11, 2024
3c1ff7c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 11, 2024
db3acff
format retrival request/response and output retrieval metadata.
lkk12014402 Jul 12, 2024
6d9ed47
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 12, 2024
79258e7
format retrival request/response and output retrieval metadata.
lkk12014402 Jul 12, 2024
9b5c798
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 12, 2024
00ac00a
format reranking request/response and output rerank metadata.
lkk12014402 Jul 12, 2024
3030318
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 12, 2024
671efa5
format llm microservice with openai format.
lkk12014402 Jul 15, 2024
60dee7b
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 15, 2024
0440948
add openai dependency.
lkk12014402 Jul 15, 2024
bf5650c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 15, 2024
9a35261
Update api_protocol.py
lkk12014402 Jul 15, 2024
bb86966
update `ChatCompletionRequest` params.
lkk12014402 Jul 17, 2024
cfc8389
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 17, 2024
6747c34
format llm microservice with openai format.
lkk12014402 Jul 17, 2024
88a5cf4
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 17, 2024
e830e4d
format llm microservice with openai format.
lkk12014402 Jul 17, 2024
745f2da
Merge branch 'enhance_openai_format' of https://github.com/opea-proje…
lkk12014402 Jul 17, 2024
73257b2
format llm microservice with openai format.
lkk12014402 Jul 17, 2024
0d19e2b
format llm microservice with openai format.
lkk12014402 Jul 17, 2024
e20c353
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 17, 2024
276 changes: 228 additions & 48 deletions comps/cores/proto/api_protocol.py
@@ -10,6 +10,8 @@
from fastapi.responses import JSONResponse
from pydantic import BaseModel, Field

from comps.utils import random_uuid


class ServiceCard(BaseModel):
object: str = "service"
@@ -30,24 +32,128 @@ class UsageInfo(BaseModel):
completion_tokens: Optional[int] = 0


class ResponseFormat(BaseModel):
# type must be "json_object" or "text"
type: Literal["text", "json_object"]


class StreamOptions(BaseModel):
# reference: https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/openai/protocol.py#L105
include_usage: Optional[bool]


class FunctionDefinition(BaseModel):
name: str
description: Optional[str] = None
parameters: Optional[Dict[str, Any]] = None


class ChatCompletionToolsParam(BaseModel):
type: Literal["function"] = "function"
function: FunctionDefinition


class ChatCompletionNamedFunction(BaseModel):
name: str


class ChatCompletionNamedToolChoiceParam(BaseModel):
function: ChatCompletionNamedFunction
type: Literal["function"] = "function"


class ChatCompletionRequest(BaseModel):
# Ordered by official OpenAI API documentation
# https://platform.openai.com/docs/api-reference/chat/create
messages: Union[
str,
List[Dict[str, str]],
List[Dict[str, Union[str, List[Dict[str, Union[str, Dict[str, str]]]]]]],
]
model: Optional[str] = "Intel/neural-chat-7b-v3-3"
temperature: Optional[float] = 0.01
top_p: Optional[float] = 0.95
top_k: Optional[int] = 10
frequency_penalty: Optional[float] = 0.0
logit_bias: Optional[Dict[str, float]] = None
logprobs: Optional[bool] = False
top_logprobs: Optional[int] = 0
max_tokens: Optional[int] = 16 # use https://platform.openai.com/docs/api-reference/completions/create
n: Optional[int] = 1
max_tokens: Optional[int] = 1024
stop: Optional[Union[str, List[str]]] = None
presence_penalty: Optional[float] = 0.0
response_format: Optional[ResponseFormat] = None
seed: Optional[int] = None
service_tier: Optional[str] = None
stop: Union[str, List[str], None] = Field(default_factory=list)
stream: Optional[bool] = False
presence_penalty: Optional[float] = 1.03
frequency_penalty: Optional[float] = 0.0
stream_options: Optional[StreamOptions] = None
temperature: Optional[float] = 1.0 # vllm default 0.7
top_p: Optional[float] = None  # openai default is 1.0, but tgi requires 0.0 < `top_p` < 1.0, so default to None
tools: Optional[List[ChatCompletionToolsParam]] = None
tool_choice: Optional[Union[Literal["none"], ChatCompletionNamedToolChoiceParam]] = "none"
parallel_tool_calls: Optional[bool] = True
user: Optional[str] = None

# vllm reference: https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/openai/protocol.py#L130
top_k: Optional[int] = -1
repetition_penalty: Optional[float] = 1.0

# doc: begin-chat-completion-extra-params
echo: Optional[bool] = Field(
default=False,
description=(
"If true, the new message will be prepended with the last message " "if they belong to the same role."
),
)
add_generation_prompt: Optional[bool] = Field(
default=True,
description=(
"If true, the generation prompt will be added to the chat template. "
"This is a parameter used by chat template in tokenizer config of the "
"model."
),
)
add_special_tokens: Optional[bool] = Field(
default=False,
description=(
"If true, special tokens (e.g. BOS) will be added to the prompt "
"on top of what is added by the chat template. "
"For most models, the chat template takes care of adding the "
"special tokens so this should be set to False (as is the "
"default)."
),
)
documents: Optional[List[Dict[str, str]]] = Field(
default=None,
description=(
"A list of dicts representing documents that will be accessible to "
"the model if it is performing RAG (retrieval-augmented generation)."
" If the template does not support RAG, this argument will have no "
"effect. We recommend that each document should be a dict containing "
'"title" and "text" keys.'
),
)
chat_template: Optional[str] = Field(
default=None,
description=(
"A Jinja template to use for this conversion. "
"If this is not passed, the model's default chat template will be "
"used instead."
),
)
chat_template_kwargs: Optional[Dict[str, Any]] = Field(
default=None,
description=("Additional kwargs to pass to the template renderer. " "Will be accessible by the chat template."),
)
# doc: end-chat-completion-extra-params

# tgi reference: https://huggingface.github.io/text-generation-inference/#/Text%20Generation%20Inference/generate
# some tgi parameters are kept; default values match
# https://github.com/huggingface/text-generation-inference/blob/main/router/src/lib.rs#L190
# max_new_tokens: Optional[int] = 100 # Priority use openai
# top_k: Optional[int] = None
# top_p: Optional[float] = None # Priority use openai
typical_p: Optional[float] = None
# repetition_penalty: Optional[float] = None
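
To make the new OpenAI-aligned schema concrete, a minimal client-side sketch of building a request against the ChatCompletionRequest model above might look as follows (illustrative only; the model id is an assumption, not part of this PR):

# Illustrative sketch (not part of this diff): constructing an OpenAI-style
# chat request with the ChatCompletionRequest model defined above.
from comps.cores.proto.api_protocol import ChatCompletionRequest

request = ChatCompletionRequest(
    model="Intel/neural-chat-7b-v3-3",  # assumed model id
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is retrieval-augmented generation?"},
    ],
    max_tokens=256,
    temperature=0.7,
    stream=False,
)
# Pydantic v2 validates the fields; the dump is the JSON body an
# OpenAI-compatible /v1/chat/completions endpoint would receive.
payload = request.model_dump(exclude_none=True)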


class AudioChatCompletionRequest(BaseModel):
audio: str
@@ -71,78 +177,152 @@ class AudioChatCompletionRequest(BaseModel):
user: Optional[str] = None


class TokenCheckRequestItem(BaseModel):
model: str
prompt: str
max_tokens: int


class TokenCheckRequest(BaseModel):
prompts: List[TokenCheckRequestItem]


class TokenCheckResponseItem(BaseModel):
fits: bool
tokenCount: int
contextLength: int


class TokenCheckResponse(BaseModel):
prompts: List[TokenCheckResponseItem]


class EmbeddingRequest(BaseModel):
# Ordered by official OpenAI API documentation
# https://platform.openai.com/docs/api-reference/embeddings
model: Optional[str] = None
input: Union[List[int], List[List[int]], str, List[str]]
encoding_format: Optional[str] = Field("float", pattern="^(float|base64)$")
dimensions: Optional[int] = 768  # truncate the embedding to the first `dimensions` elements
user: Optional[str] = None


class EmbeddingResponseData(BaseModel):
index: int
object: str = "embedding"
embedding: Union[List[float], str]


class EmbeddingResponse(BaseModel):
object: str = "list"
model: Optional[str] = None
data: List[EmbeddingResponseData]
usage: Optional[UsageInfo] = None
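
A quick sketch of the embedding round trip defined above; the model id and the placeholder vector are invented for illustration:

# Illustrative sketch (not part of this diff): exercising the embedding
# request/response models defined above with a placeholder vector.
from comps.cores.proto.api_protocol import (
    EmbeddingRequest,
    EmbeddingResponse,
    EmbeddingResponseData,
)

req = EmbeddingRequest(input="What is OPEA?", model="BAAI/bge-base-en-v1.5")  # assumed model id
fake_vector = [0.0] * req.dimensions  # a real service would call the embedding model here
resp = EmbeddingResponse(data=[EmbeddingResponseData(index=0, embedding=fake_vector)])
assert len(resp.data[0].embedding) == 768  # dimensions defaults to 768 in this schema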


class RetrievalRequest(BaseModel):
embedding: Union[EmbeddingResponse, List[float]] = None
text: Optional[str] = None  # some search types (e.g. "mmr") may need the original query text
search_type: str = "similarity"
k: int = 4
distance_threshold: Optional[float] = None
fetch_k: int = 20
lambda_mult: float = 0.5
score_threshold: float = 0.2


class RetrievalResponseData(BaseModel):
text: str
metadata: Optional[Dict[str, Any]] = None


class RetrievalResponse(BaseModel):
retrieved_docs: List[RetrievalResponseData]


class RerankingRequest(BaseModel):
text: str
retrieved_docs: Union[List[RetrievalResponseData], List[Dict[str, Any]]]
top_n: int = 1


class RerankingResponseData(BaseModel):
text: str
score: Optional[float] = 0.0


class RerankingResponse(BaseModel):
reranked_docs: List[RerankingResponseData]
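
The retrieval and reranking models above are meant to chain together; a hedged sketch with invented documents and scores:

# Illustrative sketch (not part of this diff): chaining the retrieval and
# reranking models defined above.
from comps.cores.proto.api_protocol import (
    RetrievalResponse,
    RetrievalResponseData,
    RerankingRequest,
    RerankingResponse,
    RerankingResponseData,
)

retrieval = RetrievalResponse(
    retrieved_docs=[
        RetrievalResponseData(text="OPEA composes GenAI microservices.", metadata={"source": "doc_a"}),
        RetrievalResponseData(text="Rerankers reorder retrieved passages.", metadata={"source": "doc_b"}),
    ]
)

rerank_request = RerankingRequest(
    text="What does a reranker do?",
    retrieved_docs=retrieval.retrieved_docs,
    top_n=1,
)

rerank_response = RerankingResponse(
    reranked_docs=[RerankingResponseData(text=rerank_request.retrieved_docs[1].text, score=0.92)]
)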


class ChatMessage(BaseModel):
role: str
content: str


class ChatCompletionLogProb(BaseModel):
token: str
logprob: float = -9999.0
bytes: Optional[List[int]] = None


class ChatCompletionLogProbsContent(ChatCompletionLogProb):
top_logprobs: List[ChatCompletionLogProb] = Field(default_factory=list)


class ChatCompletionLogProbs(BaseModel):
content: Optional[List[ChatCompletionLogProbsContent]] = None


class ChatCompletionResponseChoice(BaseModel):
index: int
message: ChatMessage
finish_reason: Optional[Literal["stop", "length"]] = None
logprobs: Optional[ChatCompletionLogProbs] = None
finish_reason: Optional[str] = None
stop_reason: Optional[Union[int, str]] = None


class ChatCompletionResponse(BaseModel):
id: str = Field(default_factory=lambda: f"chatcmpl-{shortuuid.random()}")
object: str = "chat.completion"
id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
object: Literal["chat.completion"] = "chat.completion"
created: int = Field(default_factory=lambda: int(time.time()))
model: str
choices: List[ChatCompletionResponseChoice]
usage: UsageInfo
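
A sketch of assembling a non-streaming response from the models above; the text, token counts, and model id are invented (and this assumes UsageInfo exposes prompt_tokens/completion_tokens/total_tokens):

# Illustrative sketch (not part of this diff): building a chat.completion
# response object with the models defined above.
from comps.cores.proto.api_protocol import (
    ChatCompletionResponse,
    ChatCompletionResponseChoice,
    ChatMessage,
    UsageInfo,
)

response = ChatCompletionResponse(
    model="Intel/neural-chat-7b-v3-3",  # assumed model id
    choices=[
        ChatCompletionResponseChoice(
            index=0,
            message=ChatMessage(role="assistant", content="RAG combines retrieval with generation."),
            finish_reason="stop",
        )
    ],
    usage=UsageInfo(prompt_tokens=24, completion_tokens=19, total_tokens=43),  # invented counts
)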


class FunctionCall(BaseModel):
name: str
arguments: str


class ToolCall(BaseModel):
id: str = Field(default_factory=lambda: f"chatcmpl-tool-{random_uuid()}")
type: Literal["function"] = "function"
function: FunctionCall


class DeltaMessage(BaseModel):
role: Optional[str] = None
content: Optional[str] = None
tool_calls: List[ToolCall] = Field(default_factory=list)


class ChatCompletionResponseStreamChoice(BaseModel):
index: int
delta: DeltaMessage
finish_reason: Optional[Literal["stop", "length"]] = None
logprobs: Optional[ChatCompletionLogProbs] = None
finish_reason: Optional[str] = None
stop_reason: Optional[Union[int, str]] = None


class ChatCompletionStreamResponse(BaseModel):
id: str = Field(default_factory=lambda: f"chatcmpl-{shortuuid.random()}")
object: str = "chat.completion.chunk"
id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
object: Literal["chat.completion.chunk"] = "chat.completion.chunk"
created: int = Field(default_factory=lambda: int(time.time()))
model: str
choices: List[ChatCompletionResponseStreamChoice]
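
For streaming, each chunk can be shaped with the models above and emitted as a server-sent-events data line; a sketch with an invented token and model id:

# Illustrative sketch (not part of this diff): one streaming chunk serialized
# as an SSE data line.
from comps.cores.proto.api_protocol import (
    ChatCompletionResponseStreamChoice,
    ChatCompletionStreamResponse,
    DeltaMessage,
)

chunk = ChatCompletionStreamResponse(
    model="Intel/neural-chat-7b-v3-3",  # assumed model id
    choices=[
        ChatCompletionResponseStreamChoice(
            index=0,
            delta=DeltaMessage(role="assistant", content="Hello"),
            finish_reason=None,
        )
    ],
)
sse_line = f"data: {chunk.model_dump_json()}\n\n"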


class TokenCheckRequestItem(BaseModel):
model: str
prompt: str
max_tokens: int


class TokenCheckRequest(BaseModel):
prompts: List[TokenCheckRequestItem]


class TokenCheckResponseItem(BaseModel):
fits: bool
tokenCount: int
contextLength: int


class TokenCheckResponse(BaseModel):
prompts: List[TokenCheckResponseItem]


class EmbeddingsRequest(BaseModel):
model: Optional[str] = None
engine: Optional[str] = None
input: Union[str, List[Any]]
user: Optional[str] = None
encoding_format: Optional[str] = None


class EmbeddingsResponse(BaseModel):
object: str = "list"
data: List[Dict[str, Any]]
model: str
usage: UsageInfo
usage: Optional[UsageInfo] = Field(default=None)


class CompletionRequest(BaseModel):
(remaining diff lines collapsed)
25 changes: 9 additions & 16 deletions comps/embeddings/langchain/embedding_tei.py
@@ -7,15 +7,8 @@
from langchain_community.embeddings import HuggingFaceHubEmbeddings
from langsmith import traceable

from comps import (
EmbedDoc768,
ServiceType,
TextDoc,
opea_microservices,
register_microservice,
register_statistics,
statistics_dict,
)
from comps import ServiceType, opea_microservices, register_microservice, register_statistics, statistics_dict
from comps.cores.proto.api_protocol import EmbeddingRequest, EmbeddingResponse, EmbeddingResponseData


@register_microservice(
@@ -24,18 +17,18 @@
endpoint="/v1/embeddings",
host="0.0.0.0",
port=6000,
input_datatype=TextDoc,
output_datatype=EmbedDoc768,
input_datatype=EmbeddingRequest,
output_datatype=EmbeddingResponse,
)
@traceable(run_type="embedding")
@register_statistics(names=["opea_service@embedding_tei_langchain"])
def embedding(input: TextDoc) -> EmbedDoc768:
def embedding(request: EmbeddingRequest) -> EmbeddingResponse:
start = time.time()
embed_vector = embeddings.embed_query(input.text)
embed_vector = embed_vector[:768] # Keep only the first 768 elements
res = EmbedDoc768(text=input.text, embedding=embed_vector)
embed_vector = embeddings.embed_query(request.input)
embed_vector = embed_vector[: request.dimensions]
response = EmbeddingResponse(data=[EmbeddingResponseData(index=0, embedding=embed_vector)])
statistics_dict["opea_service@embedding_tei_langchain"].append_latency(time.time() - start, None)
return res
return response


if __name__ == "__main__":
(remaining diff lines collapsed)
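
Taken together, the updated microservice now accepts an OpenAI-style embeddings payload. A client-side sketch, assuming the register_microservice defaults shown above (port 6000, /v1/embeddings) and an illustrative model id:

# Illustrative client sketch: calling the updated /v1/embeddings endpoint.
# Host/port follow the register_microservice defaults in the diff (0.0.0.0:6000);
# the model id is an assumption and may be ignored by the TEI backend.
import requests

payload = {
    "input": "What is OPEA?",
    "model": "BAAI/bge-base-en-v1.5",
    "dimensions": 768,
}
resp = requests.post("http://localhost:6000/v1/embeddings", json=payload, timeout=30)
resp.raise_for_status()
embedding = resp.json()["data"][0]["embedding"]
print(len(embedding))  # expect 768 when dimensions=768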