From a7a31046ea0788224ea38e163a273f94e6af7446 Mon Sep 17 00:00:00 2001
From: Jincheng Miao
Date: Tue, 18 Jun 2024 11:32:35 +0800
Subject: [PATCH] Add a new embedding MosecEmbedding (#182)

* Add a new embedding MosecEmbedding.

Signed-off-by: Jincheng Miao

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: Jincheng Miao
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Signed-off-by: Xinyao Wang
---
 comps/embeddings/langchain-mosec/README.md    |  33 ++++++
 comps/embeddings/langchain-mosec/__init__.py  |   2 +
 .../langchain-mosec/docker/Dockerfile         |  28 +++++
 .../docker/docker_compose_embedding.yaml      |  22 ++++
 .../langchain-mosec/embedding_mosec.py        |  74 ++++++++++++
 .../langchain-mosec/mosec-docker/Dockerfile   |  23 ++++
 .../langchain-mosec/mosec-docker/README.md    |  45 +++++++
 .../mosec-docker/server-ipex.py               | 119 ++++++++++++++++++
 .../mosec-docker/test-embedding.py            |  18 +++
 .../langchain-mosec/requirements.txt          |   9 ++
 tests/test_embeddings_langchain-mosec.sh      |  63 ++++++++++
 11 files changed, 436 insertions(+)
 create mode 100644 comps/embeddings/langchain-mosec/README.md
 create mode 100644 comps/embeddings/langchain-mosec/__init__.py
 create mode 100644 comps/embeddings/langchain-mosec/docker/Dockerfile
 create mode 100644 comps/embeddings/langchain-mosec/docker/docker_compose_embedding.yaml
 create mode 100644 comps/embeddings/langchain-mosec/embedding_mosec.py
 create mode 100644 comps/embeddings/langchain-mosec/mosec-docker/Dockerfile
 create mode 100644 comps/embeddings/langchain-mosec/mosec-docker/README.md
 create mode 100644 comps/embeddings/langchain-mosec/mosec-docker/server-ipex.py
 create mode 100644 comps/embeddings/langchain-mosec/mosec-docker/test-embedding.py
 create mode 100644 comps/embeddings/langchain-mosec/requirements.txt
 create mode 100644 tests/test_embeddings_langchain-mosec.sh

diff --git a/comps/embeddings/langchain-mosec/README.md b/comps/embeddings/langchain-mosec/README.md
new file mode 100644
index 000000000..4ceedc2fa
--- /dev/null
+++ b/comps/embeddings/langchain-mosec/README.md
@@ -0,0 +1,33 @@
+# Build Mosec endpoint Docker image
+
+```
+docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy -t langchain-mosec:latest -f comps/embeddings/langchain-mosec/mosec-docker/Dockerfile .
+```
+
+# Build embedding microservice Docker image
+
+```
+docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy -t opea/embedding-langchain-mosec:latest -f comps/embeddings/langchain-mosec/docker/Dockerfile .
+```
+
+# Launch Mosec endpoint Docker container
+
+```
+docker run -d --name="embedding-langchain-mosec-endpoint" -p 6001:8000 langchain-mosec:latest
+```
+
+# Launch embedding microservice Docker container
+
+```
+export MOSEC_EMBEDDING_ENDPOINT=http://127.0.0.1:6001
+docker run -d --name="embedding-langchain-mosec-server" -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p 6000:6000 --ipc=host -e MOSEC_EMBEDDING_ENDPOINT=$MOSEC_EMBEDDING_ENDPOINT opea/embedding-langchain-mosec:latest
+```
+
+# Run client test
+
+```
+curl localhost:6000/v1/embeddings \
+  -X POST \
+  -d '{"text":"Hello, world!"}' \
+  -H 'Content-Type: application/json'
+```
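For readers who prefer Python over curl, the README's client test can be reproduced with a short script. This is a minimal sketch, assuming the `requests` package is available and the microservice is listening on localhost:6000 as launched above; the response fields follow `EmbedDoc768` from `embedding_mosec.py` later in this patch.

```python
# Minimal sketch: call the embedding microservice from Python.
# Assumes `requests` is installed and the container is listening on localhost:6000.
import requests

resp = requests.post(
    "http://localhost:6000/v1/embeddings",
    json={"text": "Hello, world!"},  # same payload as the curl test above
    timeout=30,
)
resp.raise_for_status()
doc = resp.json()
print(len(doc["embedding"]))  # 768, since the microservice truncates to EmbedDoc768
```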
diff --git a/comps/embeddings/langchain-mosec/__init__.py b/comps/embeddings/langchain-mosec/__init__.py
new file mode 100644
index 000000000..916f3a44b
--- /dev/null
+++ b/comps/embeddings/langchain-mosec/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
diff --git a/comps/embeddings/langchain-mosec/docker/Dockerfile b/comps/embeddings/langchain-mosec/docker/Dockerfile
new file mode 100644
index 000000000..2fa2e7036
--- /dev/null
+++ b/comps/embeddings/langchain-mosec/docker/Dockerfile
@@ -0,0 +1,28 @@
+
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+FROM langchain/langchain:latest
+
+RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
+    libgl1-mesa-glx \
+    libjemalloc-dev \
+    vim
+
+RUN useradd -m -s /bin/bash user && \
+    mkdir -p /home/user && \
+    chown -R user /home/user/
+
+USER user
+
+COPY comps /home/user/comps
+
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir -r /home/user/comps/embeddings/langchain-mosec/requirements.txt
+
+ENV PYTHONPATH=$PYTHONPATH:/home/user
+
+WORKDIR /home/user/comps/embeddings/langchain-mosec
+
+ENTRYPOINT ["python", "embedding_mosec.py"]
+
diff --git a/comps/embeddings/langchain-mosec/docker/docker_compose_embedding.yaml b/comps/embeddings/langchain-mosec/docker/docker_compose_embedding.yaml
new file mode 100644
index 000000000..fec800655
--- /dev/null
+++ b/comps/embeddings/langchain-mosec/docker/docker_compose_embedding.yaml
@@ -0,0 +1,22 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+version: "3.8"
+
+services:
+  embedding:
+    image: opea/embedding-langchain-mosec:latest
+    container_name: embedding-langchain-mosec-server
+    ports:
+      - "6000:6000"
+    ipc: host
+    environment:
+      http_proxy: ${http_proxy}
+      https_proxy: ${https_proxy}
+      MOSEC_EMBEDDING_ENDPOINT: ${MOSEC_EMBEDDING_ENDPOINT}
+      LANGCHAIN_API_KEY: ${LANGCHAIN_API_KEY}
+    restart: unless-stopped
+
+networks:
+  default:
+    driver: bridge
diff --git a/comps/embeddings/langchain-mosec/embedding_mosec.py b/comps/embeddings/langchain-mosec/embedding_mosec.py
new file mode 100644
index 000000000..2e033ccc0
--- /dev/null
+++ b/comps/embeddings/langchain-mosec/embedding_mosec.py
@@ -0,0 +1,74 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+import time
+from typing import List, Optional
+
+from langchain_community.embeddings import OpenAIEmbeddings
+from langsmith import traceable
+
+from comps import (
+    EmbedDoc768,
+    ServiceType,
+    TextDoc,
+    opea_microservices,
+    register_microservice,
+    register_statistics,
+    statistics_dict,
+)
+
+
+class MosecEmbeddings(OpenAIEmbeddings):
+    def _get_len_safe_embeddings(
+        self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
+    ) -> List[List[float]]:
+        _chunk_size = chunk_size or self.chunk_size
+        batched_embeddings: List[List[float]] = []
+        response = self.client.create(input=texts, **self._invocation_params)
+        if not isinstance(response, dict):
+            response = response.model_dump()
+        batched_embeddings.extend(r["embedding"] for r in response["data"])
+
+        _cached_empty_embedding: Optional[List[float]] = None
+
+        def empty_embedding() -> List[float]:
+            nonlocal _cached_empty_embedding
+            if _cached_empty_embedding is None:
+                average_embedded = self.client.create(input="", **self._invocation_params)
+                if not isinstance(average_embedded, dict):
+                    average_embedded = average_embedded.model_dump()
+                _cached_empty_embedding = average_embedded["data"][0]["embedding"]
+            return _cached_empty_embedding
+
+        return [e if e is not None else empty_embedding() for e in batched_embeddings]
+
+
+@register_microservice(
+    name="opea_service@embedding_mosec",
+    service_type=ServiceType.EMBEDDING,
+    endpoint="/v1/embeddings",
+    host="0.0.0.0",
+    port=6000,
+    input_datatype=TextDoc,
+    output_datatype=EmbedDoc768,
+)
+@traceable(run_type="embedding")
+@register_statistics(names=["opea_service@embedding_mosec"])
+def embedding(input: TextDoc) -> EmbedDoc768:
+    start = time.time()
+    embed_vector = embeddings.embed_query(input.text)
+    embed_vector = embed_vector[:768]  # Keep only the first 768 elements
+    res = EmbedDoc768(text=input.text, embedding=embed_vector)
+    statistics_dict["opea_service@embedding_mosec"].append_latency(time.time() - start, None)
+    return res
+
+
+if __name__ == "__main__":
+    MOSEC_EMBEDDING_ENDPOINT = os.environ.get("MOSEC_EMBEDDING_ENDPOINT", "http://127.0.0.1:8080")
+    os.environ["OPENAI_API_BASE"] = MOSEC_EMBEDDING_ENDPOINT
+    os.environ["OPENAI_API_KEY"] = "Dummy key"
+    MODEL_ID = "/root/bge-large-zh"
+    embeddings = MosecEmbeddings(model=MODEL_ID)
+    print("Mosec Embedding initialized.")
+    opea_microservices["opea_service@embedding_mosec"].start()
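`MosecEmbeddings` overrides `_get_len_safe_embeddings` so that raw strings, rather than pre-tokenized chunks, are sent to the Mosec endpoint, with a cached empty-input embedding as a fallback. A usage sketch outside the microservice, assuming the endpoint is reachable at localhost:6001 (per the top-level README) and that `comps` is on PYTHONPATH so the module imports cleanly:

```python
# Sketch: drive the MosecEmbeddings wrapper directly against a running endpoint.
# Assumptions: Mosec endpoint on localhost:6001; `comps` importable so that
# embedding_mosec loads; the dummy key is never checked by the local server.
import os

os.environ["OPENAI_API_BASE"] = "http://127.0.0.1:6001"
os.environ["OPENAI_API_KEY"] = "Dummy key"

from embedding_mosec import MosecEmbeddings  # run from this directory

embeddings = MosecEmbeddings(model="/root/bge-large-zh")
vec = embeddings.embed_query("What is Deep Learning?")
print(len(vec))  # 1024 for bge-large-zh; the microservice later truncates to 768
```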
diff --git a/comps/embeddings/langchain-mosec/mosec-docker/Dockerfile b/comps/embeddings/langchain-mosec/mosec-docker/Dockerfile
new file mode 100644
index 000000000..eb1e510a7
--- /dev/null
+++ b/comps/embeddings/langchain-mosec/mosec-docker/Dockerfile
@@ -0,0 +1,23 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+FROM ubuntu:22.04
+ARG DEBIAN_FRONTEND=noninteractive
+
+ENV GLIBC_TUNABLES glibc.cpu.x86_shstk=permissive
+
+COPY comps /root/comps
+
+RUN apt update && apt install -y python3 python3-pip
+RUN pip3 install torch==2.2.2 torchvision --index-url https://download.pytorch.org/whl/cpu
+RUN pip3 install intel-extension-for-pytorch==2.2.0
+RUN pip3 install transformers
+RUN pip3 install llmspec mosec
+
+RUN cd /root/ && export HF_ENDPOINT=https://hf-mirror.com && huggingface-cli download --resume-download BAAI/bge-large-zh --local-dir /root/bge-large-zh
+
+ENV EMB_MODEL="/root/bge-large-zh/"
+
+WORKDIR /root/comps/embeddings/langchain-mosec/mosec-docker
+
+CMD ["python3", "server-ipex.py"]
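The `huggingface-cli download` step above fetches BAAI/bge-large-zh through the hf-mirror endpoint at image build time. If the model needs to be staged outside the build (for example, to COPY it in instead), the same download can be scripted; a sketch using `huggingface_hub`, which is pulled in as a dependency of `transformers`:

```python
# Sketch: pre-fetch BAAI/bge-large-zh, mirroring the Dockerfile's download step.
import os

# HF_ENDPOINT is read when huggingface_hub is imported, so set it first
# (optional; only needed when routing through the hf-mirror endpoint).
os.environ.setdefault("HF_ENDPOINT", "https://hf-mirror.com")

from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="BAAI/bge-large-zh",
    local_dir="/root/bge-large-zh",  # path expected by EMB_MODEL above
    resume_download=True,            # matches --resume-download in the Dockerfile
)
```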
diff --git a/comps/embeddings/langchain-mosec/mosec-docker/README.md b/comps/embeddings/langchain-mosec/mosec-docker/README.md
new file mode 100644
index 000000000..2f87dd30b
--- /dev/null
+++ b/comps/embeddings/langchain-mosec/mosec-docker/README.md
@@ -0,0 +1,45 @@
+# Embedding Server
+
+## 1. Introduction
+
+This service exposes an OpenAI-compatible RESTful API for extracting text features.
+It is designed to run on Intel Xeon processors to accelerate embedding model serving.
+Currently the local model is BGE-large-zh.
+
+## 2. Quick Start
+
+### 2.1 Build Docker image
+
+```shell
+docker build -t embedding:latest .
+```
+
+### 2.2 Launch server
+
+```shell
+docker run -itd -p 8000:8000 embedding:latest
+```
+
+### 2.3 Client test
+
+- Restful API by curl
+
+```shell
+curl -X POST http://127.0.0.1:8000/v1/embeddings -H "Content-Type: application/json" -d '{ "model": "/root/bge-large-zh/", "input": "hello world"}'
+```
+
+- Generate embeddings from Python
+
+```python
+from openai import Client
+
+DEFAULT_MODEL = "/root/bge-large-zh/"
+SERVICE_URL = "http://127.0.0.1:8000"
+INPUT_STR = "Hello world!"
+
+client = Client(api_key="fake", base_url=SERVICE_URL)
+emb = client.embeddings.create(
+    model=DEFAULT_MODEL,
+    input=INPUT_STR,
+)
+```
diff --git a/comps/embeddings/langchain-mosec/mosec-docker/server-ipex.py b/comps/embeddings/langchain-mosec/mosec-docker/server-ipex.py
new file mode 100644
index 000000000..03d926f6b
--- /dev/null
+++ b/comps/embeddings/langchain-mosec/mosec-docker/server-ipex.py
@@ -0,0 +1,119 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import base64
+import os
+from typing import List, Union
+
+import intel_extension_for_pytorch as ipex
+import numpy as np
+import torch  # type: ignore
+import torch.nn.functional as F  # type: ignore
+import transformers  # type: ignore
+from llmspec import EmbeddingData, EmbeddingRequest, EmbeddingResponse, TokenUsage
+from mosec import ClientError, Runtime, Server, Worker
+
+DEFAULT_MODEL = "/root/bge-large-zh/"
+
+
+class Embedding(Worker):
+    def __init__(self):
+        self.model_name = os.environ.get("EMB_MODEL", DEFAULT_MODEL)
+        self.tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_name)
+        self.model = transformers.AutoModel.from_pretrained(self.model_name)
+        self.device = torch.cuda.current_device() if torch.cuda.is_available() else "cpu"
+
+        self.model = self.model.to(self.device)
+        self.model.eval()
+
+        # jit trace model
+        self.model = ipex.optimize(self.model, dtype=torch.bfloat16)
+        vocab_size = self.model.config.vocab_size
+        batch_size = 16
+        seq_length = 512
+        d = torch.randint(vocab_size, size=[batch_size, seq_length])
+        t = torch.randint(0, 1, size=[batch_size, seq_length])
+        m = torch.randint(1, 2, size=[batch_size, seq_length])
+        self.model = torch.jit.trace(self.model, [d, t, m], check_trace=False, strict=False)
+        self.model = torch.jit.freeze(self.model)
+        self.model(d, t, m)
+
+    def get_embedding_with_token_count(self, sentences: Union[str, List[Union[str, List[int]]]]):
+        # Mean Pooling - Take attention mask into account for correct averaging
+        def mean_pooling(model_output, attention_mask):
+            # First element of model_output contains all token embeddings
+            token_embeddings = model_output["last_hidden_state"]
+            input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+            return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
+                input_mask_expanded.sum(1), min=1e-9
+            )
+
+        # Tokenize sentences
+        # TODO: support `List[List[int]]` input
+        encoded_input = self.tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
+        inputs = encoded_input.to(self.device)
+        token_count = inputs["attention_mask"].sum(dim=1).tolist()
+        # Compute token embeddings
+        model_output = self.model(**inputs)
+        # Perform pooling
+        sentence_embeddings = mean_pooling(model_output, inputs["attention_mask"])
+        # Normalize embeddings
+        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
+
+        return token_count, sentence_embeddings
+
+    def deserialize(self, data: bytes) -> EmbeddingRequest:
+        return EmbeddingRequest.from_bytes(data)
+
+    def serialize(self, data: EmbeddingResponse) -> bytes:
+        return data.to_json()
+
+    def forward(self, data: List[EmbeddingRequest]) -> List[EmbeddingResponse]:
+        inputs = []
+        inputs_lens = []
+        for d in data:
+            inputs.extend(d.input if isinstance(d.input, list) else [d.input])
+            inputs_lens.append(len(d.input) if isinstance(d.input, list) else 1)
+        token_cnt, embeddings = self.get_embedding_with_token_count(inputs)
+
+        embeddings = embeddings.detach()
+        if self.device != "cpu":
+            embeddings = embeddings.cpu()
+        embeddings = embeddings.numpy()
+        embeddings = [emb.tolist() for emb in embeddings]
+
+        resp = []
+        emb_idx = 0
+        for lens in inputs_lens:
+            token_count = sum(token_cnt[emb_idx : emb_idx + lens])
+            resp.append(
+                EmbeddingResponse(
+                    data=[
+                        EmbeddingData(embedding=emb, index=i)
+                        for i, emb in enumerate(embeddings[emb_idx : emb_idx + lens])
+                    ],
+                    model=self.model_name,
+                    usage=TokenUsage(
+                        prompt_tokens=token_count,
+                        # No completions performed, only embeddings generated.
+                        completion_tokens=0,
+                        total_tokens=token_count,
+                    ),
+                )
+            )
+            emb_idx += lens
+        return resp
+
+
+if __name__ == "__main__":
+    MAX_BATCH_SIZE = int(os.environ.get("MAX_BATCH_SIZE", 128))
+    MAX_WAIT_TIME = int(os.environ.get("MAX_WAIT_TIME", 10))
+    server = Server()
+    emb = Runtime(Embedding, max_batch_size=MAX_BATCH_SIZE, max_wait_time=MAX_WAIT_TIME)
+    server.register_runtime(
+        {
+            "/v1/embeddings": [emb],
+            "/embeddings": [emb],
+        }
+    )
+    server.run()
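`get_embedding_with_token_count` implements the standard BGE-style sentence-embedding recipe: masked mean pooling over `last_hidden_state` followed by L2 normalization. The same computation, extracted into a self-contained sketch without IPEX or jit tracing (the model name is an assumption; any BERT-style encoder with this output layout works):

```python
# Sketch: the masked mean pooling + L2 normalization used in server-ipex.py,
# shown standalone. Assumes `torch` and `transformers` are installed.
import torch
import torch.nn.functional as F
import transformers

tokenizer = transformers.AutoTokenizer.from_pretrained("BAAI/bge-large-zh")
model = transformers.AutoModel.from_pretrained("BAAI/bge-large-zh")
model.eval()

inputs = tokenizer(["hello world", "你好，世界"], padding=True, truncation=True, return_tensors="pt")
with torch.no_grad():
    out = model(**inputs)

token_embeddings = out.last_hidden_state               # [batch, seq, hidden]
mask = inputs["attention_mask"].unsqueeze(-1).float()  # [batch, seq, 1]
pooled = (token_embeddings * mask).sum(1) / mask.sum(1).clamp(min=1e-9)
sentence_embeddings = F.normalize(pooled, p=2, dim=1)  # unit-length rows
print(sentence_embeddings.shape)                       # torch.Size([2, 1024])
```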
diff --git a/comps/embeddings/langchain-mosec/mosec-docker/test-embedding.py b/comps/embeddings/langchain-mosec/mosec-docker/test-embedding.py
new file mode 100644
index 000000000..4334249b9
--- /dev/null
+++ b/comps/embeddings/langchain-mosec/mosec-docker/test-embedding.py
@@ -0,0 +1,18 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+"""OpenAI embedding client example."""
+
+from openai import Client
+
+DEFAULT_MODEL = "/root/bge-large-zh/"
+SERVICE_URL = "http://127.0.0.1:8000"
+INPUT_STR = "Hello world!"
+
+client = Client(api_key="fake", base_url=SERVICE_URL)
+emb = client.embeddings.create(
+    model=DEFAULT_MODEL,
+    input=INPUT_STR,
+)
+
+print(len(emb.data))  # type: ignore
+print(emb.data[0].embedding)  # type: ignore
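`test-embedding.py` sends a single string, but `forward` in `server-ipex.py` also accepts a list of inputs per request, splitting the flattened batch back out by `inputs_lens` and summing token usage. A batched variant of the client, as a sketch against the same endpoint:

```python
# Sketch: batched request against the endpoint used by test-embedding.py.
# The server returns one EmbeddingData per input plus aggregate token usage.
from openai import Client

client = Client(api_key="fake", base_url="http://127.0.0.1:8000")
emb = client.embeddings.create(
    model="/root/bge-large-zh/",
    input=["What is Deep Learning?", "什么是深度学习？"],
)
print(len(emb.data))            # 2, one embedding per input
print(emb.usage.prompt_tokens)  # token count summed across the batch
```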
+} + +function build_docker_images() { + cd $WORKPATH + echo $(pwd) + docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy --no-cache -t opea/embedding-langchain-mosec:comps -f comps/embeddings/langchain-mosec/docker/Dockerfile . +} + +function start_service() { + mosec_endpoint=5001 + model="BAAI/bge-large-en-v1.5" + unset http_proxy + docker run -d --name="test-comps-embedding-langchain-mosec-endpoint" -p $mosec_endpoint:8000 langchain-mosec:comps + export MOSEC_EMBEDDING_ENDPOINT="http://${ip_address}:${mosec_endpoint}" + mosec_service_port=5002 + docker run -d --name="test-comps-embedding-langchain-mosec-server" -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p ${mosec_service_port}:6000 --ipc=host -e MOSEC_EMBEDDING_ENDPOINT=$MOSEC_EMBEDDING_ENDPOINT opea/embedding-langchain-mosec:comps + sleep 3m +} + +function validate_microservice() { + mosec_service_port=5002 + http_proxy="" curl http://${ip_address}:$mosec_service_port/v1/embeddings \ + -X POST \ + -d '{"text":"What is Deep Learning?"}' \ + -H 'Content-Type: application/json' +} + +function stop_docker() { + cid=$(docker ps -aq --filter "name=test-comps-embedding-langchain-mosec-*") + if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi +} + +function main() { + + stop_docker + + build_mosec_docker_images + + build_docker_images + + start_service + + validate_microservice + + stop_docker + echo y | docker system prune + +} + +main