From ad685edf96cd8d510687007fd7f667a19cee6fb4 Mon Sep 17 00:00:00 2001 From: Tiep Le Date: Thu, 22 Aug 2024 22:18:10 +0000 Subject: [PATCH 1/4] multimodal embedding for MM RAG for videos Signed-off-by: Tiep Le --- comps/__init__.py | 4 + comps/cores/proto/docarray.py | 38 ++- .../multimodal_embeddings/README.md | 185 +++++++++++++ .../multimodal_embeddings/__init__.py | 2 + .../bridgetower/__init__.py | 5 + .../bridgetower/bridgetower_custom.py | 243 ++++++++++++++++++ .../bridgetower/bridgetower_embedding.py | 119 +++++++++ .../bridgetower/bridgetower_server.py | 153 +++++++++++ .../bridgetower/docker/Dockerfile | 25 ++ .../bridgetower/docker/Dockerfile_hpu | 29 +++ ...ompose_bridgetower_embedding_endpoint.yaml | 19 ++ .../bridgetower/utils.py | 90 +++++++ .../multimodal_langchain/__init__.py | 2 + .../multimodal_langchain/docker/Dockerfile | 29 +++ .../docker_compose_multimodal_embedding.yaml | 21 ++ .../local_mm_embedding.py | 61 +++++ .../multimodal_langchain/mm_embedding_mmei.py | 84 ++++++ .../multimodal_langchain/requirements.txt | 15 ++ ...est_multimodal_embeddings_langchain_cpu.sh | 111 ++++++++ ...est_multimodal_embeddings_langchain_hpu.sh | 111 ++++++++ 20 files changed, 1344 insertions(+), 2 deletions(-) create mode 100644 comps/embeddings/multimodal_embeddings/README.md create mode 100644 comps/embeddings/multimodal_embeddings/__init__.py create mode 100644 comps/embeddings/multimodal_embeddings/bridgetower/__init__.py create mode 100644 comps/embeddings/multimodal_embeddings/bridgetower/bridgetower_custom.py create mode 100644 comps/embeddings/multimodal_embeddings/bridgetower/bridgetower_embedding.py create mode 100644 comps/embeddings/multimodal_embeddings/bridgetower/bridgetower_server.py create mode 100644 comps/embeddings/multimodal_embeddings/bridgetower/docker/Dockerfile create mode 100644 comps/embeddings/multimodal_embeddings/bridgetower/docker/Dockerfile_hpu create mode 100644 comps/embeddings/multimodal_embeddings/bridgetower/docker/docker_compose_bridgetower_embedding_endpoint.yaml create mode 100644 comps/embeddings/multimodal_embeddings/bridgetower/utils.py create mode 100644 comps/embeddings/multimodal_embeddings/multimodal_langchain/__init__.py create mode 100644 comps/embeddings/multimodal_embeddings/multimodal_langchain/docker/Dockerfile create mode 100644 comps/embeddings/multimodal_embeddings/multimodal_langchain/docker/docker_compose_multimodal_embedding.yaml create mode 100644 comps/embeddings/multimodal_embeddings/multimodal_langchain/local_mm_embedding.py create mode 100644 comps/embeddings/multimodal_embeddings/multimodal_langchain/mm_embedding_mmei.py create mode 100644 comps/embeddings/multimodal_embeddings/multimodal_langchain/requirements.txt create mode 100644 tests/test_multimodal_embeddings_langchain_cpu.sh create mode 100644 tests/test_multimodal_embeddings_langchain_hpu.sh diff --git a/comps/__init__.py b/comps/__init__.py index a5d00f9e0..10c5835fc 100644 --- a/comps/__init__.py +++ b/comps/__init__.py @@ -19,6 +19,10 @@ GraphDoc, LVMDoc, LVMVideoDoc, + ImageDoc, + TextImageDoc, + MultimodalDoc, + EmbedMultimodalDoc, ) # Constants diff --git a/comps/cores/proto/docarray.py b/comps/cores/proto/docarray.py index 0d397dcb7..4d2c427ae 100644 --- a/comps/cores/proto/docarray.py +++ b/comps/cores/proto/docarray.py @@ -6,7 +6,7 @@ import numpy as np from docarray import BaseDoc, DocList from docarray.documents import AudioDoc, VideoDoc -from docarray.typing import AudioUrl +from docarray.typing import AudioUrl, ImageUrl from pydantic import Field, 
conint, conlist, field_validator
@@ -17,7 +17,30 @@ class TopologyInfo:
 
 
 class TextDoc(BaseDoc, TopologyInfo):
-    text: str
+    text: Optional[str] = None
+
+
+class ImageDoc(BaseDoc):
+    url: Optional[ImageUrl] = Field(
+        description="The path to the image. It can be remote (Web) URL, or a local file path",
+        default=None,
+    )
+    base64_image: Optional[str] = Field(
+        description="The base64-based encoding of the image",
+        default=None,
+    )
+
+
+class TextImageDoc(BaseDoc):
+    image: Optional[ImageDoc] = None
+    text: Optional[TextDoc] = None
+
+
+MultimodalDoc = Union[
+    TextDoc,
+    ImageDoc,
+    TextImageDoc,
+]
 
 
 class Base64ByteStrDoc(BaseDoc):
@@ -43,6 +66,17 @@ class EmbedDoc(BaseDoc):
     score_threshold: float = 0.2
 
 
+class EmbedMultimodalDoc(EmbedDoc):
+    # extend EmbedDoc with these attributes
+    url: Optional[ImageUrl] = Field(
+        description="The path to the image. It can be remote (Web) URL, or a local file path.",
+        default=None,
+    )
+    base64_image: Optional[str] = Field(
+        description="The base64-based encoding of the image.",
+        default=None,
+    )
+
 class Audio2TextDoc(AudioDoc):
     url: Optional[AudioUrl] = Field(
         description="The path to the audio.",
diff --git a/comps/embeddings/multimodal_embeddings/README.md b/comps/embeddings/multimodal_embeddings/README.md
new file mode 100644
index 000000000..dd926220e
--- /dev/null
+++ b/comps/embeddings/multimodal_embeddings/README.md
@@ -0,0 +1,185 @@
+# Multimodal Embeddings Microservice
+
+The Multimodal Embedding Microservice is designed to efficiently convert pairs of text and image into vectorized embeddings, facilitating seamless integration into various machine learning and data processing workflows. This service utilizes advanced algorithms to generate high-quality embeddings that capture the joint semantic essence of each text-image pair, making it ideal for applications in multimodal data processing, information retrieval, and similar fields.
+
+Key Features:
+
+**High Performance**: Optimized for quick and reliable conversion of text and image inputs into vector embeddings.
+
+**Scalability**: Built to handle high volumes of requests simultaneously, ensuring robust performance even under heavy loads.
+
+**Ease of Integration**: Provides a simple and intuitive API, allowing for straightforward integration into existing systems and workflows.
+
+**Customizable**: Supports configuration and customization to meet specific use case requirements, including different embedding models and preprocessing techniques.
+
+Users are able to configure and build embedding-related services according to their actual needs.
+
+## 🚀1. Start Microservice with Python (Option 1)
+
+Currently, we provide two ways to implement the multimodal embedding service:
+
+1. Run the multimodal embedding model **locally** on the server, which is faster but takes up memory on the local server.
+2. Use a multimodal embedding inference endpoint (**MMEI endpoint**), which provides more flexibility but may add some network latency.
+
+For both implementations, you need to install the requirements first.
+
+### 1.1 Install Requirements
+
+```bash
+# run with langchain
+pip install -r multimodal_langchain/requirements.txt
+```
+
+### 1.2 Start Embedding Service
+
+You can select one of the following ways to start the multimodal embedding service:
+
+**Start Multimodal Embedding Service with MMEI**
+
+First, you need to start an MMEI service.
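+
+Under the hood, the MMEI service exposes a `/v1/encode` HTTP endpoint: it accepts a JSON body with a `text` field and an optional base64-encoded image, and returns a JSON object with an `embedding` field (these field names follow the BridgeTower server added in this PR). A request sketch:
+
+```json
+{ "text": "An example caption", "img_b64_str": "<base64-encoded image bytes>" }
+```
+
+Set the port that the embedder will listen on: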
+
+```bash
+export your_mmei_port=8080
+export EMBEDDER_PORT=$your_mmei_port
+```
+
+Currently, we employ the [**BridgeTower**](https://huggingface.co/BridgeTower/bridgetower-large-itm-mlm-gaudi) model for MMEI and provide two ways to start it:
+
+1. Start MMEI on Gaudi2 HPU
+2. Start MMEI on Xeon CPU (if Gaudi2 HPU is not available)
+
+- Gaudi2 HPU
+
+```bash
+cd ../../..
+docker build -t opea/bridgetower-embedder:latest --build-arg EMBEDDER_PORT=$EMBEDDER_PORT --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/embeddings/multimodal_embeddings/bridgetower/docker/Dockerfile_hpu .
+cd comps/embeddings/multimodal_embeddings/bridgetower/docker/
+docker compose -f docker_compose_bridgetower_embedding_endpoint.yaml up -d
+```
+
+- Xeon CPU
+
+```bash
+cd ../../..
+docker build -t opea/bridgetower-embedder:latest --build-arg EMBEDDER_PORT=$EMBEDDER_PORT --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/embeddings/multimodal_embeddings/bridgetower/docker/Dockerfile .
+cd comps/embeddings/multimodal_embeddings/bridgetower/docker/
+docker compose -f docker_compose_bridgetower_embedding_endpoint.yaml up -d
+```
+
+Then test your MMEI service with the following command:
+
+```bash
+curl http://localhost:$your_mmei_port/v1/encode \
+    -X POST \
+    -H "Content-Type:application/json" \
+    -d '{"text":"This is an example"}'
+```
+
+Then start the embedding microservice, pointing it at the MMEI endpoint via `MMEI_EMBEDDING_ENDPOINT`:
+
+```bash
+# run with langchain
+cd multimodal_langchain
+export MMEI_EMBEDDING_ENDPOINT="http://localhost:$your_mmei_port/v1/encode"
+export your_embedding_port_microservice=6600
+export MM_EMBEDDING_PORT_MICROSERVICE=$your_embedding_port_microservice
+python mm_embedding_mmei.py
+```
+
+**Start Embedding Service with Local Model**
+
+```bash
+# run with langchain
+cd multimodal_langchain
+export your_embedding_port_microservice=6600
+export MM_EMBEDDING_PORT_MICROSERVICE=$your_embedding_port_microservice
+python local_mm_embedding.py
+```
+
+## 🚀2. Start Microservice with Docker (Option 2)
+
+### 2.1 Start Multimodal Embedding Inference (MMEI) Service
+
+First, you need to start an MMEI service.
+
+```bash
+export your_mmei_port=8080
+export EMBEDDER_PORT=$your_mmei_port
+```
+
+Currently, we employ the [**BridgeTower**](https://huggingface.co/BridgeTower/bridgetower-large-itm-mlm-gaudi) model for MMEI and provide two ways to start it:
+
+1. Start MMEI on Gaudi2 HPU
+2. Start MMEI on Xeon CPU (if Gaudi2 HPU is not available)
+
+- Gaudi2 HPU
+
+```bash
+cd ../../..
+docker build -t opea/bridgetower-embedder:latest --build-arg EMBEDDER_PORT=$EMBEDDER_PORT --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/embeddings/multimodal_embeddings/bridgetower/docker/Dockerfile_hpu .
+cd comps/embeddings/multimodal_embeddings/bridgetower/docker/
+docker compose -f docker_compose_bridgetower_embedding_endpoint.yaml up -d
+```
+
+- Xeon CPU
+
+```bash
+cd ../../..
+docker build -t opea/bridgetower-embedder:latest --build-arg EMBEDDER_PORT=$EMBEDDER_PORT --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/embeddings/multimodal_embeddings/bridgetower/docker/Dockerfile .
+cd comps/embeddings/multimodal_embeddings/bridgetower/docker/
+docker compose -f docker_compose_bridgetower_embedding_endpoint.yaml up -d
+```
+
+Then test your MMEI service with the following command:
+
+```bash
+curl http://localhost:$your_mmei_port/v1/encode \
+    -X POST \
+    -H "Content-Type:application/json" \
+    -d '{"text":"This is an example"}'
+```
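+
+You can also exercise the endpoint from Python. Below is a minimal sketch; it assumes the `requests` package is installed, that the service is reachable on `$your_mmei_port` (8080 here), and that a local `example.jpg` exists:
+
+```python
+import base64
+
+import requests
+
+# The server accepts "text" plus an optional base64-encoded image
+# under "img_b64_str", and returns {"embedding": [...]}.
+with open("example.jpg", "rb") as f:
+    img_b64_str = base64.b64encode(f.read()).decode("utf-8")
+
+response = requests.post(
+    "http://localhost:8080/v1/encode",
+    json={"text": "An example caption", "img_b64_str": img_b64_str},
+    timeout=60,
+)
+response.raise_for_status()
+embedding = response.json()["embedding"]
+print(f"embedding dimension: {len(embedding)}")
+```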
+
+Export the `MMEI_EMBEDDING_ENDPOINT` for later usage:
+
+```bash
+export ip_address=$(hostname -I | awk '{print $1}')
+export MMEI_EMBEDDING_ENDPOINT="http://$ip_address:$your_mmei_port/v1/encode"
+```
+
+### 2.2 Build Docker Image
+
+#### Build Langchain Docker
+
+```bash
+cd ../../..
+docker build -t opea/embedding-multimodal:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/embeddings/multimodal_embeddings/multimodal_langchain/docker/Dockerfile .
+```
+
+### 2.3 Run Docker with Docker Compose
+
+```bash
+cd multimodal_langchain/docker
+export your_embedding_port_microservice=6600
+export MM_EMBEDDING_PORT_MICROSERVICE=$your_embedding_port_microservice
+docker compose -f docker_compose_multimodal_embedding.yaml up -d
+```
+
+## 🚀3. Consume Embedding Service
+
+### 3.1 Consume Embedding Service
+
+**Compute a joint embedding of an image-text pair**
+
+```bash
+curl -X POST http://0.0.0.0:6600/v1/embeddings \
+    -H "Content-Type: application/json" \
+    -d '{"text": {"text" : "This is some sample text."}, "image" : {"url": "https://github.com/docarray/docarray/blob/main/tests/toydata/image-data/apple.png?raw=true"}}'
+```
+
+**Compute an embedding of text only**
+
+```bash
+curl -X POST http://0.0.0.0:6600/v1/embeddings \
+    -H "Content-Type: application/json" \
+    -d '{"text" : "This is some sample text."}'
+```
\ No newline at end of file
diff --git a/comps/embeddings/multimodal_embeddings/__init__.py b/comps/embeddings/multimodal_embeddings/__init__.py
new file mode 100644
index 000000000..4582b4f9a
--- /dev/null
+++ b/comps/embeddings/multimodal_embeddings/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
\ No newline at end of file
diff --git a/comps/embeddings/multimodal_embeddings/bridgetower/__init__.py b/comps/embeddings/multimodal_embeddings/bridgetower/__init__.py
new file mode 100644
index 000000000..e5d90e19c
--- /dev/null
+++ b/comps/embeddings/multimodal_embeddings/bridgetower/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+from .bridgetower_embedding import BridgeTowerEmbedding
+from .bridgetower_custom import BridgeTowerTextFeatureExtractor, BridgeTowerForITC
\ No newline at end of file
diff --git a/comps/embeddings/multimodal_embeddings/bridgetower/bridgetower_custom.py b/comps/embeddings/multimodal_embeddings/bridgetower/bridgetower_custom.py
new file mode 100644
index 000000000..a9260a7b1
--- /dev/null
+++ b/comps/embeddings/multimodal_embeddings/bridgetower/bridgetower_custom.py
@@ -0,0 +1,243 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+from collections import OrderedDict
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+from torchvision import transforms
+from torchvision.transforms import CenterCrop, Compose, Normalize, Resize, ToTensor
+from transformers import BridgeTowerModel, BridgeTowerPreTrainedModel
+from transformers.modeling_outputs import SequenceClassifierOutput
+from transformers.models.bridgetower.modeling_bridgetower import 
( + BridgeTowerContrastiveHead, + BridgeTowerTextModel, + BridgeTowerVisionModel, +) + + +class LayerNorm(nn.LayerNorm): + """Subclass torch's LayerNorm to handle fp16.""" + + def forward(self, x: torch.Tensor): + orig_type = x.dtype + ret = super().forward(x.type(torch.float32)) + return ret.type(orig_type) + + +class BridgeTowerImageFeatureExtractor(nn.Module): + def __init__( + self, + patch_size=14, + width=1024, + resolution_after=294, + ckpt_path=None, + ): + super().__init__() + + self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False) + + scale = width**-0.5 + self.class_embedding = nn.Parameter(scale * torch.randn(width)) + self.positional_embedding = nn.Parameter(scale * torch.randn((resolution_after // patch_size) ** 2 + 1, width)) + self.ln_pre = LayerNorm(width) + + if ckpt_path is not None: + sd = torch.load(ckpt_path) + if "state_dict" in sd: + sd = sd["state_dict"] + print(f"Loading feature extractor checkpoint from {ckpt_path}") + self.load_state_dict(sd) + + def forward(self, x: torch.Tensor): + x = self.conv1(x) # shape = [*, width, grid, grid] + x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2] + x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] + t = self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device) + x = torch.cat([t, x], dim=1) # shape = [*, grid ** 2 + 1, width] + x = x + self.positional_embedding.to(x.dtype) + x = self.ln_pre(x) + x = x.permute(1, 0, 2) # NLD -> LND + return x + + +class BridgeTowerITCHead(nn.Module): + def __init__(self, hidden_size, embed_size): + super().__init__() + self.fc = nn.Linear(hidden_size, embed_size) + + def forward(self, x): + x = self.fc(x) + return x + + +class _BridgeTowerTextModelWrapper(nn.Module): + def __init__(self, config): + super().__init__() + self.text_model = BridgeTowerTextModel(config) + + def forward(self, **kwargs): + return self.text_model(**kwargs) + + +class _BridgeTowerVisionModelWrapper(nn.Module): + def __init__(self, config): + super().__init__() + self.vision_model = BridgeTowerVisionModel(config.vision_config) + + if config.share_cross_modal_transformer_layers: + self.cross_modal_image_transform = nn.Linear(config.vision_config.hidden_size, config.hidden_size) + else: + self.cross_modal_image_transform = nn.ModuleList( + [ + nn.Linear(config.vision_config.hidden_size, config.hidden_size) + for _ in range(config.num_hidden_layers) + ] + ) + self.token_type_embeddings = nn.Embedding(2, config.hidden_size) + + def forward(self, **kwargs): + return self.vision_model(**kwargs) + + +class BridgeTowerVisionFeatureExtractor(BridgeTowerPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.bridgetower = _BridgeTowerVisionModelWrapper(config) + self.itc_image_head = BridgeTowerContrastiveHead(config.hidden_size, config.contrastive_hidden_size) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[torch.LongTensor] = None, + ): + + outputs = self.bridgetower(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True) + final_hidden_cls = 
outputs.hidden_states[-1][:, 0, :] + + image_embeds_with_ln = self.bridgetower.vision_model.visual.forward_post(final_hidden_cls) + image_token_type_embeddings = self.bridgetower.token_type_embeddings( + torch.full((1,), 1, dtype=torch.long, device=self.bridgetower.token_type_embeddings.weight.device) + ).expand_as(image_embeds_with_ln) + + image_embeds = self.bridgetower.cross_modal_image_transform(image_embeds_with_ln) + image_token_type_embeddings + + final_hidden_cls = F.normalize(self.itc_image_head(image_embeds), dim=-1, p=2) + + return final_hidden_cls + + +class BridgeTowerTextFeatureExtractor(BridgeTowerPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.bridgetower = _BridgeTowerTextModelWrapper(config.text_config) + self.itc_text_head = BridgeTowerITCHead(config.hidden_size, config.contrastive_hidden_size) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[torch.LongTensor] = None, + ): + + outputs = self.bridgetower(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True) + final_hidden_cls = outputs.hidden_states[-1][:, 0, :] + final_hidden_cls = F.normalize(self.itc_text_head(final_hidden_cls), dim=-1, p=2) + + return final_hidden_cls + + +class BridgeTowerForITC(BridgeTowerPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.bridgetower = BridgeTowerModel(config) + + self.itc_text_head = BridgeTowerITCHead(config.hidden_size, config.contrastive_hidden_size) + self.itc_image_head = BridgeTowerITCHead(config.hidden_size, config.contrastive_hidden_size) + self.itc_cross_modal_head = BridgeTowerITCHead(config.hidden_size * 2, config.contrastive_hidden_size) + + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + pixel_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + image_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[torch.LongTensor] = None, + ) -> Union[SequenceClassifierOutput, Tuple[torch.FloatTensor]]: + + assert output_hidden_states, "output_hidden_states should be set to True for BridgeTowerForITC" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bridgetower( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + pixel_values=pixel_values, + pixel_mask=pixel_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + image_embeds=image_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooler_output = outputs.pooler_output if return_dict else outputs[2] + + hidden_states_txt, hidden_states_img, hidden_states_cross_modal = outputs.hidden_states + + final_hidden_txt = 
hidden_states_txt[-1] + final_hidden_img = hidden_states_img[-1] + + image_embeds_with_ln = self.bridgetower.vision_model.visual.forward_post(final_hidden_img) + image_token_type_embeddings = self.bridgetower.token_type_embeddings( + torch.full((1,), 1, dtype=torch.long, device=self.bridgetower.token_type_embeddings.weight.device) + ).expand_as(image_embeds_with_ln) + + final_hidden_img = ( + self.bridgetower.cross_modal_image_transform(image_embeds_with_ln) + image_token_type_embeddings + ) + + final_hidden_txt = F.normalize(self.itc_text_head(final_hidden_txt[:, 0, :]), dim=-1, p=2) + final_hidden_img = F.normalize(self.itc_image_head(final_hidden_img[:, 0, :]), dim=-1, p=2) + final_hidden_cross = F.normalize(self.itc_cross_modal_head(pooler_output), dim=-1, p=2) + + logits = torch.stack([final_hidden_txt, final_hidden_img, final_hidden_cross], dim=-2) + + if not return_dict: + return tuple(logits) + + return SequenceClassifierOutput( + loss=None, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) \ No newline at end of file diff --git a/comps/embeddings/multimodal_embeddings/bridgetower/bridgetower_embedding.py b/comps/embeddings/multimodal_embeddings/bridgetower/bridgetower_embedding.py new file mode 100644 index 000000000..99f9fb6dd --- /dev/null +++ b/comps/embeddings/multimodal_embeddings/bridgetower/bridgetower_embedding.py @@ -0,0 +1,119 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from typing import Any, List + +import torch +from langchain_core.embeddings import Embeddings +from langchain_core.pydantic_v1 import BaseModel, Extra +from PIL import Image +from transformers import BridgeTowerProcessor + +from .bridgetower_custom import BridgeTowerForITC, BridgeTowerTextFeatureExtractor + + +class BridgeTowerEmbedding(BaseModel, Embeddings): + """BridgeTower embedding model.""" + + model_name: str = "BridgeTower/bridgetower-large-itm-mlm-itc" + device: str = "cpu" + TEXT_MODEL: Any + PROCESSOR: Any + MODEL: Any + + def __init__(self, **kwargs: Any): + """Initialize the BridgeTowerEmbedding class.""" + super().__init__(**kwargs) + + if "device" in kwargs: + if kwargs["device"] == "hpu": + try: + import habana_frameworks.torch.core as htcore + + self.device = torch.device("hpu") + except ImportError: + self.device = "cpu" + elif kwargs["device"] == "gpu": + if torch.cuda.is_available(): + self.device = "cuda" + else: + self.device = "cpu" + + self.TEXT_MODEL = BridgeTowerTextFeatureExtractor.from_pretrained(self.model_name).to(self.device) + self.PROCESSOR = BridgeTowerProcessor.from_pretrained(self.model_name) + self.MODEL = BridgeTowerForITC.from_pretrained(self.model_name).to(self.device) + + class Config: + """Configuration for this pydantic object.""" + + extra = Extra.forbid + + def embed_documents(self, texts: List[str]) -> List[List[float]]: + """Embed a list of documents using BridgeTower. + Args: + texts: The list of texts to embed. + Returns: + List of embeddings, one for each text. + """ + encodings = self.PROCESSOR.tokenizer(texts, return_tensors="pt").to(self.device) + with torch.no_grad(): + outputs = self.TEXT_MODEL(**encodings) + embeddings = outputs.cpu().numpy().tolist() + return embeddings + + def embed_query(self, text: str) -> List[float]: + """Embed a query using BridgeTower. + Args: + text: The text to embed. + Returns: + Embeddings for the text. 
+        """
+        return self.embed_documents([text])[0]
+
+    def embed_image_text_pairs(self, texts: List[str], images: list[Image], batch_size=2) -> List[List[float]]:  # type: ignore
+        """Embed a list of image-text pairs using BridgeTower.
+        Args:
+            texts: The list of texts to embed.
+            images: The list of path-to-images to embed
+            batch_size: the batch size to process, default to 2
+        Returns:
+            List of embeddings, one for each image-text pairs.
+        """
+
+        # the length of texts must be equal to the length of images
+        assert len(texts) == len(images), "the len of captions should be equal to the len of images"
+
+        image_list = []
+        text_list = []
+        embeddings = []
+        for pil_img, text in zip(images, texts):
+            img = pil_img.convert("RGB")
+            image_list.append(img)
+            text_list.append(text)
+            if len(text_list) == batch_size:
+                batch = self.PROCESSOR(
+                    image_list, text_list, return_tensors="pt", max_length=200, padding="max_length", truncation=True
+                ).to(self.device)
+                with torch.no_grad():
+                    batch_embeddings = self.MODEL(**batch, output_hidden_states=True)
+
+                for i in range(len(text_list)):
+                    embeddings.append(batch_embeddings.logits[i, 2, :].detach().cpu().numpy().tolist())
+                image_list = []
+                text_list = []
+        # embed the remaining pairs, using the same max_length as the full batches
+        # so that all pairs are tokenized consistently
+        if len(text_list) > 0:
+            batch = self.PROCESSOR(
+                image_list, text_list, return_tensors="pt", max_length=200, padding="max_length", truncation=True
+            ).to(self.device)
+            with torch.no_grad():
+                batch_embeddings = self.MODEL(**batch, output_hidden_states=True)
+            for i in range(len(text_list)):
+                embeddings.append(batch_embeddings.logits[i, 2, :].detach().cpu().numpy().tolist())
+            image_list = []
+            text_list = []
+        return embeddings
\ No newline at end of file
diff --git a/comps/embeddings/multimodal_embeddings/bridgetower/bridgetower_server.py b/comps/embeddings/multimodal_embeddings/bridgetower/bridgetower_server.py
new file mode 100644
index 000000000..200773203
--- /dev/null
+++ b/comps/embeddings/multimodal_embeddings/bridgetower/bridgetower_server.py
@@ -0,0 +1,153 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import argparse
+import asyncio
+import base64
+import os
+import uuid
+from functools import partial
+from io import BytesIO
+from typing import List
+
+import PIL
+import PIL.Image
+import requests
+import uvicorn
+from fastapi import BackgroundTasks, FastAPI, Request
+from fastapi.responses import JSONResponse, Response
+from utils import build_logger
+
+from comps.embeddings.multimodal_embeddings.bridgetower import BridgeTowerEmbedding
+
+worker_id = str(uuid.uuid4())[:6]
+print(f"worker_id: {worker_id}")
+logger = build_logger("embedding_worker", f"bridgetower_embedding_worker_{worker_id}.log")
+model_semaphore = None
+global_counter = 0
+
+model_name_or_path = None
+model_dtype = None
+use_hpu_graphs = True
+
+
+app = FastAPI()
+
+
+def release_model_semaphore(fn=None):
+    model_semaphore.release()
+    if fn is not None:
+        fn()
+
+
+def get_queue_length():
+    if model_semaphore is None:
+        return 0
+    else:
+        return (
+            args.limit_model_concurrency
+            - model_semaphore._value
+            + (len(model_semaphore._waiters) if model_semaphore._waiters is not None else 0)
+        )
+
+
+def get_status():
+    return {
+        "model_names": [model_name_or_path],
+        "speed": 1,
+        "queue_length": get_queue_length(),
+        "global_counter": global_counter,
+    }
+
+
+@app.get("/v1/health_check")
+async def health() -> Response:
+    """Health 
check.""" + return Response(status_code=200, content=b'{"message" : "BridgeTower server is running..."}') + + +@app.post("/v1/encode") +async def encode(request: Request) -> Response: + global model_semaphore, global_counter + global_counter += 1 + + request_dict = await request.json() + if model_semaphore is None: + model_semaphore = asyncio.Semaphore(args.limit_model_concurrency) + await model_semaphore.acquire() + + text = request_dict.pop("text") + image = None + if "img_b64_str" in request_dict.keys(): + img_b64_str = request_dict.pop("img_b64_str") + image = PIL.Image.open(BytesIO(base64.b64decode(img_b64_str))) + if image is None: + # embed text only + embeddings = embedder.embed_documents([text])[0] + else: + # embed image and text pair + embeddings = embedder.embed_image_text_pairs([text], [image], batch_size=1)[0] + + background_tasks = BackgroundTasks() + background_tasks.add_task(partial(release_model_semaphore)) + return JSONResponse( + status_code=200, + content={ + "embedding": embeddings, + }, + background=background_tasks, + ) + + +@app.post("/v1/worker_get_status") +async def get_woker_status(): + return get_status() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--host", type=str, default="0.0.0.0") + parser.add_argument("--model_name_or_path", type=str, default="BridgeTower/bridgetower-large-itm-mlm-itc") + parser.add_argument("--warmup", type=int, default=1, help="Number of warmup iterations for benchmarking.") + parser.add_argument("--device", type=str, default="cpu") + parser.add_argument("--limit-model-concurrency", type=int, default=5) + + args = parser.parse_args() + # get port from env variable if exist + args.port = int(os.getenv("PORT", 8080)) + + print(f"device: {args.device}") + logger.info(f"args: {args}") + + if args.device == "hpu": + try: + import habana_frameworks.torch.core as htcore + except ImportError: + print("device: hpu is not available. 
Using cpu instead!") + args.device = "cpu" + + model_name_or_path = args.model_name_or_path + + embedder = BridgeTowerEmbedding(device=args.device) + + # warmup + print("Warmup...") + image_paths = ["https://llava-vl.github.io/static/images/view.jpg"] + example_prompts = ["This is test image!"] + images = [] + for image_path in image_paths: + images.append(PIL.Image.open(requests.get(image_path, stream=True, timeout=3000).raw)) + for i in range(args.warmup): + embedder.embed_image_text_pairs( + example_prompts, + images, + batch_size=1, + ) + print("Done warmup...") + + uvicorn.run( + app, + host=args.host, + port=args.port, + log_level="debug", + ) \ No newline at end of file diff --git a/comps/embeddings/multimodal_embeddings/bridgetower/docker/Dockerfile b/comps/embeddings/multimodal_embeddings/bridgetower/docker/Dockerfile new file mode 100644 index 000000000..83cd41ae1 --- /dev/null +++ b/comps/embeddings/multimodal_embeddings/bridgetower/docker/Dockerfile @@ -0,0 +1,25 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.10-slim +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ +USER user +# Set environment variables +ENV LANG=en_US.UTF-8 +ENV PYTHONPATH=/home/user:/usr/lib/habanalabs/:/optimum-habana + +COPY --chown=user comps /home/user/comps + +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir -r /home/user/comps/embeddings/multimodal_embeddings/multimodal_langchain/requirements.txt + +ENV PYTHONPATH=$PYTHONPATH:/home/user + +ARG EMBEDDER_PORT=8080 +ENV PORT=$EMBEDDER_PORT + +WORKDIR /home/user/comps/embeddings/multimodal_embeddings/bridgetower + +ENTRYPOINT ["python", "bridgetower_server.py", "--device", "cpu"] \ No newline at end of file diff --git a/comps/embeddings/multimodal_embeddings/bridgetower/docker/Dockerfile_hpu b/comps/embeddings/multimodal_embeddings/bridgetower/docker/Dockerfile_hpu new file mode 100644 index 000000000..e571ab253 --- /dev/null +++ b/comps/embeddings/multimodal_embeddings/bridgetower/docker/Dockerfile_hpu @@ -0,0 +1,29 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# HABANA environment +FROM vault.habana.ai/gaudi-docker/1.16.1/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest AS hpu +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +RUN rm -rf /etc/ssh/ssh_host* +USER user +# Set environment variables +ENV LANG=en_US.UTF-8 +ENV PYTHONPATH=/home/user:/usr/lib/habanalabs/:/optimum-habana + +COPY --chown=user comps /home/user/comps + +# Install requirements and optimum habana +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir -r /home/user/comps/embeddings/multimodal_embeddings/multimodal_langchain/requirements.txt && \ + pip install optimum[habana] + +ENV PYTHONPATH=$PYTHONPATH:/home/user + +ARG EMBEDDER_PORT=8080 +ENV PORT=$EMBEDDER_PORT + +WORKDIR /home/user/comps/embeddings/multimodal_embeddings/bridgetower +ENTRYPOINT ["python", "bridgetower_server.py", "--device", "hpu"] \ No newline at end of file diff --git a/comps/embeddings/multimodal_embeddings/bridgetower/docker/docker_compose_bridgetower_embedding_endpoint.yaml b/comps/embeddings/multimodal_embeddings/bridgetower/docker/docker_compose_bridgetower_embedding_endpoint.yaml new file mode 100644 index 000000000..9767490d0 --- /dev/null +++ b/comps/embeddings/multimodal_embeddings/bridgetower/docker/docker_compose_bridgetower_embedding_endpoint.yaml @@ -0,0 +1,19 
@@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+services:
+  bridgetower:
+    image: opea/bridgetower-embedder:latest
+    container_name: bridgetower-embedding-server
+    ports:
+      - ${EMBEDDER_PORT}:${EMBEDDER_PORT}
+    ipc: host
+    environment:
+      no_proxy: ${no_proxy}
+      http_proxy: ${http_proxy}
+      https_proxy: ${https_proxy}
+    restart: unless-stopped
+
+networks:
+  default:
+    driver: bridge
diff --git a/comps/embeddings/multimodal_embeddings/bridgetower/utils.py b/comps/embeddings/multimodal_embeddings/bridgetower/utils.py
new file mode 100644
index 000000000..673d54dbc
--- /dev/null
+++ b/comps/embeddings/multimodal_embeddings/bridgetower/utils.py
@@ -0,0 +1,91 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import logging
+import logging.handlers
+import os
+import sys
+
+handler = None
+save_log = True
+LOGDIR = "."
+
+
+def build_logger(logger_name, logger_filename):
+    global handler
+
+    formatter = logging.Formatter(
+        fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+    )
+
+    # Set the format of root handlers
+    if not logging.getLogger().handlers:
+        logging.basicConfig(level=logging.INFO)
+    logging.getLogger().handlers[0].setFormatter(formatter)
+
+    # Redirect stdout and stderr to loggers
+    stdout_logger = logging.getLogger("stdout")
+    stdout_logger.setLevel(logging.INFO)
+    sl = StreamToLogger(stdout_logger, logging.INFO)
+    sys.stdout = sl
+
+    stderr_logger = logging.getLogger("stderr")
+    stderr_logger.setLevel(logging.ERROR)
+    sl = StreamToLogger(stderr_logger, logging.ERROR)
+    sys.stderr = sl
+
+    # Get logger
+    logger = logging.getLogger(logger_name)
+    logger.setLevel(logging.INFO)
+
+    # Add a file handler for all loggers
+    if handler is None and save_log:
+        os.makedirs(LOGDIR, exist_ok=True)
+        filename = os.path.join(LOGDIR, logger_filename)
+        handler = logging.handlers.TimedRotatingFileHandler(filename, when="D", utc=True, encoding="UTF-8")
+        handler.setFormatter(formatter)
+
+        for name, item in logging.root.manager.loggerDict.items():
+            if isinstance(item, logging.Logger):
+                item.addHandler(handler)
+
+    return logger
+
+
+class StreamToLogger(object):
+    """Fake file-like stream object that redirects writes to a logger instance."""
+
+    def __init__(self, logger, log_level=logging.INFO):
+        self.terminal = sys.stdout
+        self.logger = logger
+        self.log_level = log_level
+        self.linebuf = ""
+
+    def __getattr__(self, attr):
+        return getattr(self.terminal, attr)
+
+    def write(self, buf):
+        temp_linebuf = self.linebuf + buf
+        self.linebuf = ""
+        for line in temp_linebuf.splitlines(True):
+            # From the io.TextIOWrapper docs:
+            #   On output, if newline is None, any '\n' characters written
+            #   are translated to the system default line separator.
+            # By default sys.stdout.write() expects '\n' newlines and then
+            # translates them so this is still cross platform.
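+            # A line that ends in "\n" is complete and is logged immediately;
+            # a trailing partial line is buffered in self.linebuf until a
+            # later write (or flush) completes it.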
+ if line[-1] == "\n": + self.logger.log(self.log_level, line.rstrip()) + else: + self.linebuf += line + + def flush(self): + if self.linebuf != "": + self.logger.log(self.log_level, self.linebuf.rstrip()) + self.linebuf = "" + + +def pretty_print_semaphore(semaphore): + if semaphore is None: + return "None" + return f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})" diff --git a/comps/embeddings/multimodal_embeddings/multimodal_langchain/__init__.py b/comps/embeddings/multimodal_embeddings/multimodal_langchain/__init__.py new file mode 100644 index 000000000..916f3a44b --- /dev/null +++ b/comps/embeddings/multimodal_embeddings/multimodal_langchain/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/embeddings/multimodal_embeddings/multimodal_langchain/docker/Dockerfile b/comps/embeddings/multimodal_embeddings/multimodal_langchain/docker/Dockerfile new file mode 100644 index 000000000..97d5906ec --- /dev/null +++ b/comps/embeddings/multimodal_embeddings/multimodal_langchain/docker/Dockerfile @@ -0,0 +1,29 @@ + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM langchain/langchain:latest + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + libgl1-mesa-glx \ + libjemalloc-dev \ + vim + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +USER user + +COPY comps /home/user/comps + +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir -r /home/user/comps/embeddings/multimodal_embeddings/multimodal_langchain/requirements.txt + +# RUN pip install --upgrade pydantic + +ENV PYTHONPATH=$PYTHONPATH:/home/user + +WORKDIR /home/user/comps/embeddings/multimodal_embeddings/multimodal_langchain + +ENTRYPOINT ["python", "mm_embedding_mmei.py"] diff --git a/comps/embeddings/multimodal_embeddings/multimodal_langchain/docker/docker_compose_multimodal_embedding.yaml b/comps/embeddings/multimodal_embeddings/multimodal_langchain/docker/docker_compose_multimodal_embedding.yaml new file mode 100644 index 000000000..314233f93 --- /dev/null +++ b/comps/embeddings/multimodal_embeddings/multimodal_langchain/docker/docker_compose_multimodal_embedding.yaml @@ -0,0 +1,21 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +services: + embedding: + image: opea/embedding-multimodal:latest + container_name: embedding-multimodal-server + ports: + - ${MM_EMBEDDING_PORT_MICROSERVICE}:${MM_EMBEDDING_PORT_MICROSERVICE} + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + MMEI_EMBEDDING_ENDPOINT: ${MMEI_EMBEDDING_ENDPOINT} + MM_EMBEDDING_PORT_MICROSERVICE: ${MM_EMBEDDING_PORT_MICROSERVICE} + restart: unless-stopped + +networks: + default: + driver: bridge diff --git a/comps/embeddings/multimodal_embeddings/multimodal_langchain/local_mm_embedding.py b/comps/embeddings/multimodal_embeddings/multimodal_langchain/local_mm_embedding.py new file mode 100644 index 000000000..0679a1609 --- /dev/null +++ b/comps/embeddings/multimodal_embeddings/multimodal_langchain/local_mm_embedding.py @@ -0,0 +1,61 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +from langsmith import traceable # type: ignore + +from comps import ( + CustomLogger, + EmbedDoc, + EmbedMultimodalDoc, + MultimodalDoc, + ServiceType, + TextDoc, + TextImageDoc, + opea_microservices, + register_microservice, +) 
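+
+# BridgeTowerEmbedding (added in this PR) runs the BridgeTower model in-process,
+# so this variant needs no separate MMEI endpoint.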
+from comps.embeddings.multimodal_embeddings.bridgetower import BridgeTowerEmbedding + +logger = CustomLogger("local_multimodal_embedding") +logflag = os.getenv("LOGFLAG", False) + +port = int(os.getenv("MM_EMBEDDING_PORT_MICROSERVICE", 6600)) + + +@register_microservice( + name="opea_service@local_multimodal_embedding", + service_type=ServiceType.EMBEDDING, + endpoint="/v1/embeddings", + host="0.0.0.0", + port=port, + input_datatype=MultimodalDoc, + output_datatype=EmbedMultimodalDoc, +) +@traceable(run_type="embedding") +def embedding(input: MultimodalDoc) -> EmbedDoc: + if logflag: + logger.info(input) + + if isinstance(input, TextDoc): + # Handle text input + embed_vector = embeddings.embed_query(input.text) + res = EmbedDoc(text=input.text, embedding=embed_vector) + + elif isinstance(input, TextImageDoc): + # Handle text + image input + pil_image = input.image.url.load_pil() + embed_vector = embeddings.embed_image_text_pairs([input.text.text], [pil_image], batch_size=1)[0] + res = EmbedMultimodalDoc(text=input.text.text, url=input.image.url, embedding=embed_vector) + else: + raise ValueError("Invalid input type") + + if logflag: + logger.info(res) + return res + + +if __name__ == "__main__": + embeddings = BridgeTowerEmbedding() + opea_microservices["opea_service@local_multimodal_embedding"].start() diff --git a/comps/embeddings/multimodal_embeddings/multimodal_langchain/mm_embedding_mmei.py b/comps/embeddings/multimodal_embeddings/multimodal_langchain/mm_embedding_mmei.py new file mode 100644 index 000000000..fbd972a20 --- /dev/null +++ b/comps/embeddings/multimodal_embeddings/multimodal_langchain/mm_embedding_mmei.py @@ -0,0 +1,84 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import base64 +import os +import time + +import requests +from fastapi.responses import JSONResponse + +from comps import ( + CustomLogger, + EmbedDoc, + EmbedMultimodalDoc, + MultimodalDoc, + ServiceType, + TextDoc, + TextImageDoc, + opea_microservices, + register_microservice, + register_statistics, + statistics_dict, +) + +logger = CustomLogger("multimodal_embedding_mmei_langchain") +logflag = os.getenv("LOGFLAG", False) +port = int(os.getenv("MM_EMBEDDING_PORT_MICROSERVICE", 6600)) +headers = {"Content-Type": "application/json"} + + +@register_microservice( + name="opea_service@multimodal_embedding_mmei_langchain", + service_type=ServiceType.EMBEDDING, + endpoint="/v1/embeddings", + host="0.0.0.0", + port=port, + input_datatype=MultimodalDoc, + output_datatype=EmbedMultimodalDoc, +) +@register_statistics(names=["opea_service@multimodal_embedding_mmei_langchain"]) +def embedding(input: MultimodalDoc) -> EmbedDoc: + start = time.time() + if logflag: + logger.info(input) + + json = {} + if isinstance(input, TextDoc): + json["text"] = input.text + elif isinstance(input, TextImageDoc): + json["text"] = input.text.text + img_bytes = input.image.url.load_bytes() + base64_img = base64.b64encode(img_bytes).decode("utf-8") + json["img_b64_str"] = base64_img + else: + return JSONResponse(status_code=400, content={"message": "Bad request!"}) + + # call multimodal embedding endpoint + try: + response = requests.post(mmei_embedding_endpoint, headers=headers, json=json) + if response.status_code != 200: + return JSONResponse(status_code=503, content={"message": "Multimodal embedding endpoint failed!"}) + + response_json = response.json() + embed_vector = response_json["embedding"] + if isinstance(input, TextDoc): + res = EmbedDoc(text=input.text, embedding=embed_vector) + elif 
isinstance(input, TextImageDoc): + res = EmbedMultimodalDoc(text=input.text.text, url=input.image.url, embedding=embed_vector) + except requests.exceptions.ConnectionError: + res = JSONResponse(status_code=503, content={"message": "Multimodal embedding endpoint not started!"}) + statistics_dict["opea_service@multimodal_embedding_mmei_langchain"].append_latency(time.time() - start, None) + if logflag: + logger.info(res) + return res + + +if __name__ == "__main__": + url_endpoint = os.getenv("MMEI_EMBEDDING_HOST_ENDPOINT", "http://0.0.0.0") + port_endpoint = os.getenv("MMEI_EMBEDDING_PORT_ENDPOINT", "8080") + path_endpoint = os.getenv("MMEI_EMBEDDING_PATH_ENDPOINT", "/v1/encode") + + mmei_embedding_endpoint = os.getenv("MMEI_EMBEDDING_ENDPOINT", f"{url_endpoint}:{port_endpoint}{path_endpoint}") + logger.info(f"MMEI Gaudi Embedding initialized at {mmei_embedding_endpoint}") + opea_microservices["opea_service@multimodal_embedding_mmei_langchain"].start() diff --git a/comps/embeddings/multimodal_embeddings/multimodal_langchain/requirements.txt b/comps/embeddings/multimodal_embeddings/multimodal_langchain/requirements.txt new file mode 100644 index 000000000..74a177f9a --- /dev/null +++ b/comps/embeddings/multimodal_embeddings/multimodal_langchain/requirements.txt @@ -0,0 +1,15 @@ +docarray[full] +fastapi +huggingface_hub +langchain +langsmith +opentelemetry-api +opentelemetry-exporter-otlp +opentelemetry-sdk +prometheus-fastapi-instrumentator +pydantic==2.8.2 +shortuuid +torch +torchvision +transformers +uvicorn diff --git a/tests/test_multimodal_embeddings_langchain_cpu.sh b/tests/test_multimodal_embeddings_langchain_cpu.sh new file mode 100644 index 000000000..6e1294223 --- /dev/null +++ b/tests/test_multimodal_embeddings_langchain_cpu.sh @@ -0,0 +1,111 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -x + +WORKPATH=$(dirname "$PWD") +ip_address=$(hostname -I | awk '{print $1}') +export your_mmei_port=8089 +export EMBEDDER_PORT=$your_mmei_port +export MMEI_EMBEDDING_ENDPOINT="http://$ip_address:$your_mmei_port/v1/encode" +export your_embedding_port_microservice=6609 +export MM_EMBEDDING_PORT_MICROSERVICE=$your_embedding_port_microservice +unset http_proxy + +function build_mmei_docker_images() { + cd $WORKPATH + echo $(pwd) + docker build --no-cache -t opea/bridgetower-embedder:latest --build-arg EMBEDDER_PORT=$EMBEDDER_PORT --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/embeddings/multimodal_embeddings/bridgetower/docker/Dockerfile . + + if [ $? -ne 0 ]; then + echo "opea/bridgetower-embedder built fail" + exit 1 + else + echo "opea/bridgetower-embedder built successful" + fi +} + +function build_embedding_service_images() { + cd $WORKPATH + echo $(pwd) + docker build --no-cache -t opea/embedding-multimodal:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/embeddings/multimodal_embeddings/multimodal_langchain/docker/Dockerfile . + + if [ $? 
-ne 0 ]; then + echo "opea/embedding-multimodal built fail" + exit 1 + else + echo "opea/embedding-multimodal built successful" + fi +} + +function build_docker_images() { + build_mmei_docker_images + build_embedding_service_images +} + +function start_service() { + cd $WORKPATH + cd comps/embeddings/multimodal_embeddings/bridgetower/docker/ + docker compose -f docker_compose_bridgetower_embedding_endpoint.yaml up -d + cd $WORKPATH + cd comps/embeddings/multimodal_embeddings/multimodal_langchain/docker/ + docker compose -f docker_compose_multimodal_embedding.yaml up -d + sleep 2m +} +function validate_microservice_text_embedding() { + result=$(http_proxy="" curl http://${ip_address}:$MM_EMBEDDING_PORT_MICROSERVICE/v1/embeddings \ + -X POST \ + -H "Content-Type: application/json" \ + -d '{"text" : "This is some sample text."}') + + if [[ $result == *"embedding"* ]]; then + echo "Result correct." + else + echo "Result wrong. Received was $result" + docker logs bridgetower-embedding-server + docker logs embedding-multimodal-server + exit 1 + fi +} + +function validate_microservice_image_text_pair_embedding() { + result=$(http_proxy="" curl http://${ip_address}:$MM_EMBEDDING_PORT_MICROSERVICE/v1/embeddings \ + -X POST \ + -H "Content-Type: application/json" \ + -d '{"text": {"text" : "This is some sample text."}, "image" : {"url": "https://github.com/docarray/docarray/blob/main/tests/toydata/image-data/apple.png?raw=true"}}') + + if [[ $result == *"embedding"* ]]; then + echo "Result correct." + else + echo "Result wrong. Received was $result" + docker logs bridgetower-embedding-server + docker logs embedding-multimodal-server + exit 1 + fi +} + +function validate_microservice() { + validate_microservice_text_embedding + validate_microservice_image_text_pair_embedding +} + +function stop_docker() { + cid=$(docker ps -aq --filter "name=bridgetower-embedding-server" --filter "name=embedding-multimodal-server") + if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservice + + stop_docker + # echo y | docker system prune +} + +main diff --git a/tests/test_multimodal_embeddings_langchain_hpu.sh b/tests/test_multimodal_embeddings_langchain_hpu.sh new file mode 100644 index 000000000..895a75068 --- /dev/null +++ b/tests/test_multimodal_embeddings_langchain_hpu.sh @@ -0,0 +1,111 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -x + +WORKPATH=$(dirname "$PWD") +ip_address=$(hostname -I | awk '{print $1}') +export your_mmei_port=8089 +export EMBEDDER_PORT=$your_mmei_port +export MMEI_EMBEDDING_ENDPOINT="http://$ip_address:$your_mmei_port/v1/encode" +export your_embedding_port_microservice=6609 +export MM_EMBEDDING_PORT_MICROSERVICE=$your_embedding_port_microservice +unset http_proxy + +function build_mmei_docker_images() { + cd $WORKPATH + echo $(pwd) + docker build --no-cache -t opea/bridgetower-embedder:latest --build-arg EMBEDDER_PORT=$EMBEDDER_PORT --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/embeddings/multimodal_embeddings/bridgetower/docker/Dockerfile_hpu . + + if [ $? 
-ne 0 ]; then + echo "opea/bridgetower-embedder built fail" + exit 1 + else + echo "opea/bridgetower-embedder built successful" + fi +} + +function build_embedding_service_images() { + cd $WORKPATH + echo $(pwd) + docker build --no-cache -t opea/embedding-multimodal:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/embeddings/multimodal_embeddings/multimodal_langchain/docker/Dockerfile . + + if [ $? -ne 0 ]; then + echo "opea/embedding-multimodal built fail" + exit 1 + else + echo "opea/embedding-multimodal built successful" + fi +} + +function build_docker_images() { + build_mmei_docker_images + build_embedding_service_images +} + +function start_service() { + cd $WORKPATH + cd comps/embeddings/multimodal_embeddings/bridgetower/docker/ + docker compose -f docker_compose_bridgetower_embedding_endpoint.yaml up -d + cd $WORKPATH + cd comps/embeddings/multimodal_embeddings/multimodal_langchain/docker/ + docker compose -f docker_compose_multimodal_embedding.yaml up -d + sleep 2m +} +function validate_microservice_text_embedding() { + result=$(http_proxy="" curl http://${ip_address}:$MM_EMBEDDING_PORT_MICROSERVICE/v1/embeddings \ + -X POST \ + -H "Content-Type: application/json" \ + -d '{"text" : "This is some sample text."}') + + if [[ $result == *"embedding"* ]]; then + echo "Result correct." + else + echo "Result wrong. Received was $result" + docker logs bridgetower-embedding-server + docker logs embedding-multimodal-server + exit 1 + fi +} + +function validate_microservice_image_text_pair_embedding() { + result=$(http_proxy="" curl http://${ip_address}:$MM_EMBEDDING_PORT_MICROSERVICE/v1/embeddings \ + -X POST \ + -H "Content-Type: application/json" \ + -d '{"text": {"text" : "This is some sample text."}, "image" : {"url": "https://github.com/docarray/docarray/blob/main/tests/toydata/image-data/apple.png?raw=true"}}') + + if [[ $result == *"embedding"* ]]; then + echo "Result correct." + else + echo "Result wrong. Received was $result" + docker logs bridgetower-embedding-server + docker logs embedding-multimodal-server + exit 1 + fi +} + +function validate_microservice() { + validate_microservice_text_embedding + validate_microservice_image_text_pair_embedding +} + +function stop_docker() { + cid=$(docker ps -aq --filter "name=bridgetower-embedding-server" --filter "name=embedding-multimodal-server") + if [[ ! 
-z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservice + + stop_docker + # echo y | docker system prune +} + +main From 5702683115efbc427f07ce13925a93a1afb5556d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 22 Aug 2024 22:22:04 +0000 Subject: [PATCH 2/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/cores/proto/docarray.py | 3 ++- comps/embeddings/multimodal_embeddings/README.md | 2 +- comps/embeddings/multimodal_embeddings/__init__.py | 2 +- .../embeddings/multimodal_embeddings/bridgetower/__init__.py | 2 +- .../multimodal_embeddings/bridgetower/bridgetower_custom.py | 2 +- .../bridgetower/bridgetower_embedding.py | 5 ++++- .../multimodal_embeddings/bridgetower/bridgetower_server.py | 2 +- 7 files changed, 11 insertions(+), 7 deletions(-) diff --git a/comps/cores/proto/docarray.py b/comps/cores/proto/docarray.py index 4d2c427ae..aa4caf179 100644 --- a/comps/cores/proto/docarray.py +++ b/comps/cores/proto/docarray.py @@ -76,7 +76,8 @@ class EmbedMultimodalDoc(EmbedDoc): description="The base64-based encoding of the image.", default=None, ) - + + class Audio2TextDoc(AudioDoc): url: Optional[AudioUrl] = Field( description="The path to the audio.", diff --git a/comps/embeddings/multimodal_embeddings/README.md b/comps/embeddings/multimodal_embeddings/README.md index dd926220e..c2cf2b875 100644 --- a/comps/embeddings/multimodal_embeddings/README.md +++ b/comps/embeddings/multimodal_embeddings/README.md @@ -182,4 +182,4 @@ curl -X POST http://0.0.0.0:6600/v1/embeddings \ curl -X POST http://0.0.0.0:6600/v1/embeddings \ -H "Content-Type: application/json" \ -d '{"text" : "This is some sample text."}' -``` \ No newline at end of file +``` diff --git a/comps/embeddings/multimodal_embeddings/__init__.py b/comps/embeddings/multimodal_embeddings/__init__.py index 4582b4f9a..916f3a44b 100644 --- a/comps/embeddings/multimodal_embeddings/__init__.py +++ b/comps/embeddings/multimodal_embeddings/__init__.py @@ -1,2 +1,2 @@ # Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 \ No newline at end of file +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/embeddings/multimodal_embeddings/bridgetower/__init__.py b/comps/embeddings/multimodal_embeddings/bridgetower/__init__.py index e5d90e19c..e64366189 100644 --- a/comps/embeddings/multimodal_embeddings/bridgetower/__init__.py +++ b/comps/embeddings/multimodal_embeddings/bridgetower/__init__.py @@ -2,4 +2,4 @@ # SPDX-License-Identifier: Apache-2.0 from .bridgetower_embedding import BridgeTowerEmbedding -from .bridgetower_custom import BridgeTowerTextFeatureExtractor, BridgeTowerForITC \ No newline at end of file +from .bridgetower_custom import BridgeTowerTextFeatureExtractor, BridgeTowerForITC diff --git a/comps/embeddings/multimodal_embeddings/bridgetower/bridgetower_custom.py b/comps/embeddings/multimodal_embeddings/bridgetower/bridgetower_custom.py index a9260a7b1..0a89c3fa9 100644 --- a/comps/embeddings/multimodal_embeddings/bridgetower/bridgetower_custom.py +++ b/comps/embeddings/multimodal_embeddings/bridgetower/bridgetower_custom.py @@ -240,4 +240,4 @@ def forward( logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, - ) \ No newline at end of file + ) diff --git 
a/comps/embeddings/multimodal_embeddings/bridgetower/bridgetower_embedding.py b/comps/embeddings/multimodal_embeddings/bridgetower/bridgetower_embedding.py index 99f9fb6dd..096c5501d 100644 --- a/comps/embeddings/multimodal_embeddings/bridgetower/bridgetower_embedding.py +++ b/comps/embeddings/multimodal_embeddings/bridgetower/bridgetower_embedding.py @@ -50,6 +50,7 @@ class Config: def embed_documents(self, texts: List[str]) -> List[List[float]]: """Embed a list of documents using BridgeTower. + Args: texts: The list of texts to embed. Returns: @@ -63,6 +64,7 @@ def embed_documents(self, texts: List[str]) -> List[List[float]]: def embed_query(self, text: str) -> List[float]: """Embed a query using BridgeTower. + Args: text: The text to embed. Returns: @@ -72,6 +74,7 @@ def embed_query(self, text: str) -> List[float]: def embed_image_text_pairs(self, texts: List[str], images: list[Image], batch_size=2) -> List[List[float]]: # type: ignore """Embed a list of image-text pairs using BridgeTower. + Args: texts: The list of texts to embed. images: The list of path-to-images to embed @@ -116,4 +119,4 @@ def embed_image_text_pairs(self, texts: List[str], images: list[Image], batch_si embeddings.append(batch_embeddings.logits[i, 2, :].detach().cpu().numpy().tolist()) image_list = [] text_list = [] - return embeddings \ No newline at end of file + return embeddings diff --git a/comps/embeddings/multimodal_embeddings/bridgetower/bridgetower_server.py b/comps/embeddings/multimodal_embeddings/bridgetower/bridgetower_server.py index 200773203..62e70c74f 100644 --- a/comps/embeddings/multimodal_embeddings/bridgetower/bridgetower_server.py +++ b/comps/embeddings/multimodal_embeddings/bridgetower/bridgetower_server.py @@ -150,4 +150,4 @@ async def get_woker_status(): host=args.host, port=args.port, log_level="debug", - ) \ No newline at end of file + ) From 893132b7e4f523544756ebfeccfc51d3cfe69dce Mon Sep 17 00:00:00 2001 From: Tiep Le Date: Thu, 29 Aug 2024 02:59:57 +0000 Subject: [PATCH 3/4] remove langsmith Signed-off-by: Tiep Le --- .../multimodal_langchain/local_mm_embedding.py | 3 --- .../multimodal_langchain/requirements.txt | 1 - 2 files changed, 4 deletions(-) diff --git a/comps/embeddings/multimodal_embeddings/multimodal_langchain/local_mm_embedding.py b/comps/embeddings/multimodal_embeddings/multimodal_langchain/local_mm_embedding.py index 0679a1609..7327284a8 100644 --- a/comps/embeddings/multimodal_embeddings/multimodal_langchain/local_mm_embedding.py +++ b/comps/embeddings/multimodal_embeddings/multimodal_langchain/local_mm_embedding.py @@ -3,8 +3,6 @@ import os -from langsmith import traceable # type: ignore - from comps import ( CustomLogger, EmbedDoc, @@ -33,7 +31,6 @@ input_datatype=MultimodalDoc, output_datatype=EmbedMultimodalDoc, ) -@traceable(run_type="embedding") def embedding(input: MultimodalDoc) -> EmbedDoc: if logflag: logger.info(input) diff --git a/comps/embeddings/multimodal_embeddings/multimodal_langchain/requirements.txt b/comps/embeddings/multimodal_embeddings/multimodal_langchain/requirements.txt index 74a177f9a..cc9d77a43 100644 --- a/comps/embeddings/multimodal_embeddings/multimodal_langchain/requirements.txt +++ b/comps/embeddings/multimodal_embeddings/multimodal_langchain/requirements.txt @@ -2,7 +2,6 @@ docarray[full] fastapi huggingface_hub langchain -langsmith opentelemetry-api opentelemetry-exporter-otlp opentelemetry-sdk From ae47db638587f7f6be0ceb621cc74e31c8a4c343 Mon Sep 17 00:00:00 2001 From: Tiep Le Date: Thu, 29 Aug 2024 06:21:07 +0000 Subject: [PATCH 4/4] 
update the error message per PR reviewer Signed-off-by: Tiep Le --- .../multimodal_embeddings/bridgetower/bridgetower_embedding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/embeddings/multimodal_embeddings/bridgetower/bridgetower_embedding.py b/comps/embeddings/multimodal_embeddings/bridgetower/bridgetower_embedding.py index 096c5501d..f61d8e1c3 100644 --- a/comps/embeddings/multimodal_embeddings/bridgetower/bridgetower_embedding.py +++ b/comps/embeddings/multimodal_embeddings/bridgetower/bridgetower_embedding.py @@ -84,7 +84,7 @@ def embed_image_text_pairs(self, texts: List[str], images: list[Image], batch_si """ # the length of texts must be equal to the length of images - assert len(texts) == len(images), "the len of captions should be equal to the len of images" + assert len(texts) == len(images), "the number of captions should be equal to the number of images" image_list = [] text_list = []