forked from opea-project/GenAIComps
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Qdrant retriever microservice (opea-project#216)
* PGvector service (opea-project#86) * Support PGvector service Signed-off-by: V, Ganesan <[email protected]> Signed-off-by: gadmarkovits <[email protected]> Signed-off-by: Daniel Whitenack <[email protected]>
- Loading branch information
1 parent
ee6480b
commit c65094f
Showing
8 changed files
with
366 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
# Retriever Microservice with Qdrant | ||
|
||
# 🚀Start Microservice with Python | ||
|
||
## Install Requirements | ||
|
||
```bash | ||
pip install -r requirements.txt | ||
``` | ||
|
||
## Start Qdrant Server | ||
|
||
Please refer to this [readme](../../../vectorstores/langchain/qdrant/README.md). | ||
|
||
## Setup Environment Variables | ||
|
||
```bash | ||
export http_proxy=${your_http_proxy} | ||
export https_proxy=${your_https_proxy} | ||
export QDRANT_HOST=${your_qdrant_host_ip} | ||
export QDRANT_PORT=6333 | ||
export EMBED_DIMENSION=${your_embedding_dimension} | ||
export INDEX_NAME=${your_index_name} | ||
export TEI_EMBEDDING_ENDPOINT=${your_tei_endpoint} | ||
``` | ||
|
||
## Start Retriever Service | ||
|
||
```bash | ||
export TEI_EMBEDDING_ENDPOINT="http://${your_ip}:6060" | ||
python haystack/qdrant/retriever_qdrant.py | ||
``` | ||
|
||
# 🚀Start Microservice with Docker | ||
|
||
## Build Docker Image | ||
|
||
```bash | ||
cd ../../ | ||
docker build -t opea/retriever-qdrant:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/haystack/qdrant/docker/Dockerfile . | ||
``` | ||
|
||
## Run Docker with CLI | ||
|
||
```bash | ||
docker run -d --name="retriever-qdrant-server" -p 7000:7000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e TEI_EMBEDDING_ENDPOINT=${your_tei_endpoint} -e QDRANT_HOST=${your_qdrant_host_ip} -e QDRANT_PORT=${your_qdrant_port} opea/retriever-qdrant:latest | ||
``` | ||
|
||
# 🚀3. Consume Retriever Service | ||
|
||
## 3.1 Check Service Status | ||
|
||
```bash | ||
curl http://${your_ip}:7000/v1/health_check \ | ||
-X GET \ | ||
-H 'Content-Type: application/json' | ||
``` | ||
|
||
## 3.2 Consume Embedding Service | ||
|
||
To consume the Retriever Microservice, you can generate a mock embedding vector of length 768 with Python. | ||
|
||
```bash | ||
your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") | ||
curl http://${your_ip}:7000/v1/retrieval \ | ||
-X POST \ | ||
-d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding}}" \ | ||
-H 'Content-Type: application/json' | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# Copyright (C) 2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
|
||
# Copyright (C) 2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
FROM python:3.11-slim | ||
|
||
RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ | ||
libgl1-mesa-glx \ | ||
libjemalloc-dev \ | ||
vim | ||
|
||
RUN useradd -m -s /bin/bash user && \ | ||
mkdir -p /home/user && \ | ||
chown -R user /home/user/ | ||
|
||
USER user | ||
|
||
COPY comps /home/user/comps | ||
|
||
RUN python -m pip install --no-cache-dir --upgrade pip && \ | ||
python -m pip install --no-cache-dir -r /home/user/comps/retrievers/haystack/qdrant/requirements.txt | ||
|
||
ENV PYTHONPATH=$PYTHONPATH:/home/user | ||
|
||
WORKDIR /home/user/comps/retrievers/haystack/qdrant | ||
|
||
ENTRYPOINT ["python", "retriever_qdrant.py"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
#!/usr/bin/env python | ||
# -*- coding: utf-8 -*- | ||
# Copyright (C) 2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
# | ||
|
||
import argparse | ||
import io | ||
import os | ||
import uuid | ||
|
||
import numpy as np | ||
from haystack.components.embedders import HuggingFaceTEIDocumentEmbedder, SentenceTransformersDocumentEmbedder | ||
from haystack.dataclasses.document import Document | ||
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore | ||
from langchain.text_splitter import RecursiveCharacterTextSplitter | ||
from PIL import Image | ||
from qdrant_config import EMBED_DIMENSION, EMBED_ENDPOINT, EMBED_MODEL, INDEX_NAME, QDRANT_HOST, QDRANT_PORT | ||
|
||
|
||
def pdf_loader(file_path): | ||
try: | ||
import easyocr | ||
import fitz | ||
except ImportError: | ||
raise ImportError( | ||
"`PyMuPDF` or 'easyocr' package is not found, please install it with " | ||
"`pip install pymupdf or pip install easyocr.`" | ||
) | ||
|
||
doc = fitz.open(file_path) | ||
reader = easyocr.Reader(["en"]) | ||
result = "" | ||
for i in range(doc.page_count): | ||
page = doc.load_page(i) | ||
pagetext = page.get_text().strip() | ||
if pagetext: | ||
result = result + pagetext | ||
if len(doc.get_page_images(i)) > 0: | ||
for img in doc.get_page_images(i): | ||
if img: | ||
pageimg = "" | ||
xref = img[0] | ||
img_data = doc.extract_image(xref) | ||
img_bytes = img_data["image"] | ||
pil_image = Image.open(io.BytesIO(img_bytes)) | ||
img = np.array(pil_image) | ||
img_result = reader.readtext(img, paragraph=True, detail=0) | ||
pageimg = pageimg + ", ".join(img_result).strip() | ||
if pageimg.endswith("!") or pageimg.endswith("?") or pageimg.endswith("."): | ||
pass | ||
else: | ||
pageimg = pageimg + "." | ||
result = result + pageimg | ||
return result | ||
|
||
|
||
def ingest_documents(folder_path, tag): | ||
"""Ingest PDF to Qdrant from the a given path.""" | ||
# Load list of pdfs | ||
doc_path = [os.path.join(folder_path, file) for file in os.listdir(folder_path)][0] | ||
|
||
print("Parsing...", doc_path) | ||
|
||
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100, add_start_index=True) | ||
content = pdf_loader(doc_path) | ||
chunks = text_splitter.split_text(content) | ||
|
||
print("Done preprocessing. Created ", len(chunks), " chunks of the original pdf") | ||
# Create vectorstore | ||
if EMBED_ENDPOINT: | ||
# create embeddings using TEI endpoint service | ||
embedder = HuggingFaceTEIDocumentEmbedder(url=EMBED_ENDPOINT) | ||
else: | ||
# create embeddings using local embedding model | ||
embedder = SentenceTransformersDocumentEmbedder(model=EMBED_MODEL) | ||
embedder.warm_up() | ||
|
||
# Initialize Qdrant store | ||
qdrant_store = QdrantDocumentStore( | ||
host=QDRANT_HOST, | ||
port=QDRANT_PORT, | ||
embedding_dim=EMBED_DIMENSION, | ||
index=INDEX_NAME, | ||
embedding_field="embedding", | ||
similarity="cosine", | ||
recreate_index=True, | ||
) | ||
|
||
# Batch size | ||
batch_size = 32 | ||
num_chunks = len(chunks) | ||
for i in range(0, num_chunks, batch_size): | ||
batch_chunks = chunks[i : i + batch_size] | ||
batch_texts = [f"Tag: {tag}. " + chunk for chunk in batch_chunks] | ||
documents = [Document(id=str(uuid.uuid4()), content=content) for content in batch_texts] | ||
documents_with_embeddings = embedder.run(documents)["documents"] | ||
qdrant_store.write_documents(documents_with_embeddings) | ||
|
||
print(f"Processed batch {i//batch_size + 1}/{(num_chunks-1)//batch_size + 1}") | ||
|
||
|
||
if __name__ == "__main__": | ||
parser = argparse.ArgumentParser(description="Ingest documents from a specified folder with a tag") | ||
parser.add_argument("folder_path", type=str, help="Path to the folder containing documents") | ||
parser.add_argument("--tag", type=str, default="", help="Tag to be used as an identifier") | ||
|
||
args = parser.parse_args() | ||
ingest_documents(args.folder_path, args.tag) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
# Copyright (C) 2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
import os | ||
|
||
# Embedding model | ||
EMBED_MODEL = os.getenv("EMBED_MODEL", "BAAI/bge-base-en-v1.5") | ||
|
||
# Embedding dimension | ||
EMBED_DIMENSION = os.getenv("EMBED_DIMENSION", 768) | ||
|
||
# Embedding endpoints | ||
EMBED_ENDPOINT = os.getenv("TEI_EMBEDDING_ENDPOINT", "") | ||
|
||
# Qdrant Connection Information | ||
QDRANT_HOST = os.getenv("QDRANT_HOST", "localhost") | ||
QDRANT_PORT = int(os.getenv("QDRANT_PORT", 6333)) | ||
|
||
# Vector Index Configuration | ||
INDEX_NAME = os.getenv("INDEX_NAME", "rag-qdrant") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
docarray[full] | ||
easyocr | ||
fastapi | ||
haystack-ai | ||
langchain_community | ||
langsmith | ||
opentelemetry-api | ||
opentelemetry-exporter-otlp | ||
opentelemetry-sdk | ||
pymupdf | ||
qdrant-haystack | ||
sentence_transformers | ||
shortuuid |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
# Copyright (C) 2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
from haystack.components.embedders import HuggingFaceTEITextEmbedder, SentenceTransformersTextEmbedder | ||
from haystack_integrations.components.retrievers.qdrant import QdrantEmbeddingRetriever | ||
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore | ||
from langsmith import traceable | ||
from qdrant_config import EMBED_DIMENSION, EMBED_ENDPOINT, EMBED_MODEL, INDEX_NAME, QDRANT_HOST, QDRANT_PORT | ||
|
||
from comps import EmbedDoc768, SearchedDoc, ServiceType, TextDoc, opea_microservices, register_microservice | ||
|
||
|
||
# Create a pipeline for querying a Qdrant document store | ||
def initialize_qdrant_retriever() -> QdrantEmbeddingRetriever: | ||
qdrant_store = QdrantDocumentStore( | ||
host=QDRANT_HOST, port=QDRANT_PORT, embedding_dim=EMBED_DIMENSION, index=INDEX_NAME, recreate_index=False | ||
) | ||
|
||
retriever = QdrantEmbeddingRetriever(document_store=qdrant_store) | ||
|
||
return retriever | ||
|
||
|
||
@register_microservice( | ||
name="opea_service@retriever_qdrant", | ||
service_type=ServiceType.RETRIEVER, | ||
endpoint="/v1/retrieval", | ||
host="0.0.0.0", | ||
port=7000, | ||
) | ||
@traceable(run_type="retriever") | ||
def retrieve(input: EmbedDoc768) -> SearchedDoc: | ||
search_res = retriever.run(query_embedding=input.embedding)["documents"] | ||
searched_docs = [TextDoc(text=r.content) for r in search_res] | ||
result = SearchedDoc(retrieved_docs=searched_docs, initial_query=input.text) | ||
return result | ||
|
||
|
||
if __name__ == "__main__": | ||
if EMBED_ENDPOINT: | ||
# create embeddings using TEI endpoint service | ||
embedder = HuggingFaceTEITextEmbedder(url=EMBED_ENDPOINT) | ||
else: | ||
# create embeddings using local embedding model | ||
embedder = SentenceTransformersTextEmbedder(model=EMBED_MODEL) | ||
embedder.warm_up() | ||
|
||
retriever = initialize_qdrant_retriever() | ||
opea_microservices["opea_service@retriever_qdrant"].start() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
#!/bin/bash | ||
# Copyright (C) 2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
set -xe | ||
|
||
WORKPATH=$(dirname "$PWD") | ||
ip_address=$(hostname -I | awk '{print $1}') | ||
function build_docker_images() { | ||
cd $WORKPATH | ||
docker build --no-cache -t opea/retriever-qdrant:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/haystack/qdrant/docker/Dockerfile . | ||
} | ||
|
||
function start_service() { | ||
# qdrant | ||
docker run -d --name test-qdrant-vector-db -p 5010:6333 -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy qdrant/qdrant | ||
sleep 10s | ||
|
||
# tei endpoint | ||
tei_endpoint=5008 | ||
model="BAAI/bge-base-en-v1.5" | ||
docker run -d --name="test-comps-retriever-tei-endpoint" -p $tei_endpoint:80 -v ./data:/data --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.2 --model-id $model | ||
sleep 30s | ||
export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:${tei_endpoint}" | ||
|
||
# qdrant retriever | ||
export QDRANT_HOST="${ip_address}" | ||
export QDRANT_PORT=5010 | ||
export INDEX_NAME="rag-qdrant" | ||
retriever_port=5009 | ||
unset http_proxy | ||
docker run -d --name="test-comps-retriever-qdrant-server" -p ${retriever_port}:7000 --ipc=host -e TEI_EMBEDDING_ENDPOINT=$TEI_EMBEDDING_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e QDRANT_HOST=$QDRANT_HOST -e QDRANT_PORT=$QDRANT_PORT -e INDEX_NAME=$INDEX_NAME opea/retriever-qdrant:comps | ||
|
||
sleep 3m | ||
} | ||
|
||
function validate_microservice() { | ||
retriever_port=5009 | ||
export PATH="${HOME}/miniforge3/bin:$PATH" | ||
source activate | ||
test_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") | ||
http_proxy='' curl http://${ip_address}:$retriever_port/v1/retrieval \ | ||
-X POST \ | ||
-d "{\"text\":\"test\",\"embedding\":${test_embedding}}" \ | ||
-H 'Content-Type: application/json' | ||
docker logs test-comps-retriever-qdrant-server | ||
docker logs test-comps-retriever-tei-endpoint | ||
} | ||
|
||
function stop_docker() { | ||
cid_retrievers=$(docker ps -aq --filter "name=test-comps-retrievers*") | ||
if [[ ! -z "$cid_retrievers" ]]; then | ||
docker stop $cid_retrievers && docker rm $cid_retrievers && sleep 1s | ||
fi | ||
|
||
cid_qdrant=$(docker ps -aq --filter "name=test-qdrant-vector-db") | ||
if [[ ! -z "$cid_qdrant" ]]; then | ||
docker stop $cid_qdrant && docker rm $cid_qdrant && sleep 1s | ||
fi | ||
} | ||
|
||
function main() { | ||
|
||
stop_docker | ||
|
||
build_docker_images | ||
start_service | ||
|
||
validate_microservice | ||
|
||
stop_docker | ||
echo y | docker system prune | ||
|
||
} | ||
|
||
main |