From 9b658f4f8b83575c9acc8c9f4f24db2c0a5bf52f Mon Sep 17 00:00:00 2001 From: gadmarkovits Date: Tue, 9 Jul 2024 07:52:40 +0300 Subject: [PATCH] Qdrant retriever microservice (#216) * PGvector service (#86) * Support PGvector service Signed-off-by: V, Ganesan Signed-off-by: gadmarkovits --- comps/retrievers/haystack/qdrant/README.md | 69 +++++++++++ comps/retrievers/haystack/qdrant/__init__.py | 2 + .../haystack/qdrant/docker/Dockerfile | 27 +++++ comps/retrievers/haystack/qdrant/ingest.py | 110 ++++++++++++++++++ .../haystack/qdrant/qdrant_config.py | 20 ++++ .../haystack/qdrant/requirements.txt | 13 +++ .../haystack/qdrant/retriever_qdrant.py | 49 ++++++++ tests/test_retrievers_haystack_qdrant.sh | 76 ++++++++++++ 8 files changed, 366 insertions(+) create mode 100644 comps/retrievers/haystack/qdrant/README.md create mode 100644 comps/retrievers/haystack/qdrant/__init__.py create mode 100644 comps/retrievers/haystack/qdrant/docker/Dockerfile create mode 100644 comps/retrievers/haystack/qdrant/ingest.py create mode 100644 comps/retrievers/haystack/qdrant/qdrant_config.py create mode 100644 comps/retrievers/haystack/qdrant/requirements.txt create mode 100644 comps/retrievers/haystack/qdrant/retriever_qdrant.py create mode 100644 tests/test_retrievers_haystack_qdrant.sh diff --git a/comps/retrievers/haystack/qdrant/README.md b/comps/retrievers/haystack/qdrant/README.md new file mode 100644 index 000000000..70d2845ed --- /dev/null +++ b/comps/retrievers/haystack/qdrant/README.md @@ -0,0 +1,69 @@ +# Retriever Microservice with Qdrant + +# 🚀Start Microservice with Python + +## Install Requirements + +```bash +pip install -r requirements.txt +``` + +## Start Qdrant Server + +Please refer to this [readme](../../../vectorstores/langchain/qdrant/README.md). + +## Setup Environment Variables + +```bash +export http_proxy=${your_http_proxy} +export https_proxy=${your_https_proxy} +export QDRANT_HOST=${your_qdrant_host_ip} +export QDRANT_PORT=6333 +export EMBED_DIMENSION=${your_embedding_dimension} +export INDEX_NAME=${your_index_name} +export TEI_EMBEDDING_ENDPOINT=${your_tei_endpoint} +``` + +## Start Retriever Service + +```bash +export TEI_EMBEDDING_ENDPOINT="http://${your_ip}:6060" +python haystack/qdrant/retriever_qdrant.py +``` + +# 🚀Start Microservice with Docker + +## Build Docker Image + +```bash +cd ../../ +docker build -t opea/retriever-qdrant:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/haystack/qdrant/docker/Dockerfile . +``` + +## Run Docker with CLI + +```bash +docker run -d --name="retriever-qdrant-server" -p 7000:7000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e TEI_EMBEDDING_ENDPOINT=${your_tei_endpoint} -e QDRANT_HOST=${your_qdrant_host_ip} -e QDRANT_PORT=${your_qdrant_port} opea/retriever-qdrant:latest +``` + +# 🚀3. Consume Retriever Service + +## 3.1 Check Service Status + +```bash +curl http://${your_ip}:7000/v1/health_check \ + -X GET \ + -H 'Content-Type: application/json' +``` + +## 3.2 Consume Embedding Service + +To consume the Retriever Microservice, you can generate a mock embedding vector of length 768 with Python. + +```bash +your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") +curl http://${your_ip}:7000/v1/retrieval \ + -X POST \ + -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding}}" \ + -H 'Content-Type: application/json' +``` diff --git a/comps/retrievers/haystack/qdrant/__init__.py b/comps/retrievers/haystack/qdrant/__init__.py new file mode 100644 index 000000000..916f3a44b --- /dev/null +++ b/comps/retrievers/haystack/qdrant/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/retrievers/haystack/qdrant/docker/Dockerfile b/comps/retrievers/haystack/qdrant/docker/Dockerfile new file mode 100644 index 000000000..e9916c8db --- /dev/null +++ b/comps/retrievers/haystack/qdrant/docker/Dockerfile @@ -0,0 +1,27 @@ + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.11-slim + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + libgl1-mesa-glx \ + libjemalloc-dev \ + vim + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +USER user + +COPY comps /home/user/comps + +RUN python -m pip install --no-cache-dir --upgrade pip && \ + python -m pip install --no-cache-dir -r /home/user/comps/retrievers/haystack/qdrant/requirements.txt + +ENV PYTHONPATH=$PYTHONPATH:/home/user + +WORKDIR /home/user/comps/retrievers/haystack/qdrant + +ENTRYPOINT ["python", "retriever_qdrant.py"] \ No newline at end of file diff --git a/comps/retrievers/haystack/qdrant/ingest.py b/comps/retrievers/haystack/qdrant/ingest.py new file mode 100644 index 000000000..d14dfbbb0 --- /dev/null +++ b/comps/retrievers/haystack/qdrant/ingest.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# + +import argparse +import io +import os +import uuid + +import numpy as np +from haystack.components.embedders import HuggingFaceTEIDocumentEmbedder, SentenceTransformersDocumentEmbedder +from haystack.dataclasses.document import Document +from haystack_integrations.document_stores.qdrant import QdrantDocumentStore +from langchain.text_splitter import RecursiveCharacterTextSplitter +from PIL import Image +from qdrant_config import EMBED_DIMENSION, EMBED_ENDPOINT, EMBED_MODEL, INDEX_NAME, QDRANT_HOST, QDRANT_PORT + + +def pdf_loader(file_path): + try: + import easyocr + import fitz + except ImportError: + raise ImportError( + "`PyMuPDF` or 'easyocr' package is not found, please install it with " + "`pip install pymupdf or pip install easyocr.`" + ) + + doc = fitz.open(file_path) + reader = easyocr.Reader(["en"]) + result = "" + for i in range(doc.page_count): + page = doc.load_page(i) + pagetext = page.get_text().strip() + if pagetext: + result = result + pagetext + if len(doc.get_page_images(i)) > 0: + for img in doc.get_page_images(i): + if img: + pageimg = "" + xref = img[0] + img_data = doc.extract_image(xref) + img_bytes = img_data["image"] + pil_image = Image.open(io.BytesIO(img_bytes)) + img = np.array(pil_image) + img_result = reader.readtext(img, paragraph=True, detail=0) + pageimg = pageimg + ", ".join(img_result).strip() + if pageimg.endswith("!") or pageimg.endswith("?") or pageimg.endswith("."): + pass + else: + pageimg = pageimg + "." + result = result + pageimg + return result + + +def ingest_documents(folder_path, tag): + """Ingest PDF to Qdrant from the a given path.""" + # Load list of pdfs + doc_path = [os.path.join(folder_path, file) for file in os.listdir(folder_path)][0] + + print("Parsing...", doc_path) + + text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100, add_start_index=True) + content = pdf_loader(doc_path) + chunks = text_splitter.split_text(content) + + print("Done preprocessing. Created ", len(chunks), " chunks of the original pdf") + # Create vectorstore + if EMBED_ENDPOINT: + # create embeddings using TEI endpoint service + embedder = HuggingFaceTEIDocumentEmbedder(url=EMBED_ENDPOINT) + else: + # create embeddings using local embedding model + embedder = SentenceTransformersDocumentEmbedder(model=EMBED_MODEL) + embedder.warm_up() + + # Initialize Qdrant store + qdrant_store = QdrantDocumentStore( + host=QDRANT_HOST, + port=QDRANT_PORT, + embedding_dim=EMBED_DIMENSION, + index=INDEX_NAME, + embedding_field="embedding", + similarity="cosine", + recreate_index=True, + ) + + # Batch size + batch_size = 32 + num_chunks = len(chunks) + for i in range(0, num_chunks, batch_size): + batch_chunks = chunks[i : i + batch_size] + batch_texts = [f"Tag: {tag}. " + chunk for chunk in batch_chunks] + documents = [Document(id=str(uuid.uuid4()), content=content) for content in batch_texts] + documents_with_embeddings = embedder.run(documents)["documents"] + qdrant_store.write_documents(documents_with_embeddings) + + print(f"Processed batch {i//batch_size + 1}/{(num_chunks-1)//batch_size + 1}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Ingest documents from a specified folder with a tag") + parser.add_argument("folder_path", type=str, help="Path to the folder containing documents") + parser.add_argument("--tag", type=str, default="", help="Tag to be used as an identifier") + + args = parser.parse_args() + ingest_documents(args.folder_path, args.tag) diff --git a/comps/retrievers/haystack/qdrant/qdrant_config.py b/comps/retrievers/haystack/qdrant/qdrant_config.py new file mode 100644 index 000000000..cee448d66 --- /dev/null +++ b/comps/retrievers/haystack/qdrant/qdrant_config.py @@ -0,0 +1,20 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +# Embedding model +EMBED_MODEL = os.getenv("EMBED_MODEL", "BAAI/bge-base-en-v1.5") + +# Embedding dimension +EMBED_DIMENSION = os.getenv("EMBED_DIMENSION", 768) + +# Embedding endpoints +EMBED_ENDPOINT = os.getenv("TEI_EMBEDDING_ENDPOINT", "") + +# Qdrant Connection Information +QDRANT_HOST = os.getenv("QDRANT_HOST", "localhost") +QDRANT_PORT = int(os.getenv("QDRANT_PORT", 6333)) + +# Vector Index Configuration +INDEX_NAME = os.getenv("INDEX_NAME", "rag-qdrant") diff --git a/comps/retrievers/haystack/qdrant/requirements.txt b/comps/retrievers/haystack/qdrant/requirements.txt new file mode 100644 index 000000000..8cee8ce36 --- /dev/null +++ b/comps/retrievers/haystack/qdrant/requirements.txt @@ -0,0 +1,13 @@ +docarray[full] +easyocr +fastapi +haystack-ai +langchain_community +langsmith +opentelemetry-api +opentelemetry-exporter-otlp +opentelemetry-sdk +pymupdf +qdrant-haystack +sentence_transformers +shortuuid diff --git a/comps/retrievers/haystack/qdrant/retriever_qdrant.py b/comps/retrievers/haystack/qdrant/retriever_qdrant.py new file mode 100644 index 000000000..83ee64a2e --- /dev/null +++ b/comps/retrievers/haystack/qdrant/retriever_qdrant.py @@ -0,0 +1,49 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from haystack.components.embedders import HuggingFaceTEITextEmbedder, SentenceTransformersTextEmbedder +from haystack_integrations.components.retrievers.qdrant import QdrantEmbeddingRetriever +from haystack_integrations.document_stores.qdrant import QdrantDocumentStore +from langsmith import traceable +from qdrant_config import EMBED_DIMENSION, EMBED_ENDPOINT, EMBED_MODEL, INDEX_NAME, QDRANT_HOST, QDRANT_PORT + +from comps import EmbedDoc768, SearchedDoc, ServiceType, TextDoc, opea_microservices, register_microservice + + +# Create a pipeline for querying a Qdrant document store +def initialize_qdrant_retriever() -> QdrantEmbeddingRetriever: + qdrant_store = QdrantDocumentStore( + host=QDRANT_HOST, port=QDRANT_PORT, embedding_dim=EMBED_DIMENSION, index=INDEX_NAME, recreate_index=False + ) + + retriever = QdrantEmbeddingRetriever(document_store=qdrant_store) + + return retriever + + +@register_microservice( + name="opea_service@retriever_qdrant", + service_type=ServiceType.RETRIEVER, + endpoint="/v1/retrieval", + host="0.0.0.0", + port=7000, +) +@traceable(run_type="retriever") +def retrieve(input: EmbedDoc768) -> SearchedDoc: + search_res = retriever.run(query_embedding=input.embedding)["documents"] + searched_docs = [TextDoc(text=r.content) for r in search_res] + result = SearchedDoc(retrieved_docs=searched_docs, initial_query=input.text) + return result + + +if __name__ == "__main__": + if EMBED_ENDPOINT: + # create embeddings using TEI endpoint service + embedder = HuggingFaceTEITextEmbedder(url=EMBED_ENDPOINT) + else: + # create embeddings using local embedding model + embedder = SentenceTransformersTextEmbedder(model=EMBED_MODEL) + embedder.warm_up() + + retriever = initialize_qdrant_retriever() + opea_microservices["opea_service@retriever_qdrant"].start() diff --git a/tests/test_retrievers_haystack_qdrant.sh b/tests/test_retrievers_haystack_qdrant.sh new file mode 100644 index 000000000..6b11eba5a --- /dev/null +++ b/tests/test_retrievers_haystack_qdrant.sh @@ -0,0 +1,76 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -xe + +WORKPATH=$(dirname "$PWD") +ip_address=$(hostname -I | awk '{print $1}') +function build_docker_images() { + cd $WORKPATH + docker build --no-cache -t opea/retriever-qdrant:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/haystack/qdrant/docker/Dockerfile . +} + +function start_service() { + # qdrant + docker run -d --name test-qdrant-vector-db -p 5010:6333 -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy qdrant/qdrant + sleep 10s + + # tei endpoint + tei_endpoint=5008 + model="BAAI/bge-base-en-v1.5" + docker run -d --name="test-comps-retriever-tei-endpoint" -p $tei_endpoint:80 -v ./data:/data --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.2 --model-id $model + sleep 30s + export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:${tei_endpoint}" + + # qdrant retriever + export QDRANT_HOST="${ip_address}" + export QDRANT_PORT=5010 + export INDEX_NAME="rag-qdrant" + retriever_port=5009 + unset http_proxy + docker run -d --name="test-comps-retriever-qdrant-server" -p ${retriever_port}:7000 --ipc=host -e TEI_EMBEDDING_ENDPOINT=$TEI_EMBEDDING_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e QDRANT_HOST=$QDRANT_HOST -e QDRANT_PORT=$QDRANT_PORT -e INDEX_NAME=$INDEX_NAME opea/retriever-qdrant:comps + + sleep 3m +} + +function validate_microservice() { + retriever_port=5009 + export PATH="${HOME}/miniforge3/bin:$PATH" + source activate + test_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") + http_proxy='' curl http://${ip_address}:$retriever_port/v1/retrieval \ + -X POST \ + -d "{\"text\":\"test\",\"embedding\":${test_embedding}}" \ + -H 'Content-Type: application/json' + docker logs test-comps-retriever-qdrant-server + docker logs test-comps-retriever-tei-endpoint +} + +function stop_docker() { + cid_retrievers=$(docker ps -aq --filter "name=test-comps-retrievers*") + if [[ ! -z "$cid_retrievers" ]]; then + docker stop $cid_retrievers && docker rm $cid_retrievers && sleep 1s + fi + + cid_qdrant=$(docker ps -aq --filter "name=test-qdrant-vector-db") + if [[ ! -z "$cid_qdrant" ]]; then + docker stop $cid_qdrant && docker rm $cid_qdrant && sleep 1s + fi +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservice + + stop_docker + echo y | docker system prune + +} + +main