diff --git a/comps/dataprep/README.md b/comps/dataprep/README.md
index 476d65e89..f7d445db4 100644
--- a/comps/dataprep/README.md
+++ b/comps/dataprep/README.md
@@ -9,3 +9,7 @@ For details, please refer to this [readme](redis/README.md)
 # Dataprep Microservice with Qdrant
 
 For details, please refer to this [readme](qdrant/README.md)
+
+# Dataprep Microservice with PGVector
+
+For details, please refer to this [readme](pgvector/README.md)
diff --git a/comps/dataprep/pgvector/README.md b/comps/dataprep/pgvector/README.md
new file mode 100644
index 000000000..36b99b6eb
--- /dev/null
+++ b/comps/dataprep/pgvector/README.md
@@ -0,0 +1,78 @@
+# Dataprep Microservice with PGVector
+
+# 🚀1. Start Microservice with Python (Option 1)
+
+## 1.1 Install Requirements
+
+```bash
+pip install -r requirements.txt
+```
+
+## 1.2 Start PGVector
+
+Please refer to this [readme](../../../vectorstores/langchain/pgvector/README.md).
+
+## 1.3 Setup Environment Variables
+
+```bash
+export PG_CONNECTION_STRING=postgresql+psycopg2://testuser:testpwd@${your_ip}:5432/vectordb
+export INDEX_NAME=${your_index_name}
+export LANGCHAIN_TRACING_V2=true
+export LANGCHAIN_API_KEY=${your_langchain_api_key}
+export LANGCHAIN_PROJECT="opea/gen-ai-comps:dataprep"
+```
+
+## 1.4 Start Document Preparation Microservice for PGVector with Python Script
+
+Start the document preparation microservice for PGVector with the command below.
+
+```bash
+python prepare_doc_pgvector.py
+```
+
+# 🚀2. Start Microservice with Docker (Option 2)
+
+## 2.1 Start PGVector
+
+Please refer to this [readme](../../../vectorstores/langchain/pgvector/README.md).
+
+## 2.2 Setup Environment Variables
+
+```bash
+export PG_CONNECTION_STRING=postgresql+psycopg2://testuser:testpwd@${your_ip}:5432/vectordb
+export INDEX_NAME=${your_index_name}
+export LANGCHAIN_TRACING_V2=true
+export LANGCHAIN_API_KEY=${your_langchain_api_key}
+export LANGCHAIN_PROJECT="opea/dataprep"
+```
+
+## 2.3 Build Docker Image
+
+```bash
+# run from the repository root so that the build context contains comps/
+docker build -t opea/dataprep-pgvector:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/pgvector/langchain/docker/Dockerfile .
+```
+
+## 2.4 Run Docker with CLI (Option A)
+
+```bash
+docker run -d --name="dataprep-pgvector" -p 6007:6007 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e PG_CONNECTION_STRING=$PG_CONNECTION_STRING -e INDEX_NAME=$INDEX_NAME -e TEI_ENDPOINT=$TEI_ENDPOINT opea/dataprep-pgvector:latest
+```
+
+## 2.5 Run with Docker Compose (Option B)
+
+```bash
+cd comps/dataprep/pgvector/langchain/docker
+docker compose -f docker-compose-dataprep-pgvector.yaml up -d
+```
+
+# 🚀3. Consume Microservice
+
+Once the document preparation microservice for PGVector is started, you can invoke it with the command below; it converts the document into embeddings and stores them in the database.
+
+```bash
+curl -X POST \
+    -H "Accept: application/json" \
+    -F "files=@/path/to/document" \
+    http://localhost:6007/v1/dataprep
+```
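Besides uploading local files, the `/v1/dataprep` endpoint introduced in this patch also accepts a `link_list` form field carrying a JSON-encoded list of URLs (see `prepare_doc_pgvector.py` further down). A minimal client sketch in Python, assuming the `requests` package is installed on the caller side, the service listens on `localhost:6007`, and the file name and URL are placeholders:

```python
# Client-side sketch (not part of this patch): exercise both ingestion modes
# of the dataprep endpoint. Assumes `pip install requests` and a running service.
import json

import requests

url = "http://localhost:6007/v1/dataprep"

# Mode 1: upload a local document via the multipart "files" field.
with open("dataprep_file.txt", "rb") as f:
    resp = requests.post(url, files={"files": ("dataprep_file.txt", f)})
print(resp.json())  # expected: {"status": 200, "message": "Data preparation succeeded"}

# Mode 2: ingest web pages via the "link_list" form field (JSON-encoded list).
resp = requests.post(url, data={"link_list": json.dumps(["https://example.com"])})
print(resp.json())
```

Note that the endpoint rejects requests that set both `files` and `link_list` at once, so the two modes are shown as separate calls.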
diff --git a/comps/dataprep/pgvector/__pycache__/config.cpython-38.pyc b/comps/dataprep/pgvector/__pycache__/config.cpython-38.pyc
new file mode 100644
index 000000000..fb9c485a0
Binary files /dev/null and b/comps/dataprep/pgvector/__pycache__/config.cpython-38.pyc differ
diff --git a/comps/dataprep/pgvector/langchain/__init__.py b/comps/dataprep/pgvector/langchain/__init__.py
new file mode 100644
index 000000000..916f3a44b
--- /dev/null
+++ b/comps/dataprep/pgvector/langchain/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
diff --git a/comps/dataprep/pgvector/langchain/config.py b/comps/dataprep/pgvector/langchain/config.py
new file mode 100644
index 000000000..1206a8ed8
--- /dev/null
+++ b/comps/dataprep/pgvector/langchain/config.py
@@ -0,0 +1,20 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+
+# Embedding model
+
+EMBED_MODEL = os.getenv("EMBED_MODEL", "BAAI/bge-base-en-v1.5")
+
+PG_CONNECTION_STRING = os.getenv("PG_CONNECTION_STRING", "localhost")
+
+# Vector Index Configuration
+INDEX_NAME = os.getenv("INDEX_NAME", "rag-pgvector")
+
+# chunk parameters
+CHUNK_SIZE = os.getenv("CHUNK_SIZE", 1500)
+CHUNK_OVERLAP = os.getenv("CHUNK_OVERLAP", 100)
+
+current_file_path = os.path.abspath(__file__)
+parent_dir = os.path.dirname(current_file_path)
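`config.py` reads `CHUNK_SIZE` and `CHUNK_OVERLAP` from the environment, but `os.getenv` returns strings whenever those variables are set, and the splitter in `prepare_doc_pgvector.py` below currently hard-codes 1500/100. A sketch of how the configured values could be fed into the splitter; the explicit `int()` casts are an assumption on top of this patch, not something it contains:

```python
# Sketch only: wire the env-driven chunk settings from config.py into the
# text splitter. int() guards against os.getenv returning strings.
from langchain.text_splitter import RecursiveCharacterTextSplitter

from config import CHUNK_OVERLAP, CHUNK_SIZE

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=int(CHUNK_SIZE),        # defaults to 1500 in config.py
    chunk_overlap=int(CHUNK_OVERLAP),  # defaults to 100 in config.py
    add_start_index=True,
)
```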
diff --git a/comps/dataprep/pgvector/langchain/docker/Dockerfile b/comps/dataprep/pgvector/langchain/docker/Dockerfile
new file mode 100644
index 000000000..dedb127f3
--- /dev/null
+++ b/comps/dataprep/pgvector/langchain/docker/Dockerfile
@@ -0,0 +1,37 @@
+
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+FROM python:3.11-slim
+
+ENV LANG C.UTF-8
+
+RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
+    build-essential \
+    libgl1-mesa-glx \
+    libjemalloc-dev \
+    vim
+
+RUN useradd -m -s /bin/bash user && \
+    mkdir -p /home/user && \
+    chown -R user /home/user/
+
+USER user
+
+COPY comps /home/user/comps
+
+RUN pip install --no-cache-dir --upgrade pip setuptools && \
+    pip install --no-cache-dir -r /home/user/comps/dataprep/pgvector/langchain/requirements.txt
+
+ENV PYTHONPATH=$PYTHONPATH:/home/user
+
+USER root
+
+RUN mkdir -p /home/user/comps/dataprep/pgvector/langchain/uploaded_files && chown -R user /home/user/comps/dataprep/pgvector/langchain/uploaded_files
+
+USER user
+
+WORKDIR /home/user/comps/dataprep/pgvector/langchain
+
+ENTRYPOINT ["python", "prepare_doc_pgvector.py"]
+
diff --git a/comps/dataprep/pgvector/langchain/docker/docker-compose-dataprep-pgvector.yaml b/comps/dataprep/pgvector/langchain/docker/docker-compose-dataprep-pgvector.yaml
new file mode 100644
index 000000000..f11a88b93
--- /dev/null
+++ b/comps/dataprep/pgvector/langchain/docker/docker-compose-dataprep-pgvector.yaml
@@ -0,0 +1,39 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+version: "3"
+services:
+  pgvector-vector-db:
+    hostname: db
+    container_name: pgvector-vector-db
+    image: pgvector/pgvector:0.7.0-pg16
+    ports:
+      - "5432:5432"
+    restart: always
+    ipc: host
+    environment:
+      - POSTGRES_DB=vectordb
+      - POSTGRES_USER=testuser
+      - POSTGRES_PASSWORD=testpwd
+      - POSTGRES_HOST_AUTH_METHOD=trust
+    volumes:
+      - ./init.sql:/docker-entrypoint-initdb.d/init.sql
+
+  dataprep-pgvector:
+    image: opea/dataprep-pgvector:latest
+    container_name: dataprep-pgvector
+    ports:
+      - "6007:6007"
+    ipc: host
+    environment:
+      http_proxy: ${http_proxy}
+      https_proxy: ${https_proxy}
+      PG_CONNECTION_STRING: ${PG_CONNECTION_STRING}
+      INDEX_NAME: ${INDEX_NAME}
+      TEI_ENDPOINT: ${TEI_ENDPOINT}
+      LANGCHAIN_API_KEY: ${LANGCHAIN_API_KEY}
+    restart: unless-stopped
+
+networks:
+  default:
+    driver: bridge
diff --git a/comps/dataprep/pgvector/langchain/prepare_doc_pgvector.py b/comps/dataprep/pgvector/langchain/prepare_doc_pgvector.py
new file mode 100644
index 000000000..02f034b3f
--- /dev/null
+++ b/comps/dataprep/pgvector/langchain/prepare_doc_pgvector.py
@@ -0,0 +1,140 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import json
+import os
+import uuid
+from pathlib import Path
+from typing import List, Optional, Union
+
+from config import CHUNK_OVERLAP, CHUNK_SIZE, EMBED_MODEL, INDEX_NAME, PG_CONNECTION_STRING
+from fastapi import File, Form, HTTPException, UploadFile
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceHubEmbeddings
+from langchain_community.vectorstores import PGVector
+from langsmith import traceable
+
+from comps import DocPath, ServiceType, opea_microservices, register_microservice, register_statistics
+from comps.dataprep.utils import document_loader, parse_html
+
+tei_embedding_endpoint = os.getenv("TEI_ENDPOINT")
+
+
+async def save_file_to_local_disk(save_path: str, file):
+    save_path = Path(save_path)
+    with save_path.open("wb") as fout:
+        try:
+            content = await file.read()
+            fout.write(content)
+        except Exception as e:
+            print(f"Write file failed. Exception: {e}")
+            raise HTTPException(status_code=500, detail=f"Write file {save_path} failed. Exception: {e}")
+
+
+def ingest_doc_to_pgvector(doc_path: DocPath):
+    """Ingest document to PGVector."""
+    doc_path = doc_path.path
+    print(f"Parsing document {doc_path}.")
+
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100, add_start_index=True)
+    content = document_loader(doc_path)
+    chunks = text_splitter.split_text(content)
+    print("Done preprocessing. Created ", len(chunks), " chunks of the original pdf")
+    print("PG Connection", PG_CONNECTION_STRING)
+
+    # Create vectorstore
+    if tei_embedding_endpoint:
+        # create embeddings using TEI endpoint service
+        embedder = HuggingFaceHubEmbeddings(model=tei_embedding_endpoint)
+    else:
+        # create embeddings using local embedding model
+        embedder = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL)
+
+    # Batch size
+    batch_size = 32
+    num_chunks = len(chunks)
+    for i in range(0, num_chunks, batch_size):
+        batch_chunks = chunks[i : i + batch_size]
+        batch_texts = batch_chunks
+
+        _ = PGVector.from_texts(
+            texts=batch_texts, embedding=embedder, collection_name=INDEX_NAME, connection_string=PG_CONNECTION_STRING
+        )
+        print(f"Processed batch {i//batch_size + 1}/{(num_chunks-1)//batch_size + 1}")
+    return True
+
+
+def ingest_link_to_pgvector(link_list: List[str]):
+    data_collection = parse_html(link_list)
+
+    texts = []
+    metadatas = []
+    for data, meta in data_collection:
+        doc_id = str(uuid.uuid4())
+        metadata = {"source": meta, "identify_id": doc_id}
+        texts.append(data)
+        metadatas.append(metadata)
+
+    # Create vectorstore
+    if tei_embedding_endpoint:
+        # create embeddings using TEI endpoint service
+        embedder = HuggingFaceHubEmbeddings(model=tei_embedding_endpoint)
+    else:
+        # create embeddings using local embedding model
+        embedder = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL)
+
+    _ = PGVector.from_texts(
+        texts=texts,
+        embedding=embedder,
+        metadatas=metadatas,
+        collection_name=INDEX_NAME,
+        connection_string=PG_CONNECTION_STRING,
+    )
+
+
+@register_microservice(
+    name="opea_service@prepare_doc_pgvector",
+    service_type=ServiceType.DATAPREP,
+    endpoint="/v1/dataprep",
+    host="0.0.0.0",
+    port=6007,
+)
+@traceable(run_type="tool")
+@register_statistics(names=["opea_service@dataprep_pgvector"])
+async def ingest_documents(
+    files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), link_list: Optional[str] = Form(None)
+):
+    print(f"files:{files}")
+    print(f"link_list:{link_list}")
+    if files and link_list:
+        raise HTTPException(status_code=400, detail="Provide either a file or a string list, not both.")
+
+    if files:
+        if not isinstance(files, list):
+            files = [files]
+        upload_folder = "./uploaded_files/"
+        if not os.path.exists(upload_folder):
+            Path(upload_folder).mkdir(parents=True, exist_ok=True)
+        for file in files:
+            save_path = upload_folder + file.filename
+            await save_file_to_local_disk(save_path, file)
+            ingest_doc_to_pgvector(DocPath(path=save_path))
+            print(f"Successfully saved file {save_path}")
+        return {"status": 200, "message": "Data preparation succeeded"}
+
+    if link_list:
+        try:
+            link_list = json.loads(link_list)  # Parse JSON string to list
+            if not isinstance(link_list, list):
+                raise HTTPException(status_code=400, detail="link_list should be a list.")
+            ingest_link_to_pgvector(link_list)
+            print(f"Successfully saved link list {link_list}")
+            return {"status": 200, "message": "Data preparation succeeded"}
+        except json.JSONDecodeError:
+            raise HTTPException(status_code=400, detail="Invalid JSON format for link_list.")
+
+    raise HTTPException(status_code=400, detail="Must provide either a file or a string list.")
+
+
+if __name__ == "__main__":
+    opea_microservices["opea_service@prepare_doc_pgvector"].start()
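For a quick sanity check after ingestion, the collection written above can be reopened with the same classes this module already imports. A retrieval sketch, assuming the local BGE embedding path (no TEI endpoint), the `EMBED_MODEL`/`INDEX_NAME`/`PG_CONNECTION_STRING` values from `config.py`, and an arbitrary query string:

```python
# Retrieval sketch (not part of the microservice): query the collection that
# prepare_doc_pgvector.py populates. Run it next to config.py so the same
# settings are picked up.
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.vectorstores import PGVector

from config import EMBED_MODEL, INDEX_NAME, PG_CONNECTION_STRING

embedder = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL)
store = PGVector(
    connection_string=PG_CONNECTION_STRING,
    collection_name=INDEX_NAME,
    embedding_function=embedder,
)

for doc in store.similarity_search("What does the OPEA platform include?", k=3):
    print(doc.page_content[:120])
```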
diff --git a/comps/dataprep/pgvector/langchain/requirements.txt b/comps/dataprep/pgvector/langchain/requirements.txt
new file mode 100644
index 000000000..ccddd6bb0
--- /dev/null
+++ b/comps/dataprep/pgvector/langchain/requirements.txt
@@ -0,0 +1,21 @@
+beautifulsoup4
+docarray[full]
+easyocr
+fastapi
+huggingface_hub
+langchain
+langchain-community
+langsmith
+numpy
+opentelemetry-api
+opentelemetry-exporter-otlp
+opentelemetry-sdk
+pandas
+pgvector==0.2.5
+Pillow
+prometheus-fastapi-instrumentator==7.0.0
+psycopg2-binary
+pymupdf
+python-docx
+sentence_transformers
+shortuuid
diff --git a/comps/vectorstores/langchain/pgvector/README.md b/comps/vectorstores/langchain/pgvector/README.md
index 7d13ee312..b23060cfd 100644
--- a/comps/vectorstores/langchain/pgvector/README.md
+++ b/comps/vectorstores/langchain/pgvector/README.md
@@ -17,5 +17,5 @@ export POSTGRES_DB=vectordb
 ## 3. Run Pgvector service
 
 ```bash
-docker run --name vectorstore-postgres -e POSTGRES_USER=${POSTGRES_USER} -e POSTGRES_HOST_AUTH_METHOD=trust -e POSTGRES_DB=${POSTGRES_DB} -e POSTGRES_PASSWORD=${POSTGRES_PASSWORD} -d -v ./init.sql:/docker-entrypoint-initdb.d/init.sql pgvector/pgvector:0.7.0-pg16
+docker run --name vectorstore-postgres -e POSTGRES_USER=${POSTGRES_USER} -e POSTGRES_HOST_AUTH_METHOD=trust -e POSTGRES_DB=${POSTGRES_DB} -e POSTGRES_PASSWORD=${POSTGRES_PASSWORD} -d -v ./init.sql:/docker-entrypoint-initdb.d/init.sql -p 5432:5432 pgvector/pgvector:0.7.0-pg16
 ```
diff --git a/comps/vectorstores/langchain/pgvector/__init__.py b/comps/vectorstores/langchain/pgvector/__init__.py
index 28f108cb6..916f3a44b 100644
--- a/comps/vectorstores/langchain/pgvector/__init__.py
+++ b/comps/vectorstores/langchain/pgvector/__init__.py
@@ -1,13 +1,2 @@
-# Copyright (c) 2024 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/test_dataprep_pgvector.sh b/tests/test_dataprep_pgvector.sh
new file mode 100755
index 000000000..c9daba9fc
--- /dev/null
+++ b/tests/test_dataprep_pgvector.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+set -xe
+
+WORKPATH=$(dirname "$PWD")
+ip_address=$(hostname -I | awk '{print $1}')
+function build_docker_images() {
+    cd $WORKPATH
+
+    # pull pgvector image
+    docker pull pgvector/pgvector:0.7.0-pg16
+
+    # build dataprep image for pgvector
+    docker build -t opea/dataprep-pgvector:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f $WORKPATH/comps/dataprep/pgvector/langchain/docker/Dockerfile .
+}
+
+function start_service() {
+    export POSTGRES_USER=testuser
+    export POSTGRES_PASSWORD=testpwd
+    export POSTGRES_DB=vectordb
+
+    docker run --name vectorstore-postgres -e POSTGRES_USER=${POSTGRES_USER} -e POSTGRES_HOST_AUTH_METHOD=trust -e POSTGRES_DB=${POSTGRES_DB} -e POSTGRES_PASSWORD=${POSTGRES_PASSWORD} -p 5432:5432 -d -v $WORKPATH/comps/vectorstores/langchain/pgvector/init.sql:/docker-entrypoint-initdb.d/init.sql pgvector/pgvector:0.7.0-pg16
+
+    sleep 10s
+
+    docker run -d --name="dataprep-pgvector" -p 6007:6007 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e PG_CONNECTION_STRING=postgresql+psycopg2://${POSTGRES_USER}:${POSTGRES_PASSWORD}@$ip_address:5432/${POSTGRES_DB} opea/dataprep-pgvector:latest
+
+    sleep 3m
+}
+
+function validate_microservice() {
+    URL="http://$ip_address:6007/v1/dataprep"
+    echo 'The OPEA platform includes: Detailed framework of composable building blocks for state-of-the-art generative AI systems including LLMs, data stores, and prompt engines' > ./dataprep_file.txt
+    curl --noproxy $ip_address --location --request POST \
+        --form 'files=@./dataprep_file.txt' $URL
+}
+
+function stop_docker() {
+    cid=$(docker ps -aq --filter "name=vectorstore-postgres*")
+    if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi
+
+    cid=$(docker ps -aq --filter "name=dataprep-pgvector*")
+    if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi
+}
+
+function main() {
+
+    stop_docker
+
+    build_docker_images
+    start_service
+
+    validate_microservice
+
+    stop_docker
+    echo y | docker system prune
+
+}
+
+main
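The test above waits a fixed `sleep 10s`/`sleep 3m` before exercising the service. An alternative is to poll for readiness; a sketch in Python using `psycopg2` (pulled in by `psycopg2-binary` in the requirements), assuming it runs on the test host with the credentials exported in `start_service`:

```python
# Readiness-poll sketch for the test setup above: returns once Postgres
# accepts connections, instead of sleeping for a fixed interval.
import time

import psycopg2


def wait_for_postgres(host: str, timeout: float = 120.0) -> bool:
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            psycopg2.connect(
                host=host, port=5432, dbname="vectordb",
                user="testuser", password="testpwd",
            ).close()
            return True
        except psycopg2.OperationalError:
            time.sleep(2)  # Postgres not accepting connections yet
    return False


if __name__ == "__main__":
    print("postgres ready:", wait_for_postgres("localhost"))
```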
diff --git a/tests/test_vectorstores_pgvector.sh b/tests/test_vectorstores_pgvector.sh
new file mode 100755
index 000000000..1b43a6930
--- /dev/null
+++ b/tests/test_vectorstores_pgvector.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+set -xe
+
+WORKPATH=$(dirname "$PWD")
+ip_address=$(hostname -I | awk '{print $1}')
+function build_docker_images() {
+    cd $WORKPATH
+
+    # pull pgvector image
+    docker pull pgvector/pgvector:0.7.0-pg16
+
+    # build dataprep image for pgvector
+    docker build -t opea/dataprep-pgvector:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f $WORKPATH/comps/dataprep/pgvector/langchain/docker/Dockerfile .
+}
+
+function start_service() {
+    export POSTGRES_USER=testuser
+    export POSTGRES_PASSWORD=testpwd
+    export POSTGRES_DB=vectordb
+
+    docker run --name vectorstore-postgres -e POSTGRES_USER=${POSTGRES_USER} -e POSTGRES_HOST_AUTH_METHOD=trust -e POSTGRES_DB=${POSTGRES_DB} -e POSTGRES_PASSWORD=${POSTGRES_PASSWORD} -p 5432:5432 -d -v $WORKPATH/comps/vectorstores/langchain/pgvector/init.sql:/docker-entrypoint-initdb.d/init.sql pgvector/pgvector:0.7.0-pg16
+
+    sleep 10s
+
+    docker run -d --name="dataprep-pgvector" -p 6007:6007 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e PG_CONNECTION_STRING=postgresql+psycopg2://${POSTGRES_USER}:${POSTGRES_PASSWORD}@$ip_address:5432/${POSTGRES_DB} opea/dataprep-pgvector:latest
+}
+
+function validate_microservice() {
+    url="http://$ip_address:6007/v1/dataprep"
+    touch $WORKPATH/tests/test.txt
+    echo 'The OPEA platform includes: Detailed framework of composable building blocks for state-of-the-art generative AI systems including LLMs, data stores, and prompt engines' > $WORKPATH/tests/test.txt
+
+    curl --location --request POST "${url}" \
+        --form 'files=@"'${WORKPATH}'/tests/test.txt"' \
+        --proxy http://proxy-chain.intel.com:912
+}
+
+function stop_docker() {
+    cid=$(docker ps -aq --filter "name=vectorstore-postgres*")
+    if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi
+
+    cid=$(docker ps -aq --filter "name=dataprep-pgvector*")
+    if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi
+}
+
+function main() {
+
+    stop_docker
+
+    build_docker_images
+    start_service
+
+    validate_microservice
+
+    #stop_docker
+    #echo y | docker system prune
+
+}
+
+main