Pinecone support for dataprep and retrieval microservice (opea-project#157)

Signed-off-by: Pallavi Jaini <[email protected]> Signed-off-by: Daniel Whitenack <[email protected]>
predictionguard · Jul 24, 2024 · 0e52862 · 0e52862
1 parent 90252db
commit 0e52862
Show file tree

Hide file tree

Showing 18 changed files with 438 additions and 1 deletion.
diff --git a/.github/workflows/docker/docker-compose.yaml b/.github/workflows/docker/docker-compose.yaml
@@ -14,7 +14,7 @@ services:
     pull_policy: always
   retriever-redis-server:
     build:
-      dockerfile: comps/retrievers/langchain/docker/Dockerfile
+      dockerfile: comps/retrievers/langchain/redis/docker/Dockerfile
     extends: embedding-tei-server
     image: ${REGISTRY}/${REPO}:retriever-redis-server
   reranking-tei-server:

diff --git a/comps/dataprep/README.md b/comps/dataprep/README.md
@@ -22,6 +22,10 @@ For details, please refer to this [readme](milvus/README.md)
 
 For details, please refer to this [readme](qdrant/README.md)
 
+# Dataprep Microservice with Pinecone
+
+For details, please refer to this [readme](pinecone/README.md)
+
 # Dataprep Microservice with PGVector
 
 For details, please refer to this [readme](pgvector/README.md)
diff --git a/comps/dataprep/pinecone/README.md b/comps/dataprep/pinecone/README.md
@@ -0,0 +1,69 @@
+# Dataprep Microservice with Pinecone
+
+# 🚀Start Microservice with Python
+
+## Install Requirements
+
+```bash
+pip install -r requirements.txt
+```
+
+## Start Pinecone Server
+
+Please refer to this [readme](../../../vectorstores/langchain/pinecone/README.md).
+
+## Setup Environment Variables
+
+```bash
+export http_proxy=${your_http_proxy}
+export https_proxy=${your_http_proxy}
+export PINECONE_API_KEY=${PINECONE_API_KEY}
+export PINECONE_INDEX_NAME=${PINECONE_INDEX_NAME}
+```
+
+## Start Document Preparation Microservice for Pinecone with Python Script
+
+Start the document preparation microservice for Pinecone with the command below.
+
+```bash
+python prepare_doc_pinecone.py
+```
+
+# 🚀Start Microservice with Docker
+
+## Build Docker Image
+
+```bash
+cd ../../../../
+docker build -t opea/dataprep-pinecone:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/pinecone/docker/Dockerfile .
+```
+
+## Run Docker with CLI
+
+```bash
+docker run -d --name="dataprep-pinecone-server" -p 6000:6000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e PINECONE_API_KEY=$PINECONE_API_KEY -e PINECONE_INDEX_NAME=$PINECONE_INDEX_NAME opea/dataprep-pinecone:latest
+```
+
+## Setup Environment Variables
+
+```bash
+export http_proxy=${your_http_proxy}
+export https_proxy=${your_http_proxy}
+export PINECONE_API_KEY=${PINECONE_API_KEY}
+export PINECONE_INDEX_NAME=${PINECONE_INDEX_NAME}
+```
+
+## Run Docker with Docker Compose
+
+```bash
+cd comps/dataprep/pinecone/docker
+docker compose -f docker-compose-dataprep-pinecone.yaml up -d
+```
+
+# Invoke Microservice
+
+Once the document preparation microservice for Pinecone is started, you can use the command below to invoke it. The microservice converts the document to embeddings and saves them to the database.
+
+```bash
+curl -X POST -H "Content-Type: application/json" -d '{"path":"/path/to/document"}' http://localhost:6000/v1/dataprep
+```
diff --git a/comps/dataprep/pinecone/__init__.py b/comps/dataprep/pinecone/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
diff --git a/comps/dataprep/pinecone/config.py b/comps/dataprep/pinecone/config.py
@@ -0,0 +1,16 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+
+# Embedding model
+EMBED_MODEL = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
+
+# Pinecone configuration
+PINECONE_API_KEY = os.getenv("PINECONE_API_KEY", "xxx_xxx")
+PINECONE_INDEX_NAME = int(os.getenv("PINECONE_INDEX_NAME", "langchain-test"))
+
+# LLM/Embedding endpoints
+TGI_LLM_ENDPOINT = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080")
+TGI_LLM_ENDPOINT_NO_RAG = os.getenv("TGI_LLM_ENDPOINT_NO_RAG", "http://localhost:8081")
+TEI_EMBEDDING_ENDPOINT = os.getenv("TEI_ENDPOINT")
diff --git a/comps/dataprep/pinecone/docker/Dockerfile b/comps/dataprep/pinecone/docker/Dockerfile
@@ -0,0 +1,31 @@
+
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Image for the Pinecone document-preparation microservice.
+# The build context must contain the `comps` directory (see the COPY below).
+FROM python:3.11-slim
+
+# key=value form; the space-separated ENV form is legacy syntax.
+ENV LANG=C.UTF-8
+
+# System packages. The apt package lists are removed in the same layer so they
+# do not end up in the final image.
+RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
+    build-essential \
+    libgl1-mesa-glx \
+    libjemalloc-dev \
+    vim && \
+    rm -rf /var/lib/apt/lists/*
+
+# Create the unprivileged account the service runs as.
+RUN useradd -m -s /bin/bash user && \
+    mkdir -p /home/user && \
+    chown -R user /home/user/
+
+USER user
+
+COPY comps /home/user/comps
+
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir -r /home/user/comps/dataprep/pinecone/requirements.txt
+
+# Makes the `comps` package importable from /home/user.
+ENV PYTHONPATH=$PYTHONPATH:/home/user
+
+WORKDIR /home/user/comps/dataprep/pinecone
+
+ENTRYPOINT ["python", "prepare_doc_pinecone.py"]
+
diff --git a/comps/dataprep/pinecone/docker/docker-compose-dataprep-pinecone.yaml b/comps/dataprep/pinecone/docker/docker-compose-dataprep-pinecone.yaml
@@ -0,0 +1,21 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Runs the Pinecone dataprep microservice and publishes it on port 6000.
+version: "3"
+services:
+  dataprep-pinecone:
+    # Same tag that the README's `docker build` command produces.
+    image: opea/dataprep-pinecone:latest
+    container_name: dataprep-pinecone-server
+    ports:
+      - "6000:6000"
+    ipc: host
+    environment:
+      # All values are taken from the shell that runs `docker compose`.
+      http_proxy: ${http_proxy}
+      https_proxy: ${https_proxy}
+      PINECONE_API_KEY: ${PINECONE_API_KEY}
+      PINECONE_INDEX_NAME: ${PINECONE_INDEX_NAME}
+    restart: unless-stopped
+
+networks:
+  default:
+    driver: bridge
diff --git a/comps/dataprep/pinecone/prepare_doc_pinecone.py b/comps/dataprep/pinecone/prepare_doc_pinecone.py
@@ -0,0 +1,60 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+
+from config import EMBED_MODEL, PINECONE_API_KEY, PINECONE_INDEX_NAME
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceEmbeddings, HuggingFaceHubEmbeddings
+from langchain_community.vectorstores import Pinecone
+
+from comps import DocPath, opea_microservices, opea_telemetry, register_microservice
+from comps.dataprep.utils import document_loader
+
+tei_embedding_endpoint = os.getenv("TEI_ENDPOINT")
+
+
+@register_microservice(
+    name="opea_service@prepare_doc_pinecone",
+    endpoint="/v1/dataprep",
+    host="0.0.0.0",
+    port=6000,
+    input_datatype=DocPath,
+    output_datatype=None,
+)
+@opea_telemetry
+def ingest_documents(doc_path: DocPath):
+    """Ingest document to Pinecone."""
+    doc_path = doc_path.path
+    print(f"Parsing document {doc_path}.")
+
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100, add_start_index=True)
+    content = document_loader(doc_path)
+    chunks = text_splitter.split_text(content)
+
+    print("Done preprocessing. Created ", len(chunks), " chunks of the original pdf")
+    # Create vectorstore
+    if tei_embedding_endpoint:
+        # create embeddings using TEI endpoint service
+        embedder = HuggingFaceHubEmbeddings(model=tei_embedding_endpoint)
+    else:
+        # create embeddings using local embedding model
+        embedder = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL)
+
+    # Batch size
+    batch_size = 32
+    num_chunks = len(chunks)
+    for i in range(0, num_chunks, batch_size):
+        batch_chunks = chunks[i : i + batch_size]
+        batch_texts = batch_chunks
+
+        _ = Pinecone.from_texts(
+            texts=batch_texts,
+            embedding=embedder,
+            index_name=PINECONE_INDEX_NAME,
+        )
+        print(f"Processed batch {i//batch_size + 1}/{(num_chunks-1)//batch_size + 1}")
+
+
+if __name__ == "__main__":
+    # Look up the service by the name passed to @register_microservice and start it.
+    opea_microservices["opea_service@prepare_doc_pinecone"].start()
diff --git a/comps/dataprep/pinecone/requirements.txt b/comps/dataprep/pinecone/requirements.txt
@@ -0,0 +1,20 @@
+beautifulsoup4
+docarray[full]
+easyocr
+fastapi
+huggingface_hub
+langchain
+langchain-community
+langchain-pinecone
+langsmith
+numpy
+opentelemetry-api
+opentelemetry-exporter-otlp
+opentelemetry-sdk
+pandas
+Pillow
+pinecone-client
+pymupdf
+python-docx
+sentence_transformers
+shortuuid
diff --git a/comps/retrievers/langchain/pinecone/__init__.py b/comps/retrievers/langchain/pinecone/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
diff --git a/comps/retrievers/langchain/pinecone/config.py b/comps/retrievers/langchain/pinecone/config.py
@@ -0,0 +1,16 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+
+# Embedding model
+EMBED_MODEL = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
+
+# Pinecone configuration
+PINECONE_API_KEY = os.getenv("PINECONE_API_KEY", "xxx_xxx")
+PINECONE_INDEX_NAME = int(os.getenv("PINECONE_INDEX_NAME", "langchain-test"))
+
+# LLM/Embedding endpoints
+TGI_LLM_ENDPOINT = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080")
+TGI_LLM_ENDPOINT_NO_RAG = os.getenv("TGI_LLM_ENDPOINT_NO_RAG", "http://localhost:8081")
+TEI_EMBEDDING_ENDPOINT = os.getenv("TEI_ENDPOINT")
diff --git a/comps/retrievers/langchain/pinecone/docker/Dockerfile b/comps/retrievers/langchain/pinecone/docker/Dockerfile
@@ -0,0 +1,29 @@
+
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Image for the Pinecone retriever microservice.
+# The build context must contain the `comps` directory (see the COPY below).
+# NOTE(review): the base image uses the floating `latest` tag, so rebuilds are
+# not reproducible - consider pinning a version.
+FROM langchain/langchain:latest
+
+# System packages. The apt package lists are removed in the same layer so they
+# do not end up in the final image.
+RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
+    libgl1-mesa-glx \
+    libjemalloc-dev \
+    vim && \
+    rm -rf /var/lib/apt/lists/*
+
+# Create the unprivileged account the service runs as.
+RUN useradd -m -s /bin/bash user && \
+    mkdir -p /home/user && \
+    chown -R user /home/user/
+
+COPY comps /home/user/comps
+
+# Done before `USER user`, i.e. while the build is still running as the
+# image's default user.
+RUN chmod +x /home/user/comps/retrievers/langchain/pinecone/run.sh
+
+USER user
+
+# NOTE(review): this installs comps/retrievers/requirements.txt rather than a
+# pinecone-specific requirements file - confirm that is the intended path.
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir -r /home/user/comps/retrievers/requirements.txt
+
+# Makes the `comps` package importable from /home/user.
+ENV PYTHONPATH=$PYTHONPATH:/home/user
+
+WORKDIR /home/user/comps/retrievers/langchain/pinecone
+
+ENTRYPOINT ["/home/user/comps/retrievers/langchain/pinecone/run.sh"]
diff --git a/comps/retrievers/langchain/pinecone/docker/docker_compose_retriever.yaml b/comps/retrievers/langchain/pinecone/docker/docker_compose_retriever.yaml
@@ -0,0 +1,32 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+version: "3.8"
+
+services:
+  # Text-embeddings-inference server; container port 80 is published on host
+  # port 6060. The model is selected with RETRIEVE_MODEL_ID.
+  tei_xeon_service:
+    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.2
+    container_name: tei-xeon-server
+    ports:
+      - "6060:80"
+    volumes:
+      - "./data:/data"
+    shm_size: 1g
+    command: --model-id ${RETRIEVE_MODEL_ID}
+  # Pinecone retriever microservice, published on port 7000.
+  retriever:
+    image: opea/retriever-pinecone:latest
+    container_name: retriever-pinecone-server
+    ports:
+      - "7000:7000"
+    ipc: host
+    environment:
+      http_proxy: ${http_proxy}
+      https_proxy: ${https_proxy}
+      PINECONE_API_KEY: ${PINECONE_API_KEY}
+      # The retriever's config.py reads PINECONE_INDEX_NAME. INDEX_NAME is kept
+      # as well in case anything still reads the old variable name.
+      PINECONE_INDEX_NAME: ${PINECONE_INDEX_NAME}
+      INDEX_NAME: ${PINECONE_INDEX_NAME}
+      LANGCHAIN_API_KEY: ${LANGCHAIN_API_KEY}
+    restart: unless-stopped
+
+networks:
+  default:
+    driver: bridge
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		# Copyright (C) 2024 Intel Corporation
		# SPDX-License-Identifier: Apache-2.0