From 29fe569d342301fa99f2723c396f0e3fe495dd60 Mon Sep 17 00:00:00 2001 From: XuhuiRen <44249229+XuhuiRen@users.noreply.github.com> Date: Sun, 15 Sep 2024 18:12:29 +0800 Subject: [PATCH] Enable GraphRAG with Neo4J (#682) * add graphrag for neo4j Signed-off-by: XuhuiRen * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add Signed-off-by: XuhuiRen * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add Signed-off-by: XuhuiRen * add Signed-off-by: XuhuiRen * fix ut Signed-off-by: XuhuiRen * fix Signed-off-by: XuhuiRen * add Signed-off-by: XuhuiRen * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update retriever_neo4j.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add Signed-off-by: XuhuiRen * Update test_retrievers_neo4j_langchain.sh * add Signed-off-by: XuhuiRen * Update test_retrievers_neo4j_langchain.sh * Update test_retrievers_neo4j_langchain.sh * Update test_retrievers_neo4j_langchain.sh * add docker Signed-off-by: XuhuiRen * Update retrievers-compose-cd.yaml * Update test_retrievers_neo4j_langchain.sh * Update config.py * Update test_retrievers_neo4j_langchain.sh * Update test_retrievers_neo4j_langchain.sh * Update config.py * Update test_retrievers_neo4j_langchain.sh * Update requirements.txt * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update requirements.txt * Update requirements.txt * Update requirements.txt --------- Signed-off-by: XuhuiRen Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: lvliang-intel --- .../docker/compose/dataprep-compose-cd.yaml | 4 + .../docker/compose/retrievers-compose-cd.yaml | 4 + comps/dataprep/neo4j/langchain/Dockerfile | 38 ++++ comps/dataprep/neo4j/langchain/README.md | 116 +++++++++++ 
comps/dataprep/neo4j/langchain/__init__.py | 2 + comps/dataprep/neo4j/langchain/config.py | 15 ++ .../docker-compose-dataprep-neo4j.yaml | 48 +++++ .../neo4j/langchain/prepare_doc_neo4j.py | 191 ++++++++++++++++++ .../dataprep/neo4j/langchain/requirements.txt | 31 +++ comps/retrievers/neo4j/langchain/Dockerfile | 35 ++++ comps/retrievers/neo4j/langchain/README.md | 112 ++++++++++ comps/retrievers/neo4j/langchain/__init__.py | 2 + comps/retrievers/neo4j/langchain/config.py | 15 ++ .../neo4j/langchain/requirements.txt | 21 ++ .../neo4j/langchain/retriever_neo4j.py | 117 +++++++++++ .../test_retrievers_neo4j_langchain.sh | 99 +++++++++ 16 files changed, 850 insertions(+) create mode 100644 comps/dataprep/neo4j/langchain/Dockerfile create mode 100644 comps/dataprep/neo4j/langchain/README.md create mode 100644 comps/dataprep/neo4j/langchain/__init__.py create mode 100644 comps/dataprep/neo4j/langchain/config.py create mode 100644 comps/dataprep/neo4j/langchain/docker-compose-dataprep-neo4j.yaml create mode 100644 comps/dataprep/neo4j/langchain/prepare_doc_neo4j.py create mode 100644 comps/dataprep/neo4j/langchain/requirements.txt create mode 100644 comps/retrievers/neo4j/langchain/Dockerfile create mode 100644 comps/retrievers/neo4j/langchain/README.md create mode 100644 comps/retrievers/neo4j/langchain/__init__.py create mode 100644 comps/retrievers/neo4j/langchain/config.py create mode 100644 comps/retrievers/neo4j/langchain/requirements.txt create mode 100644 comps/retrievers/neo4j/langchain/retriever_neo4j.py create mode 100644 tests/retrievers/test_retrievers_neo4j_langchain.sh diff --git a/.github/workflows/docker/compose/dataprep-compose-cd.yaml b/.github/workflows/docker/compose/dataprep-compose-cd.yaml index fb08b51fa..6622a2921 100644 --- a/.github/workflows/docker/compose/dataprep-compose-cd.yaml +++ b/.github/workflows/docker/compose/dataprep-compose-cd.yaml @@ -27,3 +27,7 @@ services: build: dockerfile: comps/dataprep/vdms/langchain/Dockerfile image: 
${REGISTRY:-opea}/dataprep-vdms:${TAG:-latest} + dataprep-neo4j: + build: + dockerfile: comps/dataprep/neo4j/langchain/Dockerfile + image: ${REGISTRY:-opea}/dataprep-neo4j:${TAG:-latest} diff --git a/.github/workflows/docker/compose/retrievers-compose-cd.yaml b/.github/workflows/docker/compose/retrievers-compose-cd.yaml index f9230412d..67b44fd0f 100644 --- a/.github/workflows/docker/compose/retrievers-compose-cd.yaml +++ b/.github/workflows/docker/compose/retrievers-compose-cd.yaml @@ -27,3 +27,7 @@ services: build: dockerfile: comps/retrievers/multimodal/redis/langchain/Dockerfile image: ${REGISTRY:-opea}/multimodal-retriever-redis:${TAG:-latest} + retriever-neo4j: + build: + dockerfile: comps/retrievers/neo4j/langchain/Dockerfile + image: ${REGISTRY:-opea}/retriever-neo4j:${TAG:-latest} diff --git a/comps/dataprep/neo4j/langchain/Dockerfile b/comps/dataprep/neo4j/langchain/Dockerfile new file mode 100644 index 000000000..5c1884359 --- /dev/null +++ b/comps/dataprep/neo4j/langchain/Dockerfile @@ -0,0 +1,38 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.11-slim + +ENV LANG=C.UTF-8 + +ARG ARCH="cpu" + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + build-essential \ + default-jre \ + libgl1-mesa-glx \ + libjemalloc-dev + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +USER user + +COPY comps /home/user/comps + +RUN pip install --no-cache-dir --upgrade pip setuptools && \ + if [ ${ARCH} = "cpu" ]; then pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cpu; fi && \ + pip install --no-cache-dir -r /home/user/comps/dataprep/neo4j/langchain/requirements.txt + +ENV PYTHONPATH=$PYTHONPATH:/home/user + +USER root + +RUN mkdir -p /home/user/comps/dataprep/neo4j/langchain/uploaded_files && chown -R user /home/user/comps/dataprep/neo4j/langchain/uploaded_files + +USER user + +WORKDIR 
/home/user/comps/dataprep/neo4j/langchain + +ENTRYPOINT ["python", "prepare_doc_neo4j.py"] diff --git a/comps/dataprep/neo4j/langchain/README.md b/comps/dataprep/neo4j/langchain/README.md new file mode 100644 index 000000000..31f92548b --- /dev/null +++ b/comps/dataprep/neo4j/langchain/README.md @@ -0,0 +1,116 @@ +# Dataprep Microservice with Neo4J + +## 🚀Start Microservice with Python + +### Install Requirements + +```bash +pip install -r requirements.txt +apt-get install libtesseract-dev -y +apt-get install poppler-utils -y +``` + +### Start Neo4J Server + +To launch Neo4j locally, first ensure you have docker installed. Then, you can launch the database with the following docker command. + +```bash +docker run \ + -p 7474:7474 -p 7687:7687 \ + -v $PWD/data:/data -v $PWD/plugins:/plugins \ + --name neo4j-apoc \ + -d \ + -e NEO4J_AUTH=neo4j/password \ + -e NEO4J_PLUGINS=\[\"apoc\"\] \ + neo4j:latest +``` + +### Setup Environment Variables + +```bash +export no_proxy=${your_no_proxy} +export http_proxy=${your_http_proxy} +export https_proxy=${your_http_proxy} +export NEO4J_URI=${your_neo4j_url} +export NEO4J_USERNAME=${your_neo4j_username} +export NEO4J_PASSWORD=${your_neo4j_password} +export PYTHONPATH=${path_to_comps} +``` + +### Start Document Preparation Microservice for Neo4J with Python Script + +Start document preparation microservice for Neo4J with below command. + +```bash +python prepare_doc_neo4j.py +``` + +## 🚀Start Microservice with Docker + +### Build Docker Image + +```bash +cd ../../../../ +docker build -t opea/dataprep-neo4j:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/neo4j/langchain/Dockerfile . 
+``` + +### Run Docker with CLI + +```bash +docker run -d --name="dataprep-neo4j-server" -p 6007:6007 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/dataprep-neo4j:latest +``` + +### Setup Environment Variables + +```bash +export no_proxy=${your_no_proxy} +export http_proxy=${your_http_proxy} +export https_proxy=${your_http_proxy} +export NEO4J_URI=${your_neo4j_url} +export NEO4J_USERNAME=${your_neo4j_username} +export NEO4J_PASSWORD=${your_neo4j_password} +``` + +### Run Docker with Docker Compose + +```bash +cd comps/dataprep/neo4j/langchain +docker compose -f docker-compose-dataprep-neo4j.yaml up -d +``` + +## Invoke Microservice + +Once document preparation microservice for Neo4J is started, user can use below command to invoke the microservice to convert the document to embedding and save to the database. + +```bash +curl -X POST \ + -H "Content-Type: multipart/form-data" \ + -F "files=@./file1.txt" \ + http://localhost:6007/v1/dataprep +``` + +You can specify chunk_size and chunk_overlap by the following commands. + +```bash +curl -X POST \ + -H "Content-Type: multipart/form-data" \ + -F "files=@./file1.txt" \ + -F "chunk_size=1500" \ + -F "chunk_overlap=100" \ + http://localhost:6007/v1/dataprep +``` + +We support table extraction from pdf documents. You can specify process_table and table_strategy by the following commands. "table_strategy" refers to the strategies to understand tables for table retrieval. As the setting progresses from "fast" to "hq" to "llm," the focus shifts towards deeper table understanding at the expense of processing speed. The default strategy is "fast". + +Note: If you specify "table_strategy=llm", you should first start TGI Service, please refer to 1.2.1, 1.3.1 in https://github.com/opea-project/GenAIComps/tree/main/comps/llms/README.md, and then `export TGI_LLM_ENDPOINT="http://${your_ip}:8008"`. 
+ +To ensure the quality and comprehensiveness of the extracted entities, we recommend using `gpt-4o` as the default model for parsing the document. To enable the openai service, please `export OPENAI_KEY=xxxx` before using this service. + +```bash +curl -X POST \ + -H "Content-Type: multipart/form-data" \ + -F "files=@./your_file.pdf" \ + -F "process_table=true" \ + -F "table_strategy=hq" \ + http://localhost:6007/v1/dataprep +``` diff --git a/comps/dataprep/neo4j/langchain/__init__.py b/comps/dataprep/neo4j/langchain/__init__.py new file mode 100644 index 000000000..916f3a44b --- /dev/null +++ b/comps/dataprep/neo4j/langchain/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/dataprep/neo4j/langchain/config.py b/comps/dataprep/neo4j/langchain/config.py new file mode 100644 index 000000000..bb21d57e3 --- /dev/null +++ b/comps/dataprep/neo4j/langchain/config.py @@ -0,0 +1,15 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +# Neo4J configuration +NEO4J_URL = os.getenv("NEO4J_URI", "bolt://localhost:7687") +NEO4J_USERNAME = os.getenv("NEO4J_USERNAME", "neo4j") +NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "test") + +# LLM/Embedding endpoints +TGI_LLM_ENDPOINT = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080") +TGI_LLM_ENDPOINT_NO_RAG = os.getenv("TGI_LLM_ENDPOINT_NO_RAG", "http://localhost:8081") +TEI_EMBEDDING_ENDPOINT = os.getenv("TEI_ENDPOINT") +OPENAI_KEY = os.getenv("OPENAI_API_KEY") diff --git a/comps/dataprep/neo4j/langchain/docker-compose-dataprep-neo4j.yaml b/comps/dataprep/neo4j/langchain/docker-compose-dataprep-neo4j.yaml new file mode 100644 index 000000000..d7d210adf --- /dev/null +++ b/comps/dataprep/neo4j/langchain/docker-compose-dataprep-neo4j.yaml @@ -0,0 +1,48 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3" +services: + neo4j-vector-db: + image: neo4j:latest + 
container_name: neo4j-graph-db + ports: + - "6337:6337" + - "6338:6338" + tgi_gaudi_service: + image: ghcr.io/huggingface/tgi-gaudi:2.0.1 + container_name: tgi-service + ports: + - "8088:80" + volumes: + - "./data:/data" + shm_size: 1g + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + HF_TOKEN: ${HF_TOKEN} + command: --model-id ${LLM_MODEL_ID} --auto-truncate --max-input-tokens 1024 --max-total-tokens 2048 + dataprep-neo4j: + image: opea/gen-ai-comps:dataprep-neo4j-xeon-server + container_name: dataprep-neo4j-server + depends_on: + - neo4j-vector-db + - tgi_gaudi_service + ports: + - "6007:6007" + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + NEO4J_URL: ${NEO4J_URL} + NEO4J_USERNAME: ${NEO4J_USERNAME} + NEO4J_PASSWORD: ${NEO4J_PASSWORD} + TGI_LLM_ENDPOINT: ${TEI_ENDPOINT} + OPENAI_KEY: ${OPENAI_API_KEY} + restart: unless-stopped + +networks: + default: + driver: bridge diff --git a/comps/dataprep/neo4j/langchain/prepare_doc_neo4j.py b/comps/dataprep/neo4j/langchain/prepare_doc_neo4j.py new file mode 100644 index 000000000..39d88d055 --- /dev/null +++ b/comps/dataprep/neo4j/langchain/prepare_doc_neo4j.py @@ -0,0 +1,191 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import json +import os +from typing import List, Optional, Union + +import openai +from config import NEO4J_PASSWORD, NEO4J_URL, NEO4J_USERNAME, OPENAI_KEY, TGI_LLM_ENDPOINT +from fastapi import File, Form, HTTPException, UploadFile +from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain_community.graphs import Neo4jGraph +from langchain_community.graphs.graph_document import GraphDocument +from langchain_community.llms import HuggingFaceEndpoint +from langchain_core.documents import Document +from langchain_experimental.graph_transformers import LLMGraphTransformer +from langchain_openai import ChatOpenAI +from 
langchain_text_splitters import HTMLHeaderTextSplitter + +from comps import CustomLogger, DocPath, opea_microservices, register_microservice +from comps.dataprep.utils import ( + document_loader, + encode_filename, + get_separators, + get_tables_result, + parse_html, + save_content_to_local_disk, +) + +logger = CustomLogger("prepare_doc_neo4j") +logflag = os.getenv("LOGFLAG", False) + +upload_folder = "./uploaded_files/" + + +def ingest_data_to_neo4j(doc_path: DocPath): + """Ingest document to Neo4J.""" + path = doc_path.path + if logflag: + logger.info(f"Parsing document {path}.") + + if path.endswith(".html"): + headers_to_split_on = [ + ("h1", "Header 1"), + ("h2", "Header 2"), + ("h3", "Header 3"), + ] + text_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on) + else: + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=doc_path.chunk_size, + chunk_overlap=doc_path.chunk_overlap, + add_start_index=True, + separators=get_separators(), + ) + + content = document_loader(path) + + structured_types = [".xlsx", ".csv", ".json", "jsonl"] + _, ext = os.path.splitext(path) + + if ext in structured_types: + chunks = content + else: + chunks = text_splitter.split_text(content) + + if doc_path.process_table and path.endswith(".pdf"): + table_chunks = get_tables_result(path, doc_path.table_strategy) + chunks = chunks + table_chunks + if logflag: + logger.info("Done preprocessing. Created ", len(chunks), " chunks of the original file.") + + if OPENAI_KEY: + logger.info("OpenAI API Key is set. 
Verifying its validity...") + openai.api_key = OPENAI_KEY + + try: + response = openai.Engine.list() + logger.info("OpenAI API Key is valid.") + llm = ChatOpenAI(temperature=0, model_name="gpt-4o") + except openai.error.AuthenticationError: + logger.info("OpenAI API Key is invalid.") + except Exception as e: + logger.info(f"An error occurred while verifying the API Key: {e}") + else: + llm = HuggingFaceEndpoint( + endpoint_url=TGI_LLM_ENDPOINT, + max_new_tokens=512, + top_k=40, + top_p=0.9, + temperature=0.8, + timeout=600, + ) + + llm_transformer = LLMGraphTransformer( + llm=llm, node_properties=["description"], relationship_properties=["description"] + ) + + doc_list = [Document(page_content=text) for text in chunks] + graph_doc = llm_transformer.convert_to_graph_documents(doc_list) + + graph = Neo4jGraph(url=NEO4J_URL, username=NEO4J_USERNAME, password=NEO4J_PASSWORD) + + graph.add_graph_documents(graph_doc, baseEntityLabel=True, include_source=True) + + if logflag: + logger.info("The graph is built.") + + return True + + +@register_microservice( + name="opea_service@prepare_doc_neo4j", + endpoint="/v1/dataprep", + host="0.0.0.0", + port=6007, + input_datatype=DocPath, + output_datatype=None, +) +async def ingest_documents( + files: Optional[Union[UploadFile, List[UploadFile]]] = File(None), + link_list: Optional[str] = Form(None), + chunk_size: int = Form(1500), + chunk_overlap: int = Form(100), + process_table: bool = Form(False), + table_strategy: str = Form("fast"), +): + if logflag: + logger.info(f"files:{files}") + logger.info(f"link_list:{link_list}") + + if files: + if not isinstance(files, list): + files = [files] + uploaded_files = [] + for file in files: + encode_file = encode_filename(file.filename) + save_path = upload_folder + encode_file + await save_content_to_local_disk(save_path, file) + ingest_data_to_neo4j( + DocPath( + path=save_path, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + process_table=process_table, + 
table_strategy=table_strategy, + ) + ) + uploaded_files.append(save_path) + if logflag: + logger.info(f"Successfully saved file {save_path}") + result = {"status": 200, "message": "Data preparation succeeded"} + if logflag: + logger.info(result) + return result + + if link_list: + link_list = json.loads(link_list) # Parse JSON string to list + if not isinstance(link_list, list): + raise HTTPException(status_code=400, detail="link_list should be a list.") + for link in link_list: + encoded_link = encode_filename(link) + save_path = upload_folder + encoded_link + ".txt" + content = parse_html([link])[0][0] + try: + await save_content_to_local_disk(save_path, content) + ingest_data_to_neo4j( + DocPath( + path=save_path, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + process_table=process_table, + table_strategy=table_strategy, + ) + ) + except json.JSONDecodeError: + raise HTTPException(status_code=500, detail="Fail to ingest data into qdrant.") + + if logflag: + logger.info(f"Successfully saved link {link}") + + result = {"status": 200, "message": "Data preparation succeeded"} + if logflag: + logger.info(result) + return result + + raise HTTPException(status_code=400, detail="Must provide either a file or a string list.") + + +if __name__ == "__main__": + opea_microservices["opea_service@prepare_doc_neo4j"].start() diff --git a/comps/dataprep/neo4j/langchain/requirements.txt b/comps/dataprep/neo4j/langchain/requirements.txt new file mode 100644 index 000000000..b8326a623 --- /dev/null +++ b/comps/dataprep/neo4j/langchain/requirements.txt @@ -0,0 +1,31 @@ +beautifulsoup4 +cairosvg +docarray[full] +docx2txt +easyocr +fastapi +huggingface_hub +langchain +langchain-community +langchain-experimental +langchain-openai +langchain-text-splitters +langchain_huggingface +markdown +neo4j +numpy +openai +opentelemetry-api +opentelemetry-exporter-otlp +opentelemetry-sdk +pandas +Pillow +prometheus-fastapi-instrumentator +pymupdf +python-docx +python-pptx 
+sentence_transformers +shortuuid +unstructured[all-docs]==0.15.7 +uvicorn + diff --git a/comps/retrievers/neo4j/langchain/Dockerfile b/comps/retrievers/neo4j/langchain/Dockerfile new file mode 100644 index 000000000..5d8e8d254 --- /dev/null +++ b/comps/retrievers/neo4j/langchain/Dockerfile @@ -0,0 +1,35 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.11-slim + +ENV LANG=C.UTF-8 + +ENV no_proxy=localhost,127.0.0.1 + +ENV HUGGINGFACEHUB_API_TOKEN=dummy + +ARG ARCH="cpu" + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + build-essential \ + libgl1-mesa-glx \ + libjemalloc-dev + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +USER user + +COPY comps /home/user/comps + +RUN pip install --no-cache-dir --upgrade pip && \ + if [ ${ARCH} = "cpu" ]; then pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu; fi && \ + pip install --no-cache-dir -r /home/user/comps/retrievers/neo4j/langchain/requirements.txt + +ENV PYTHONPATH=$PYTHONPATH:/home/user + +WORKDIR /home/user/comps/retrievers/neo4j/langchain + +ENTRYPOINT ["python", "retriever_neo4j.py"] diff --git a/comps/retrievers/neo4j/langchain/README.md b/comps/retrievers/neo4j/langchain/README.md new file mode 100644 index 000000000..731abc20f --- /dev/null +++ b/comps/retrievers/neo4j/langchain/README.md @@ -0,0 +1,112 @@ +# Retriever Microservice with Neo4J + +## 🚀Start Microservice with Python + +### Install Requirements + +```bash +pip install -r requirements.txt +``` + +### Start Neo4J Server + +To launch Neo4j locally, first ensure you have docker installed. Then, you can launch the database with the following docker command. 
+ +```bash +docker run \ + -p 7474:7474 -p 7687:7687 \ + -v $PWD/data:/data -v $PWD/plugins:/plugins \ + --name neo4j-apoc \ + -d \ + -e NEO4J_AUTH=neo4j/password \ + -e NEO4J_PLUGINS=\[\"apoc\"\] \ + neo4j:latest +``` + +### Setup Environment Variables + +```bash +export no_proxy=${your_no_proxy} +export http_proxy=${your_http_proxy} +export https_proxy=${your_http_proxy} +export NEO4J_URI=${your_neo4j_url} +export NEO4J_USERNAME=${your_neo4j_username} +export NEO4J_PASSWORD=${your_neo4j_password} +``` + +### Start Retriever Service + +```bash +python retriever_neo4j.py +``` + +## 🚀Start Microservice with Docker + +### Build Docker Image + +```bash +cd ../../ +docker build -t opea/retriever-neo4j:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/neo4j/langchain/Dockerfile . +``` + +### Run Docker with CLI + +```bash +docker run -d --name="retriever-neo4j-server" -p 7000:7000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e NEO4J_URI=${your_neo4j_host_ip} opea/retriever-neo4j:latest +``` + +## 🚀3. Consume Retriever Service + +### 3.1 Check Service Status + +```bash +curl http://${your_ip}:7000/v1/health_check \ + -X GET \ + -H 'Content-Type: application/json' +``` + +### 3.2 Consume Embedding Service + +To consume the Retriever Microservice, you can generate a mock embedding vector of length 768 with Python. + +```bash +export your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") +curl http://${your_ip}:7000/v1/retrieval \ + -X POST \ + -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding}}" \ + -H 'Content-Type: application/json' +``` + +You can set the parameters for the retriever. 
+ +```bash +export your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") +curl http://localhost:7000/v1/retrieval \ + -X POST \ + -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding},\"search_type\":\"similarity\", \"k\":4}" \ + -H 'Content-Type: application/json' +``` + +```bash +export your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") +curl http://localhost:7000/v1/retrieval \ + -X POST \ + -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding},\"search_type\":\"similarity_distance_threshold\", \"k\":4, \"distance_threshold\":1.0}" \ + -H 'Content-Type: application/json' +``` + +```bash +export your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") +curl http://localhost:7000/v1/retrieval \ + -X POST \ + -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding},\"search_type\":\"similarity_score_threshold\", \"k\":4, \"score_threshold\":0.2}" \ + -H 'Content-Type: application/json' +``` + +```bash +export your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") +curl http://localhost:7000/v1/retrieval \ + -X POST \ + -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding},\"search_type\":\"mmr\", \"k\":4, \"fetch_k\":20, \"lambda_mult\":0.5}" \ + -H 'Content-Type: application/json' +``` diff --git a/comps/retrievers/neo4j/langchain/__init__.py b/comps/retrievers/neo4j/langchain/__init__.py new file mode 100644 index 000000000..916f3a44b --- /dev/null +++ b/comps/retrievers/neo4j/langchain/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/retrievers/neo4j/langchain/config.py 
b/comps/retrievers/neo4j/langchain/config.py new file mode 100644 index 000000000..39adf6d89 --- /dev/null +++ b/comps/retrievers/neo4j/langchain/config.py @@ -0,0 +1,15 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +# Neo4J configuration +NEO4J_URL = os.getenv("NEO4J_URI", "bolt://localhost:7687") +NEO4J_USERNAME = os.getenv("NEO4J_USERNAME", "neo4j") +NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "test") + +# Embedding model +EMBED_MODEL = os.getenv("EMBED_MODEL", "BAAI/bge-base-en-v1.5") + +# Embedding endpoints +EMBED_ENDPOINT = os.getenv("TEI_EMBEDDING_ENDPOINT", "") diff --git a/comps/retrievers/neo4j/langchain/requirements.txt b/comps/retrievers/neo4j/langchain/requirements.txt new file mode 100644 index 000000000..24f579c6a --- /dev/null +++ b/comps/retrievers/neo4j/langchain/requirements.txt @@ -0,0 +1,21 @@ +docarray[full] +fastapi +frontend==0.0.3 +huggingface_hub +langchain==0.2 +langchain-community +neo4j +numpy +opentelemetry-api +opentelemetry-exporter-otlp +opentelemetry-sdk +pandas +Pillow +prometheus-fastapi-instrumentator +pydantic==2.7.3 +pymupdf==1.24.5 +python-docx==0.8.11 +sentence_transformers +shortuuid +tiktoken +uvicorn diff --git a/comps/retrievers/neo4j/langchain/retriever_neo4j.py b/comps/retrievers/neo4j/langchain/retriever_neo4j.py new file mode 100644 index 000000000..47ce4a544 --- /dev/null +++ b/comps/retrievers/neo4j/langchain/retriever_neo4j.py @@ -0,0 +1,117 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import os +import time +from typing import Union + +from config import EMBED_ENDPOINT, EMBED_MODEL, NEO4J_PASSWORD, NEO4J_URL, NEO4J_USERNAME +from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceHubEmbeddings +from langchain_community.vectorstores import Neo4jVector + +from comps import ( + CustomLogger, + EmbedDoc, + SearchedDoc, + ServiceType, + TextDoc, + opea_microservices, + 
register_microservice, + register_statistics, + statistics_dict, +) +from comps.cores.proto.api_protocol import ( + ChatCompletionRequest, + RetrievalRequest, + RetrievalResponse, + RetrievalResponseData, +) + +logger = CustomLogger("retriever_neo4j") +logflag = os.getenv("LOGFLAG", False) + + +@register_microservice( + name="opea_service@retriever_neo4j", + service_type=ServiceType.RETRIEVER, + endpoint="/v1/retrieval", + host="0.0.0.0", + port=7000, +) +@register_statistics(names=["opea_service@retriever_neo4j"]) +def retrieve( + input: Union[EmbedDoc, RetrievalRequest, ChatCompletionRequest] +) -> Union[SearchedDoc, RetrievalResponse, ChatCompletionRequest]: + if logflag: + logger.info(input) + start = time.time() + + if isinstance(input, EmbedDoc): + query = input.text + else: + # for RetrievalRequest, ChatCompletionRequest + query = input.input + + if input.search_type == "similarity": + search_res = vector_db.similarity_search_by_vector(embedding=input.embedding, query=input.text, k=input.k) + elif input.search_type == "similarity_distance_threshold": + if input.distance_threshold is None: + raise ValueError("distance_threshold must be provided for " + "similarity_distance_threshold retriever") + search_res = vector_db.similarity_search_by_vector( + embedding=input.embedding, query=input.text, k=input.k, distance_threshold=input.distance_threshold + ) + elif input.search_type == "similarity_score_threshold": + docs_and_similarities = vector_db.similarity_search_with_relevance_scores( + query=input.text, k=input.k, score_threshold=input.score_threshold + ) + search_res = [doc for doc, _ in docs_and_similarities] + elif input.search_type == "mmr": + search_res = vector_db.max_marginal_relevance_search( + query=input.text, k=input.k, fetch_k=input.fetch_k, lambda_mult=input.lambda_mult + ) + else: + raise ValueError(f"{input.search_type} not valid") + + # return different response format + retrieved_docs = [] + if isinstance(input, EmbedDoc): + for r in 
search_res: + retrieved_docs.append(TextDoc(text=r.page_content)) + result = SearchedDoc(retrieved_docs=retrieved_docs, initial_query=input.text) + else: + for r in search_res: + retrieved_docs.append(RetrievalResponseData(text=r.page_content, metadata=r.metadata)) + if isinstance(input, RetrievalRequest): + result = RetrievalResponse(retrieved_docs=retrieved_docs) + elif isinstance(input, ChatCompletionRequest): + input.retrieved_docs = retrieved_docs + input.documents = [doc.text for doc in retrieved_docs] + result = input + + statistics_dict["opea_service@retriever_neo4j"].append_latency(time.time() - start, None) + if logflag: + logger.info(result) + return result + + +if __name__ == "__main__": + + if EMBED_ENDPOINT: + # create embeddings using TEI endpoint service + hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN") + embeddings = HuggingFaceHubEmbeddings(model=EMBED_ENDPOINT, huggingfacehub_api_token=hf_token) + else: + # create embeddings using local embedding model + embeddings = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL) + + vector_db = Neo4jVector.from_existing_graph( + embedding=embeddings, + url=NEO4J_URL, + username=NEO4J_USERNAME, + password=NEO4J_PASSWORD, + node_label="__Entity__", + text_node_properties=["id", "description"], + embedding_node_property="embedding", + ) + opea_microservices["opea_service@retriever_neo4j"].start() diff --git a/tests/retrievers/test_retrievers_neo4j_langchain.sh b/tests/retrievers/test_retrievers_neo4j_langchain.sh new file mode 100644 index 000000000..9855fe75f --- /dev/null +++ b/tests/retrievers/test_retrievers_neo4j_langchain.sh @@ -0,0 +1,99 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -x + +WORKPATH=$(dirname "$PWD") +LOG_PATH="$WORKPATH/tests" +ip_address=$(hostname -I | awk '{print $1}') + +function build_docker_images() { + cd $WORKPATH + docker run -d -p 7474:7474 -p 7687:7687 -v ./data:/data -v ./plugins:/plugins --name test-comps-neo4j-apoc1 -e 
NEO4J_AUTH=neo4j/password -e NEO4J_PLUGINS=\[\"apoc\"\] neo4j:latest + sleep 30s + + docker build --no-cache -t opea/retriever-neo4j:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/neo4j/langchain/Dockerfile . + if [ $? -ne 0 ]; then + echo "opea/retriever-neo4j built fail" + exit 1 + else + echo "opea/retriever-neo4j built successful" + fi +} + +function start_service() { + # tei endpoint + tei_endpoint=5434 + model="BAAI/bge-base-en-v1.5" + docker run -d --name="test-comps-retriever-neo4j-tei-endpoint" -p $tei_endpoint:80 -v ./data:/data --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 --model-id $model + sleep 30s + export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:${tei_endpoint}" + + # Neo4J retriever + export NEO4J_URI="bolt://${ip_address}:7687" + export NEO4J_USERNAME="neo4j" + export NEO4J_PASSWORD="password" + retriever_port=5435 + # unset http_proxy + export no_proxy="localhost,127.0.0.1,"${ip_address} + docker run -d --name="test-comps-retriever-neo4j-server" -p ${retriever_port}:7000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e NEO4J_URI="bolt://${ip_address}:7687" -e NEO4J_USERNAME="neo4j" -e NEO4J_PASSWORD="password" opea/retriever-neo4j:comps + + sleep 1m +} + +function validate_microservice() { + retriever_port=5435 + export PATH="${HOME}/miniforge3/bin:$PATH" + source activate + URL="http://${ip_address}:$retriever_port/v1/retrieval" + + test_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") + + HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "{\"text\":\"test\",\"embedding\":${test_embedding}}" -H 'Content-Type: application/json' "$URL") + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ retriever ] HTTP status is 200. Checking content..." 
+ local CONTENT=$(curl -s -X POST -d "{\"text\":\"test\",\"embedding\":${test_embedding}}" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/retriever.log) + + if echo "$CONTENT" | grep -q "retrieved_docs"; then + echo "[ retriever ] Content is as expected." + else + echo "[ retriever ] Content does not match the expected result: $CONTENT" + docker logs test-comps-retriever-neo4j-server >> ${LOG_PATH}/retriever.log + docker logs test-comps-retriever-neo4j-tei-endpoint >> ${LOG_PATH}/tei.log + exit 1 + fi + else + echo "[ retriever ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs test-comps-retriever-neo4j-server >> ${LOG_PATH}/retriever.log + docker logs test-comps-retriever-neo4j-tei-endpoint >> ${LOG_PATH}/tei.log + exit 1 + fi +} + +function stop_docker() { + cid_retrievers=$(docker ps -aq --filter "name=test-comps-retriever-neo4j*") + if [[ ! -z "$cid_retrievers" ]]; then + docker stop $cid_retrievers && docker rm $cid_retrievers && sleep 1s + fi + cid_db=$(docker ps -aq --filter "name=test-comps-neo4j-apoc1") + if [[ ! -z "$cid_db" ]]; then + docker stop $cid_db && docker rm $cid_db && sleep 1s + fi +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservice + + stop_docker + echo y | docker system prune + +} + +main