From 1ddff7220232527d9b34edaa766ab6f80675ec11 Mon Sep 17 00:00:00 2001 From: Letong Han <106566639+letonghan@users.noreply.github.com> Date: Wed, 12 Jun 2024 20:51:52 +0800 Subject: [PATCH] Support Dataprep Microservice with Llama Index (#154) * move file to langchain folder Signed-off-by: letonghan * support dataprep with llama_index Signed-off-by: letonghan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add e2e test script Signed-off-by: letonghan * update test script name Signed-off-by: letonghan --------- Signed-off-by: letonghan Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: gadmarkovits --- comps/dataprep/redis/README.md | 24 ++++- .../redis/{ => langchain}/__init__.py | 0 .../dataprep/redis/{ => langchain}/config.py | 0 .../redis/{ => langchain}/docker/Dockerfile | 0 .../docker/docker-compose-dataprep-redis.yaml | 0 .../{ => langchain}/prepare_doc_redis.py | 0 .../redis/{ => langchain}/requirements.txt | 0 .../dataprep/redis/{ => langchain}/schema.yml | 0 .../redis/{ => langchain}/schema_dim_1024.yml | 0 .../redis/{ => langchain}/schema_dim_768.yml | 0 .../{ => langchain}/schema_lcdocs_dim_768.yml | 0 comps/dataprep/redis/llama_index/__init__.py | 2 + comps/dataprep/redis/llama_index/config.py | 57 ++++++++++++ .../redis/llama_index/docker/Dockerfile | 30 ++++++ .../docker/docker-compose-dataprep-redis.yaml | 28 ++++++ .../redis/llama_index/prepare_doc_redis.py | 91 +++++++++++++++++++ .../redis/llama_index/requirements.txt | 15 +++ tests/test_dataprep_redis_llama_index.sh | 67 ++++++++++++++ 18 files changed, 312 insertions(+), 2 deletions(-) rename comps/dataprep/redis/{ => langchain}/__init__.py (100%) rename comps/dataprep/redis/{ => langchain}/config.py (100%) rename comps/dataprep/redis/{ => langchain}/docker/Dockerfile (100%) rename comps/dataprep/redis/{ => langchain}/docker/docker-compose-dataprep-redis.yaml (100%) rename 
comps/dataprep/redis/{ => langchain}/prepare_doc_redis.py (100%) rename comps/dataprep/redis/{ => langchain}/requirements.txt (100%) rename comps/dataprep/redis/{ => langchain}/schema.yml (100%) rename comps/dataprep/redis/{ => langchain}/schema_dim_1024.yml (100%) rename comps/dataprep/redis/{ => langchain}/schema_dim_768.yml (100%) rename comps/dataprep/redis/{ => langchain}/schema_lcdocs_dim_768.yml (100%) create mode 100644 comps/dataprep/redis/llama_index/__init__.py create mode 100644 comps/dataprep/redis/llama_index/config.py create mode 100644 comps/dataprep/redis/llama_index/docker/Dockerfile create mode 100644 comps/dataprep/redis/llama_index/docker/docker-compose-dataprep-redis.yaml create mode 100644 comps/dataprep/redis/llama_index/prepare_doc_redis.py create mode 100644 comps/dataprep/redis/llama_index/requirements.txt create mode 100644 tests/test_dataprep_redis_llama_index.sh diff --git a/comps/dataprep/redis/README.md b/comps/dataprep/redis/README.md index 87564c584..6aa080141 100644 --- a/comps/dataprep/redis/README.md +++ b/comps/dataprep/redis/README.md @@ -1,10 +1,18 @@ # Dataprep Microservice with Redis +For the dataprep microservice, we provide two frameworks: `Langchain` and `LlamaIndex`. + +Both folders are organized in the same way, so you can use either framework for the dataprep microservice by following the instructions below. + # 🚀1. Start Microservice with Python(Option 1) ## 1.1 Install Requirements ```bash +# for langchain +cd langchain +# for llama_index +cd llama_index pip install -r requirements.txt ``` @@ -48,9 +56,18 @@ export LANGCHAIN_PROJECT="opea/dataprep" ## 2.3 Build Docker Image +- Build docker image with langchain + +```bash +cd ../../../../ +docker build -t opea/dataprep-redis:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/redis/langchain/docker/Dockerfile . 
+``` + +- Build docker image with llama_index + ```bash cd ../../../../ -docker build -t opea/dataprep-redis:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/redis/docker/Dockerfile . +docker build -t opea/dataprep-redis:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/redis/llama_index/docker/Dockerfile . ``` ## 2.4 Run Docker with CLI (Option A) @@ -62,7 +79,10 @@ docker run -d --name="dataprep-redis-server" -p 6007:6007 --ipc=host -e http_pro ## 2.5 Run with Docker Compose (Option B) ```bash -cd comps/dataprep/redis/docker +# for langchain +cd comps/dataprep/redis/langchain/docker +# for llama_index +cd comps/dataprep/redis/llama_index/docker docker compose -f docker-compose-dataprep-redis.yaml up -d ``` diff --git a/comps/dataprep/redis/__init__.py b/comps/dataprep/redis/langchain/__init__.py similarity index 100% rename from comps/dataprep/redis/__init__.py rename to comps/dataprep/redis/langchain/__init__.py diff --git a/comps/dataprep/redis/config.py b/comps/dataprep/redis/langchain/config.py similarity index 100% rename from comps/dataprep/redis/config.py rename to comps/dataprep/redis/langchain/config.py diff --git a/comps/dataprep/redis/docker/Dockerfile b/comps/dataprep/redis/langchain/docker/Dockerfile similarity index 100% rename from comps/dataprep/redis/docker/Dockerfile rename to comps/dataprep/redis/langchain/docker/Dockerfile diff --git a/comps/dataprep/redis/docker/docker-compose-dataprep-redis.yaml b/comps/dataprep/redis/langchain/docker/docker-compose-dataprep-redis.yaml similarity index 100% rename from comps/dataprep/redis/docker/docker-compose-dataprep-redis.yaml rename to comps/dataprep/redis/langchain/docker/docker-compose-dataprep-redis.yaml diff --git a/comps/dataprep/redis/prepare_doc_redis.py b/comps/dataprep/redis/langchain/prepare_doc_redis.py similarity index 100% rename from comps/dataprep/redis/prepare_doc_redis.py rename to 
comps/dataprep/redis/langchain/prepare_doc_redis.py diff --git a/comps/dataprep/redis/requirements.txt b/comps/dataprep/redis/langchain/requirements.txt similarity index 100% rename from comps/dataprep/redis/requirements.txt rename to comps/dataprep/redis/langchain/requirements.txt diff --git a/comps/dataprep/redis/schema.yml b/comps/dataprep/redis/langchain/schema.yml similarity index 100% rename from comps/dataprep/redis/schema.yml rename to comps/dataprep/redis/langchain/schema.yml diff --git a/comps/dataprep/redis/schema_dim_1024.yml b/comps/dataprep/redis/langchain/schema_dim_1024.yml similarity index 100% rename from comps/dataprep/redis/schema_dim_1024.yml rename to comps/dataprep/redis/langchain/schema_dim_1024.yml diff --git a/comps/dataprep/redis/schema_dim_768.yml b/comps/dataprep/redis/langchain/schema_dim_768.yml similarity index 100% rename from comps/dataprep/redis/schema_dim_768.yml rename to comps/dataprep/redis/langchain/schema_dim_768.yml diff --git a/comps/dataprep/redis/schema_lcdocs_dim_768.yml b/comps/dataprep/redis/langchain/schema_lcdocs_dim_768.yml similarity index 100% rename from comps/dataprep/redis/schema_lcdocs_dim_768.yml rename to comps/dataprep/redis/langchain/schema_lcdocs_dim_768.yml diff --git a/comps/dataprep/redis/llama_index/__init__.py b/comps/dataprep/redis/llama_index/__init__.py new file mode 100644 index 000000000..916f3a44b --- /dev/null +++ b/comps/dataprep/redis/llama_index/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/comps/dataprep/redis/llama_index/config.py b/comps/dataprep/redis/llama_index/config.py new file mode 100644 index 000000000..0f99cc05e --- /dev/null +++ b/comps/dataprep/redis/llama_index/config.py @@ -0,0 +1,57 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +EMBED_MODEL = os.getenv("EMBED_MODEL", "BAAI/bge-base-en-v1.5") + +# Redis Connection Information +REDIS_HOST = 
os.getenv("REDIS_HOST", "localhost")
+REDIS_PORT = int(os.getenv("REDIS_PORT", 6379))
+
+
+def get_boolean_env_var(var_name, default_value=False):
+    """Retrieve the boolean value of an environment variable.
+
+    Args:
+        var_name (str): The name of the environment variable to retrieve.
+        default_value (bool): The default value to return if the variable
+        is not found.
+
+    Returns:
+        bool: The value of the environment variable, interpreted as a boolean.
+    """
+    true_values = {"true", "1", "t", "y", "yes"}
+    false_values = {"false", "0", "f", "n", "no"}
+
+    # Retrieve the environment variable's value
+    value = os.getenv(var_name, "").lower()  # unset -> "" -> in neither set, so default_value is returned
+
+    # Decide the boolean value based on the content of the string
+    if value in true_values:
+        return True
+    elif value in false_values:
+        return False
+    else:
+        return default_value  # also reached for unrecognised strings, not only for unset variables
+
+
+def format_redis_conn_from_env():  # build the Redis connection URL from environment variables
+    redis_url = os.getenv("REDIS_URL", None)
+    if redis_url:  # a non-empty REDIS_URL wins over the individual host/port/credential settings
+        return redis_url
+    else:
+        using_ssl = get_boolean_env_var("REDIS_SSL", False)
+        start = "rediss://" if using_ssl else "redis://"
+
+        # if using RBAC
+        password = os.getenv("REDIS_PASSWORD", None)
+        username = os.getenv("REDIS_USERNAME", "default")
+        if password is not None:  # credentials are embedded only when REDIS_PASSWORD is set
+            start += f"{username}:{password}@"  # NOTE(review): inserted verbatim, no percent-encoding
+
+        return start + f"{REDIS_HOST}:{REDIS_PORT}"
+
+
+INDEX_NAME = os.getenv("INDEX_NAME", "rag-redis")
+REDIS_URL = format_redis_conn_from_env()  # computed once, when this module is imported
diff --git a/comps/dataprep/redis/llama_index/docker/Dockerfile b/comps/dataprep/redis/llama_index/docker/Dockerfile
new file mode 100644
index 000000000..8fcb8bab1
--- /dev/null
+++ b/comps/dataprep/redis/llama_index/docker/Dockerfile
@@ -0,0 +1,30 @@
+
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+FROM ubuntu:22.04
+
+RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
+    libgl1-mesa-glx \
+    libjemalloc-dev \
+    vim \
+    python3 \
+    python3-pip
+
+RUN useradd -m -s /bin/bash user && \
+    mkdir -p /home/user && \
+    chown -R user /home/user/
+ +USER user + +COPY comps /home/user/comps + +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir -r /home/user/comps/dataprep/redis/llama_index/requirements.txt + +ENV PYTHONPATH=$PYTHONPATH:/home/user + +WORKDIR /home/user/comps/dataprep/redis/llama_index + +ENTRYPOINT ["python3", "prepare_doc_redis.py"] + diff --git a/comps/dataprep/redis/llama_index/docker/docker-compose-dataprep-redis.yaml b/comps/dataprep/redis/llama_index/docker/docker-compose-dataprep-redis.yaml new file mode 100644 index 000000000..8135a242c --- /dev/null +++ b/comps/dataprep/redis/llama_index/docker/docker-compose-dataprep-redis.yaml @@ -0,0 +1,28 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3" +services: + redis-vector-db: + image: redis/redis-stack:7.2.0-v9 + container_name: redis-vector-db + ports: + - "6379:6379" + - "8001:8001" + dataprep-redis: + image: opea/dataprep-redis:latest + container_name: dataprep-redis-server + ports: + - "6007:6007" + ipc: host + environment: + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + REDIS_URL: ${REDIS_URL} + INDEX_NAME: ${INDEX_NAME} + LANGCHAIN_API_KEY: ${LANGCHAIN_API_KEY} + restart: unless-stopped + +networks: + default: + driver: bridge diff --git a/comps/dataprep/redis/llama_index/prepare_doc_redis.py b/comps/dataprep/redis/llama_index/prepare_doc_redis.py new file mode 100644 index 000000000..ec0ddf0fa --- /dev/null +++ b/comps/dataprep/redis/llama_index/prepare_doc_redis.py @@ -0,0 +1,91 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import json +import os +from pathlib import Path +from typing import List, Optional, Union + +from config import EMBED_MODEL, INDEX_NAME, REDIS_URL +from fastapi import File, Form, HTTPException, UploadFile +from langsmith import traceable +from llama_index.core import SimpleDirectoryReader, StorageContext, VectorStoreIndex +from llama_index.core.settings import Settings +from 
llama_index.embeddings.huggingface import HuggingFaceEmbedding
+from llama_index.vector_stores.redis import RedisVectorStore
+from redis import Redis
+from redisvl.schema import IndexSchema
+
+from comps import DocPath, opea_microservices, register_microservice
+
+
+async def save_file_to_local_disk(save_path: str, file):  # write the uploaded file's bytes to save_path
+    save_path = Path(save_path)
+    with save_path.open("wb") as fout:
+        try:
+            content = await file.read()
+            fout.write(content)
+        except Exception as e:  # any read/write failure is surfaced to the client as HTTP 500
+            print(f"Write file failed. Exception: {e}")
+            raise HTTPException(status_code=500, detail=f"Write file {save_path} failed. Exception: {e}")
+
+
+async def ingest_data_to_redis(doc_path: DocPath):  # load one file, embed it, store the vectors in Redis
+    embedder = HuggingFaceEmbedding(model_name=EMBED_MODEL)
+    print(f"embedder: {embedder}")
+    Settings.embed_model = embedder  # sets the embedding model on llama_index's global Settings
+    doc_path = doc_path.path
+    content = SimpleDirectoryReader(input_files=[doc_path]).load_data()
+    redis_client = Redis.from_url(REDIS_URL)
+    schema = IndexSchema.from_dict(
+        {
+            "index": {"name": INDEX_NAME, "prefix": f"doc:{INDEX_NAME}"},
+            "fields": [
+                {"name": "id", "type": "tag"},
+                {"name": "doc_id", "type": "tag"},
+                {"name": "text", "type": "text"},
+                {"name": "content", "type": "text"},
+                {"name": "source", "type": "text"},
+                {"name": "start_index", "type": "numeric"},
+                {
+                    "name": "vector",
+                    "type": "vector",
+                    "attrs": {"dims": 768, "algorithm": "HNSW", "datatype": "FLOAT32"},  # key was misspelled "date_type"
+                },
+            ],
+        }
+    )
+    vector_store = RedisVectorStore(redis_client=redis_client, schema=schema)
+    storage_context = StorageContext.from_defaults(vector_store=vector_store)
+    _ = VectorStoreIndex.from_documents(content, storage_context=storage_context)  # return value is discarded
+    print("[ ingest data ] data ingested into Redis DB.")
+    return True
+
+
+@register_microservice(name="opea_service@prepare_doc_redis", endpoint="/v1/dataprep", host="0.0.0.0", port=6007)
+@traceable(run_type="tool")
+# llama index only support upload files now
+async def ingest_documents(files: Optional[Union[UploadFile, List[UploadFile]]] = File(None)):
+    print(f"files:{files}")
+    if not files:
+        raise HTTPException(status_code=400, detail="Please provide at least one file.")
+
+    if not isinstance(files, list):  # normalise a single upload to a one-element list
+        files = [files]
+    upload_folder = "./uploaded_files/"
+    if not os.path.exists(upload_folder):
+        Path(upload_folder).mkdir(parents=True, exist_ok=True)
+    try:
+        for file in files:
+            save_path = upload_folder + os.path.basename(file.filename)  # drop client-supplied directories ("../")
+            await save_file_to_local_disk(save_path, file)
+            await ingest_data_to_redis(DocPath(path=save_path))
+            print(f"Successfully saved file {save_path}")
+        return {"status": 200, "message": "Data preparation succeeded"}
+    except Exception as e:  # also re-wraps the HTTPException raised by save_file_to_local_disk
+        print(f"Data preparation failed. Exception: {e}")
+        raise HTTPException(status_code=500, detail=f"Data preparation failed. Exception: {e}")
+
+
+if __name__ == "__main__":
+    opea_microservices["opea_service@prepare_doc_redis"].start()
diff --git a/comps/dataprep/redis/llama_index/requirements.txt b/comps/dataprep/redis/llama_index/requirements.txt
new file mode 100644
index 000000000..3eb05757f
--- /dev/null
+++ b/comps/dataprep/redis/llama_index/requirements.txt
@@ -0,0 +1,15 @@
+docarray[full]
+fastapi
+huggingface_hub
+langsmith
+llama-index
+llama-index-embeddings-huggingface==0.2.0
+llama-index-readers-file
+llama-index-vector-stores-redis
+numpy
+opentelemetry-api
+opentelemetry-exporter-otlp
+opentelemetry-sdk
+redis
+sentence_transformers
+shortuuid
diff --git a/tests/test_dataprep_redis_llama_index.sh b/tests/test_dataprep_redis_llama_index.sh
new file mode 100644
index 000000000..1889e4040
--- /dev/null
+++ b/tests/test_dataprep_redis_llama_index.sh
@@ -0,0 +1,67 @@
+#!/bin/bash
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+set -xe
+
+WORKPATH=$(dirname "$PWD")
+LOG_PATH="$WORKPATH/tests"
+ip_address=$(hostname -I | awk '{print $1}')
+
+function build_docker_images() {
+    cd $WORKPATH
+    echo $(pwd)
+    docker build --no-cache -t opea/dataprep-redis:comps --build-arg https_proxy=$https_proxy 
--build-arg http_proxy=$http_proxy -f comps/dataprep/redis/llama_index/docker/Dockerfile .
+}
+
+function start_service() {  # start Redis and the dataprep container, then wait for startup
+    docker run -d --name="test-comps-dataprep-redis" -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p 6379:6379 -p 8001:8001 --ipc=host redis/redis-stack:7.2.0-v9
+    dataprep_service_port=5011
+    REDIS_URL="redis://${ip_address}:6379"
+    docker run -d --name="test-comps-dataprep-redis-server" -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e REDIS_URL=$REDIS_URL -p ${dataprep_service_port}:6007 --ipc=host opea/dataprep-redis:comps
+    sleep 1m
+}
+
+function validate_microservice() {  # upload a sample file; check the HTTP status, then the response body
+    dataprep_service_port=5011
+    URL="http://${ip_address}:$dataprep_service_port/v1/dataprep"
+    echo "Deep learning is a subset of machine learning that utilizes neural networks with multiple layers to analyze various levels of abstract data representations. It enables computers to identify patterns and make decisions with minimal human intervention by learning from large amounts of data." > ./dataprep_file.txt
+    HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -F 'files=@./dataprep_file.txt' -H 'Content-Type: multipart/form-data' "$URL")
+    if [ "$HTTP_STATUS" -eq 200 ]; then
+        echo "[ dataprep ] HTTP status is 200. Checking content..."
+        local CONTENT=$(curl -s -X POST -F 'files=@./dataprep_file.txt' -H 'Content-Type: multipart/form-data' "$URL" | tee ${LOG_PATH}/dataprep.log)
+
+        if echo "$CONTENT" | grep -q "Data preparation succeeded"; then  # inspect the real response body
+            echo "[ dataprep ] Content is as expected."
+        else
+            echo "[ dataprep ] Content does not match the expected result: $CONTENT"
+            docker logs test-comps-dataprep-redis-server >> ${LOG_PATH}/dataprep.log
+            exit 1
+        fi
+    else
+        echo "[ dataprep ] HTTP status is not 200. Received status was $HTTP_STATUS"
+        docker logs test-comps-dataprep-redis-server >> ${LOG_PATH}/dataprep.log
+        exit 1
+    fi
+}
+
+function stop_docker() {  # stop and remove every container whose name matches the test prefix
+    cid=$(docker ps -aq --filter "name=test-comps-dataprep-redis*")
+    if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi  # $cid unquoted on purpose: may hold several IDs
+}
+
+function main() {
+
+    stop_docker
+
+    build_docker_images
+    start_service
+
+    validate_microservice
+
+    stop_docker
+    echo y | docker system prune
+
+}
+
+main