Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor VectorStores and related Comps #1088

Closed
wants to merge 27 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
16da5f5
modify vector redis & milvus, and related dataprep/retrievers
letonghan Dec 30, 2024
1b841bc
Merge branch 'main' of https://github.com/letonghan/GenAIComps into r…
letonghan Dec 30, 2024
4e3e28f
Merge branch 'main' of https://github.com/opea-project/GenAIComps int…
letonghan Dec 30, 2024
b286e39
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 30, 2024
79fea3e
delete wrong file
letonghan Dec 30, 2024
90ad341
fix conflict
letonghan Dec 30, 2024
c623c26
add docker image in .github
letonghan Dec 30, 2024
19ad036
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 30, 2024
4f36ac5
update
letonghan Dec 30, 2024
ca6693d
modify file name
letonghan Dec 30, 2024
8e60bb6
minor fix
letonghan Dec 30, 2024
74e2315
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 30, 2024
bf522ca
comment embedding bridgetower part
letonghan Dec 30, 2024
6e799ea
modify embedding folder path
letonghan Dec 30, 2024
5d37e43
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 30, 2024
ba0d528
add redis & milvus docker compose yaml
letonghan Dec 30, 2024
90351ed
update & add compose yaml
letonghan Dec 30, 2024
8a2139e
refactor pinecone in vectordb and retrievers
letonghan Dec 31, 2024
fb35244
Merge branch 'main' into refactor_vectordb
letonghan Dec 31, 2024
d1e176f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 31, 2024
bd827b8
delete duplicate code in dataprep milvus
letonghan Dec 31, 2024
2b95ab7
refine dataprep milvus
letonghan Dec 31, 2024
e493a7f
refactor dataprep pinecone
letonghan Dec 31, 2024
a62d448
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 31, 2024
07ad384
add requirements for pinecone
letonghan Dec 31, 2024
4caf288
Merge branch 'refactor_vectordb' of https://github.com/opea-project/G…
letonghan Dec 31, 2024
7fb53bc
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 31, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .github/workflows/docker/compose/dataprep-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@

# this file should be run in the root of the repo
services:
# Image built from comps/dataprep/src/Dockerfile; registry and tag fall back to "opea" and "latest" when REGISTRY / TAG are unset.
dataprep:
build:
dockerfile: comps/dataprep/src/Dockerfile
image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
dataprep-redis:
build:
dockerfile: comps/dataprep/redis/langchain/Dockerfile
Expand Down
4 changes: 4 additions & 0 deletions .github/workflows/docker/compose/retrievers-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@

# this file should be run in the root of the repo
services:
# Image built from comps/retrievers/src/Dockerfile; registry and tag fall back to "opea" and "latest" when REGISTRY / TAG are unset.
retriever:
build:
dockerfile: comps/retrievers/src/Dockerfile
image: ${REGISTRY:-opea}/retriever:${TAG:-latest}
retriever-redis:
build:
dockerfile: comps/retrievers/redis/langchain/Dockerfile
Expand Down
106 changes: 106 additions & 0 deletions comps/dataprep/deployment/docker_compose/milvus.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

version: "3"
services:
  # etcd instance started for the Milvus standalone container.
  milvus-etcd:
    container_name: milvus-etcd
    image: quay.io/coreos/etcd:v3.5.5
    environment:
      - ETCD_AUTO_COMPACTION_MODE=revision
      - ETCD_AUTO_COMPACTION_RETENTION=1000
      - ETCD_QUOTA_BACKEND_BYTES=4294967296
      - ETCD_SNAPSHOT_COUNT=50000
    volumes:
      - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/etcd:/etcd
    command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd
    healthcheck:
      test: ["CMD", "etcdctl", "endpoint", "health"]
      interval: 30s
      timeout: 20s
      retries: 3

  # MinIO instance started for the Milvus standalone container.
  # Container port 9000 (health endpoint below) is published on host port
  # 5043; container port 9001 (console, see --console-address) on 5044.
  milvus-minio:
    container_name: milvus-minio
    image: minio/minio:RELEASE.2023-03-20T20-16-18Z
    environment:
      MINIO_ACCESS_KEY: minioadmin
      MINIO_SECRET_KEY: minioadmin
    ports:
      - "5044:9001"
      - "5043:9000"
    volumes:
      - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data
    command: minio server /minio_data --console-address ":9001"
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
      interval: 30s
      timeout: 20s
      retries: 3

  # Milvus vector database in standalone mode.
  milvus-standalone:
    container_name: milvus-standalone
    image: milvusdb/milvus:v2.4.9
    command: ["milvus", "run", "standalone"]
    security_opt:
      - seccomp:unconfined
    environment:
      # These hostnames must match the service names declared in this file.
      # The services are called milvus-etcd / milvus-minio, so the bare
      # names "etcd" / "minio" would not resolve on the compose network.
      ETCD_ENDPOINTS: milvus-etcd:2379
      MINIO_ADDRESS: milvus-minio:9000
    volumes:
      - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus
      # NOTE(review): with DOCKER_VOLUME_DIRECTORY unset this resolves to
      # ./milvus.yaml next to the compose file. If this compose file is
      # itself named milvus.yaml, it would be mounted as the Milvus config;
      # confirm the intended source path of the Milvus configuration.
      - ${DOCKER_VOLUME_DIRECTORY:-.}/milvus.yaml:/milvus/configs/milvus.yaml
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:9091/healthz"]
      interval: 30s
      start_period: 90s
      timeout: 20s
      retries: 3
    ports:
      - "19530:19530"
      - "9091:9091"
    depends_on:
      - "milvus-etcd"
      - "milvus-minio"

  # Text Embeddings Inference server, published on host port 6006.
  tei-embedding-service:
    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    # curl is installed at start-up because the healthcheck below needs it.
    entrypoint: /bin/sh -c "apt-get update && apt-get install -y curl && text-embeddings-router --json-output --model-id ${EMBEDDING_MODEL_ID} --auto-truncate"
    container_name: tei-embedding-server
    ports:
      - "6006:80"
    volumes:
      - "./data:/data"
    shm_size: 1g
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      host_ip: ${host_ip}
    healthcheck:
      # Probes the published host port (host_ip:6006), not the container port.
      test: ["CMD-SHELL", "curl -f http://$host_ip:6006/health || exit 1"]
      interval: 10s
      timeout: 10s
      retries: 60

  # Dataprep microservice; starts once Milvus has been started and the
  # embedding server reports healthy.
  dataprep-milvus:
    image: opea/dataprep-milvus:latest
    container_name: dataprep-milvus-server
    depends_on:
      milvus-standalone:
        condition: service_started
      tei-embedding-service:
        condition: service_healthy
    ports:
      - "5000:5000"
    ipc: host
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT}
      MILVUS_HOST: ${MILVUS_HOST}
      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
      DATAPREP_TYPE: ${DATAPREP_TYPE:-milvus}
      LOGFLAG: ${LOGFLAG:-true}
    restart: unless-stopped

networks:
  default:
    driver: bridge
58 changes: 58 additions & 0 deletions comps/dataprep/deployment/docker_compose/redis.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

version: "3"

services:
  # Redis Stack instance used as the vector store.
  redis-vector-db:
    container_name: redis-vector-db
    image: redis/redis-stack:7.2.0-v9
    ports:
      - "6379:6379"
      - "8001:8001"

  # Text Embeddings Inference server, published on host port 6006.
  tei-embedding-service:
    container_name: tei-embedding-server
    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
    # curl is installed at start-up because the healthcheck below needs it.
    entrypoint: /bin/sh -c "apt-get update && apt-get install -y curl && text-embeddings-router --json-output --model-id ${EMBEDDING_MODEL_ID} --auto-truncate"
    shm_size: 1g
    ports:
      - "6006:80"
    volumes:
      - "./data:/data"
    environment:
      host_ip: ${host_ip}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      no_proxy: ${no_proxy}
    healthcheck:
      # Probes the published host port (host_ip:6006), not the container port.
      test:
        - CMD-SHELL
        - 'curl -f http://$host_ip:6006/health || exit 1'
      interval: 10s
      timeout: 10s
      retries: 60

  # Dataprep microservice; starts once Redis has been started and the
  # embedding server reports healthy.
  dataprep-redis:
    container_name: dataprep-redis-server
    image: opea/dataprep-redis:latest
    restart: unless-stopped
    ipc: host
    ports:
      - "5000:5000"
    depends_on:
      redis-vector-db:
        condition: service_started
      tei-embedding-service:
        condition: service_healthy
    environment:
      # Proxy settings forwarded from the invoking shell.
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      no_proxy: ${no_proxy}
      # Redis connection and index settings.
      REDIS_URL: ${REDIS_URL}
      REDIS_HOST: ${REDIS_HOST}
      REDIS_PORT: ${REDIS_PORT:-6379}
      INDEX_NAME: ${INDEX_NAME:-rag_redis}
      # Embedding endpoint and service options.
      TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT}
      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
      DATAPREP_TYPE: ${DATAPREP_TYPE:-redis}
      LOGFLAG: ${LOGFLAG:-true}

networks:
  default:
    driver: bridge
42 changes: 42 additions & 0 deletions comps/dataprep/src/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

FROM python:3.11-slim

ENV LANG=C.UTF-8

# Build-time switch: the default "cpu" installs the CPU-only PyTorch wheels
# below; any other value skips that step.
ARG ARCH="cpu"

# OS-level packages for the image. The apt package lists are removed in the
# same layer so they do not add to the final image size.
RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
    build-essential \
    default-jre \
    libgl1-mesa-glx \
    libjemalloc-dev \
    libreoffice \
    poppler-utils \
    tesseract-ocr \
    vim && \
    rm -rf /var/lib/apt/lists/*

# Unprivileged account that owns and runs the service.
RUN useradd -m -s /bin/bash user && \
    mkdir -p /home/user && \
    chown -R user /home/user/

USER user

COPY comps /home/user/comps

# ${ARCH} is quoted so that an empty build arg yields a false comparison
# instead of a `[` syntax error that would abort the build.
RUN pip install --no-cache-dir --upgrade pip setuptools && \
    if [ "${ARCH}" = "cpu" ]; then pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cpu; fi && \
    pip install --no-cache-dir -r /home/user/comps/dataprep/src/requirements.txt

ENV PYTHONPATH=$PYTHONPATH:/home/user

# COPY above creates files owned by root, so the upload directory is created
# as root and then handed over to the unprivileged user.
USER root

RUN mkdir -p /home/user/comps/dataprep/src/uploaded_files && chown -R user /home/user/comps/dataprep/src/uploaded_files

USER user

WORKDIR /home/user/comps/dataprep/src

ENTRYPOINT ["python", "opea_dataprep_microservice.py"]
2 changes: 2 additions & 0 deletions comps/dataprep/src/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
2 changes: 2 additions & 0 deletions comps/dataprep/src/integrations/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
91 changes: 91 additions & 0 deletions comps/dataprep/src/integrations/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import os


#######################################################
# Common Functions #
#######################################################
def get_boolean_env_var(var_name, default_value=False):
    """Retrieve the boolean value of an environment variable.

    Matching is case-insensitive and ignores surrounding whitespace.
    Accepted true spellings are "true", "1", "t", "y", "yes"; accepted false
    spellings are "false", "0", "f", "n", "no".

    Args:
        var_name (str): The name of the environment variable to retrieve.
        default_value (bool): The value to return if the variable is unset,
            empty, or not one of the recognised spellings.

    Returns:
        bool: The value of the environment variable, interpreted as a boolean.
    """
    true_values = {"true", "1", "t", "y", "yes"}
    false_values = {"false", "0", "f", "n", "no"}

    # Strip before matching: a value such as "true " (trailing space or
    # newline) would otherwise silently fall through to the default.
    value = os.getenv(var_name, "").strip().lower()

    if value in true_values:
        return True
    if value in false_values:
        return False
    return default_value


# Name of the embedding model (overridable through EMBED_MODEL)
EMBED_MODEL = os.environ.get("EMBED_MODEL", "BAAI/bge-base-en-v1.5")
# URL of the TEI embedding endpoint; empty string when not configured
TEI_EMBEDDING_ENDPOINT = os.environ.get("TEI_EMBEDDING_ENDPOINT", "")

# Vector index settings; the numeric values are parsed to int at import time
INDEX_NAME = os.environ.get("INDEX_NAME", "rag_redis")
KEY_INDEX_NAME = os.environ.get("KEY_INDEX_NAME", "file-keys")
TIMEOUT_SECONDS = int(os.environ.get("TIMEOUT_SECONDS", "600"))
SEARCH_BATCH_SIZE = int(os.environ.get("SEARCH_BATCH_SIZE", "10"))


#######################################################
# Redis #
#######################################################
# Redis host and port, read from the environment with local defaults;
# the port is parsed to int at import time
REDIS_HOST = os.environ.get("REDIS_HOST", "localhost")
REDIS_PORT = int(os.environ.get("REDIS_PORT", "6379"))


def format_redis_conn_from_env():
    """Build the Redis connection URL from environment variables.

    A non-empty ``REDIS_URL`` takes precedence and is returned unchanged.
    Otherwise the URL is assembled from ``REDIS_SSL`` (selects the
    ``rediss://`` scheme), ``REDIS_USERNAME`` / ``REDIS_PASSWORD`` (added only
    when a password is set) and the module-level ``REDIS_HOST`` /
    ``REDIS_PORT``.

    Returns:
        str: The Redis connection URL.
    """
    # Imported locally so the block does not depend on a file-level import.
    from urllib.parse import quote

    redis_url = os.getenv("REDIS_URL", None)
    if redis_url:
        return redis_url

    using_ssl = get_boolean_env_var("REDIS_SSL", False)
    start = "rediss://" if using_ssl else "redis://"

    # if using RBAC
    password = os.getenv("REDIS_PASSWORD", None)
    username = os.getenv("REDIS_USERNAME", "default")
    if password is not None:
        # Percent-encode the credentials: reserved characters such as
        # "/", "?", "#" or "%" in a raw password would otherwise corrupt
        # the URL or be decoded to a different value by URL parsers.
        start += f"{quote(username, safe='')}:{quote(password, safe='')}@"

    return start + f"{REDIS_HOST}:{REDIS_PORT}"


# Resolved once, when this module is imported; later changes to the
# environment are not reflected in this value.
REDIS_URL = format_redis_conn_from_env()


#######################################################
# Milvus #
#######################################################
# Local Embedding model
LOCAL_EMBEDDING_MODEL = os.getenv("LOCAL_EMBEDDING_MODEL", "maidalun1020/bce-embedding-base_v1")
# MOSEC configuration
MOSEC_EMBEDDING_MODEL = os.environ.get("MOSEC_EMBEDDING_MODEL", "/home/user/bge-large-zh-v1.5")
MOSEC_EMBEDDING_ENDPOINT = os.environ.get("MOSEC_EMBEDDING_ENDPOINT", "")
# Redirect the OpenAI environment variables only when a MOSEC endpoint is
# configured. Doing it unconditionally would set OPENAI_API_BASE to "" and
# overwrite any OPENAI_API_KEY already present in the process every time this
# module is imported, even when MOSEC is not in use.
if MOSEC_EMBEDDING_ENDPOINT:
    os.environ["OPENAI_API_BASE"] = MOSEC_EMBEDDING_ENDPOINT
    os.environ["OPENAI_API_KEY"] = "Dummy key"
# MILVUS configuration
MILVUS_HOST = os.getenv("MILVUS_HOST", "localhost")
MILVUS_PORT = int(os.getenv("MILVUS_PORT", 19530))
# HTTP URI derived from the host and port above
MILVUS_URI = f"http://{MILVUS_HOST}:{MILVUS_PORT}"
# Index settings: FLAT index with inner-product ("IP") metric, no extra params
INDEX_PARAMS = {"index_type": "FLAT", "metric_type": "IP", "params": {}}
COLLECTION_NAME = os.getenv("COLLECTION_NAME", "rag_milvus")
Loading
Loading