forked from opea-project/GenAIComps
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Feature/elasticsearch vector store integration - Infosys (opea-projec…
…t#972) * Feature/elastic Elasticsearch vectorstore, dataprep and retriever --------- Co-authored-by: Adarsh <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Liang Lv <[email protected]>
- Loading branch information
1 parent
fbf3017
commit 5ed041b
Showing
21 changed files
with
1,229 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,6 @@ | ||
__pycache__ | ||
*.egg-info/ | ||
.DS_Store | ||
.idea/ | ||
.venv/ | ||
build/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
# Copyright (C) 2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
# Image for the Elasticsearch document-preparation (dataprep) microservice.
# Build from the repository root so that the `comps` directory is in the context.
FROM python:3.11-slim

ENV LANG=C.UTF-8

# Build-time switch: the default "cpu" installs the CPU-only PyTorch wheels below.
ARG ARCH="cpu"

# System packages. The apt package index is deleted in the same layer that
# created it so it does not persist in the image.
RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
    build-essential \
    default-jre \
    libgl1-mesa-glx \
    libjemalloc-dev && \
    rm -rf /var/lib/apt/lists/*

# Unprivileged account that the service runs as.
RUN useradd -m -s /bin/bash user && \
    mkdir -p /home/user && \
    chown -R user /home/user/

USER user

COPY comps /home/user/comps

# ${ARCH} is quoted so that an empty --build-arg ARCH= does not turn the test
# into a shell syntax error.
RUN pip install --no-cache-dir --upgrade pip setuptools && \
    if [ "${ARCH}" = "cpu" ]; then pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cpu; fi && \
    pip install --no-cache-dir -r /home/user/comps/dataprep/elasticsearch/langchain/requirements.txt

# Append /home/user to PYTHONPATH. The ":+" form only emits the existing value
# and separator when PYTHONPATH is already set, avoiding a leading empty entry.
ENV PYTHONPATH=${PYTHONPATH:+${PYTHONPATH}:}/home/user

# Creating the upload directory needs root; ownership is then handed to `user`
# so the service can write into it.
USER root

RUN mkdir -p /home/user/comps/dataprep/elasticsearch/langchain/uploaded_files && \
    chown -R user /home/user/comps/dataprep/elasticsearch/langchain/uploaded_files

USER user

WORKDIR /home/user/comps/dataprep/elasticsearch/langchain

ENTRYPOINT ["python", "prepare_doc_elasticsearch.py"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,130 @@ | ||
# Dataprep Microservice with Elasticsearch | ||
|
||
## 🚀1. Start Microservice with Python (Option 1)
|
||
### 1.1 Install Requirements | ||
|
||
```bash | ||
pip install -r requirements.txt | ||
``` | ||
|
||
### 1.2 Setup Environment Variables | ||
|
||
```bash | ||
export ES_CONNECTION_STRING=http://localhost:9200 | ||
export INDEX_NAME=${your_index_name} | ||
``` | ||
|
||
### 1.3 Start Elasticsearch | ||
|
||
Please refer to this [readme](../../../vectorstores/elasticsearch/README.md). | ||
|
||
### 1.4 Start Document Preparation Microservice for Elasticsearch with Python Script | ||
|
||
Start document preparation microservice for Elasticsearch with below command. | ||
|
||
```bash | ||
python prepare_doc_elasticsearch.py
``` | ||
|
||
## 🚀2. Start Microservice with Docker (Option 2) | ||
|
||
### 2.1 Start Elasticsearch | ||
|
||
Please refer to this [readme](../../../vectorstores/elasticsearch/README.md). | ||
|
||
### 2.2 Setup Environment Variables | ||
|
||
```bash | ||
export ES_CONNECTION_STRING=http://localhost:9200 | ||
export INDEX_NAME=${your_index_name} | ||
``` | ||
|
||
### 2.3 Build Docker Image | ||
|
||
```bash | ||
cd GenAIComps | ||
docker build -t opea/dataprep-elasticsearch:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/elasticsearch/langchain/Dockerfile . | ||
``` | ||
|
||
### 2.4 Run Docker with CLI (Option A) | ||
|
||
```bash | ||
docker run --name="dataprep-elasticsearch" -p 6011:6011 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e ES_CONNECTION_STRING=$ES_CONNECTION_STRING -e INDEX_NAME=$INDEX_NAME -e TEI_ENDPOINT=$TEI_ENDPOINT opea/dataprep-elasticsearch:latest
``` | ||
|
||
### 2.5 Run with Docker Compose (Option B) | ||
|
||
```bash | ||
cd comps/dataprep/elasticsearch/langchain | ||
docker compose -f docker-compose.yaml up -d
``` | ||
|
||
## 🚀3. Consume Microservice | ||
|
||
### 3.1 Consume Upload API | ||
|
||
Once the document preparation microservice for Elasticsearch is started, use the command below to invoke the
microservice, which converts the document to embeddings and saves them to the database.
|
||
```bash | ||
curl -X POST \ | ||
-H "Content-Type: application/json" \ | ||
-d '{"path":"/path/to/document"}' \ | ||
http://localhost:6011/v1/dataprep | ||
``` | ||
|
||
### 3.2 Consume get_file API | ||
|
||
To get uploaded file structures, use the following command: | ||
|
||
```bash | ||
curl -X POST \ | ||
-H "Content-Type: application/json" \ | ||
http://localhost:6011/v1/dataprep/get_file | ||
``` | ||
|
||
Then you will get the response JSON like this: | ||
|
||
```json | ||
[ | ||
{ | ||
"name": "uploaded_file_1.txt", | ||
"id": "uploaded_file_1.txt", | ||
"type": "File", | ||
"parent": "" | ||
}, | ||
{ | ||
"name": "uploaded_file_2.txt", | ||
"id": "uploaded_file_2.txt", | ||
"type": "File", | ||
"parent": "" | ||
} | ||
] | ||
``` | ||
|
||
### 3.3 Consume delete_file API
|
||
To delete uploaded file/link, use the following command. | ||
|
||
The `file_path` here should be the `id` obtained from the `/v1/dataprep/get_file` API.
|
||
```bash | ||
# delete link | ||
curl -X POST \ | ||
-H "Content-Type: application/json" \ | ||
-d '{"file_path": "https://www.ces.tech/.txt"}' \ | ||
http://localhost:6011/v1/dataprep/delete_file | ||
|
||
# delete file | ||
curl -X POST \ | ||
-H "Content-Type: application/json" \ | ||
-d '{"file_path": "uploaded_file_1.txt"}' \ | ||
http://localhost:6011/v1/dataprep/delete_file | ||
|
||
# delete all files and links | ||
curl -X POST \ | ||
-H "Content-Type: application/json" \ | ||
-d '{"file_path": "all"}' \ | ||
http://localhost:6011/v1/dataprep/delete_file | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# Copyright (C) 2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
# Copyright (C) 2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
import os | ||
|
||
# Runtime configuration for the Elasticsearch dataprep service.
# Every value is read from the environment once, at import time.

# Elasticsearch endpoint and the directory where uploaded files are stored.
ES_CONNECTION_STRING = os.getenv("ES_CONNECTION_STRING", "http://localhost:9200")
UPLOADED_FILES_PATH = os.getenv("UPLOADED_FILES_PATH", "./uploaded_files/")

# Embedding model
EMBED_MODEL = os.getenv("EMBED_MODEL", "BAAI/bge-base-en-v1.5")

# TEI Embedding endpoints
TEI_ENDPOINT = os.getenv("TEI_ENDPOINT", "")

# Vector Index Configuration
INDEX_NAME = os.getenv("INDEX_NAME", "rag-elastic")

# chunk parameters
# os.getenv returns a str whenever the variable is set, so convert explicitly;
# the values are then ints whether or not the environment overrides them.
# An unset or empty variable falls back to the default; a non-numeric value
# raises ValueError at import time.
CHUNK_SIZE = int(os.getenv("CHUNK_SIZE") or 1500)
CHUNK_OVERLAP = int(os.getenv("CHUNK_OVERLAP") or 100)

# Logging enabled
# Always a bool. Unset, empty, "0", "false", "no" and "off" (case-insensitive)
# disable logging; any other value enables it, as before.
LOG_FLAG = os.getenv("LOGFLAG", "").strip().lower() not in ("", "0", "false", "no", "off")
41 changes: 41 additions & 0 deletions
41
comps/dataprep/elasticsearch/langchain/docker-compose.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
# Copyright (C) 2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
# Compose stack: a single-node Elasticsearch instance plus the dataprep
# microservice that is published on port 6011.
version: "3"
services:
  elasticsearch-vector-db:
    hostname: db
    container_name: elasticsearch-vector-db
    image: docker.elastic.co/elasticsearch/elasticsearch:8.16.0
    ports:
      - "9200:9200"
      - "9300:9300"
    restart: always
    ipc: host
    environment:
      - ES_JAVA_OPTS=-Xms1g -Xmx1g
      - discovery.type=single-node
      - xpack.security.enabled=false
      - bootstrap.memory_lock=false
      # No space after "=": in list form everything after "=" is the value,
      # so a space would become part of the proxy setting.
      - no_proxy=${no_proxy}
      - http_proxy=${http_proxy}
      - https_proxy=${https_proxy}

  dataprep-elasticsearch:
    image: opea/dataprep-elasticsearch:latest
    container_name: dataprep-elasticsearch
    ports:
      - "6011:6011"
    ipc: host
    environment:
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      ES_CONNECTION_STRING: ${ES_CONNECTION_STRING}
      INDEX_NAME: ${INDEX_NAME}
      TEI_ENDPOINT: ${TEI_ENDPOINT}
      HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
    restart: unless-stopped

networks:
  default:
    driver: bridge
Oops, something went wrong.