From c4cfd8a0e95f21dc3cadcfaf57f97ebbbc4f12e4 Mon Sep 17 00:00:00 2001
From: Zhichang Yu
Date: Sat, 28 Sep 2024 13:11:38 +0800
Subject: [PATCH] Doc for ragflow:dev

---
 Dockerfile                                | 113 +++++++++++++++++++---
 docs/guides/develop/build_docker_image.md |  51 ++++++++--
 download_deps.py                          |  26 +++++
 download_deps.sh                          |  38 --------
 4 files changed, 168 insertions(+), 60 deletions(-)
 create mode 100644 download_deps.py
 delete mode 100644 download_deps.sh

diff --git a/Dockerfile b/Dockerfile
index 76eb0d8d68e..8842a89b315 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,23 +1,108 @@
-FROM infiniflow/ragflow-base:v2.0
-USER root
+# base stage
+FROM ubuntu:24.04 AS base
+USER root
+
+ENV LIGHTEN=0
 
 WORKDIR /ragflow
 
-ADD ./web ./web
-RUN cd ./web && npm i --force && npm run build
+RUN rm -f /etc/apt/apt.conf.d/docker-clean \
+    && echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache
+
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
+    apt update && apt-get --no-install-recommends install -y ca-certificates
+
+# If you are located in China, you can use the Tsinghua mirror to speed up apt
+RUN sed -i 's|http://archive.ubuntu.com|https://mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list.d/ubuntu.sources
+
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
+    apt update && apt install -y curl libpython3-dev nginx libglib2.0-0 libglx-mesa0 \
+    && rm -rf /var/lib/apt/lists/* \
+    && curl -sSL https://install.python-poetry.org | python3 -
+
+ENV PYTHONDONTWRITEBYTECODE=1 LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu/openmpi/lib:$LD_LIBRARY_PATH
+
+# Configure Poetry
+ENV POETRY_NO_INTERACTION=1
+ENV POETRY_VIRTUALENVS_IN_PROJECT=true
+ENV POETRY_VIRTUALENVS_CREATE=true
+ENV POETRY_REQUESTS_TIMEOUT=15
+
+# builder stage
+FROM base AS builder
+USER root
+
+WORKDIR /ragflow
+
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
+    apt update && apt install -y nodejs npm cargo && \
+    rm -rf /var/lib/apt/lists/*
+
+COPY web web
+RUN cd web && npm i --force && npm run build
+
+# Install dependencies from the poetry.lock file
+COPY pyproject.toml poetry.toml poetry.lock ./
+
+RUN --mount=type=cache,target=/root/.cache/pypoetry,sharing=locked \
+    if [ "$LIGHTEN" -eq 0 ]; then \
+        /root/.local/bin/poetry install --sync --no-cache --no-root --with=full; \
+    else \
+        /root/.local/bin/poetry install --sync --no-cache --no-root; \
+    fi
+
+# production stage
+FROM base AS production
+USER root
+
+WORKDIR /ragflow
+
+# Install Python packages' dependencies
+# cv2 requires libGL.so.1
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
+    apt update && apt install -y --no-install-recommends nginx libgl1 vim less && \
+    rm -rf /var/lib/apt/lists/*
+
+COPY web web
+COPY api api
+COPY conf conf
+COPY deepdoc deepdoc
+COPY rag rag
+COPY agent agent
+COPY graphrag graphrag
+COPY pyproject.toml poetry.toml poetry.lock ./
+
+# Copy models downloaded via download_deps.py
+RUN mkdir -p /ragflow/rag/res/deepdoc /root/.ragflow
+RUN --mount=type=bind,source=huggingface.io,target=/huggingface.io \
+    tar --exclude='.*' -cf - \
+        /huggingface.io/InfiniFlow/text_concat_xgb_v1.0 \
+        /huggingface.io/InfiniFlow/deepdoc \
+        | tar -xf - --strip-components=3 -C /ragflow/rag/res/deepdoc
+RUN --mount=type=bind,source=huggingface.io,target=/huggingface.io \
+    tar -cf - \
+        /huggingface.io/BAAI/bge-large-zh-v1.5 \
+        /huggingface.io/BAAI/bge-reranker-v2-m3 \
+        /huggingface.io/jinaai/jina-embeddings-v3 \
+        /huggingface.io/jinaai/jina-reranker-v2-base-multilingual \
+        /huggingface.io/maidalun1020/bce-embedding-base_v1 \
+        /huggingface.io/maidalun1020/bce-reranker-base_v1 \
+        | tar -xf - --strip-components=2 -C /root/.ragflow
+
+# Copy compiled web pages
+COPY --from=builder /ragflow/web/dist /ragflow/web/dist
+
+# Copy Python environment and packages
+ENV VIRTUAL_ENV=/ragflow/.venv
+COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV}
+ENV PATH="${VIRTUAL_ENV}/bin:/root/.local/bin:${PATH}"
 
-ADD ./api ./api
-ADD ./conf ./conf
-ADD ./deepdoc ./deepdoc
-ADD ./rag ./rag
-ADD ./agent ./agent
-ADD ./graphrag ./graphrag
+# Download nltk data
+RUN python3 -m nltk.downloader wordnet punkt punkt_tab
 
 ENV PYTHONPATH=/ragflow/
-ENV HF_ENDPOINT=https://hf-mirror.com
 
-ADD docker/entrypoint.sh ./entrypoint.sh
-ADD docker/.env ./
+COPY docker/entrypoint.sh ./entrypoint.sh
 RUN chmod +x ./entrypoint.sh
 
-ENTRYPOINT ["./entrypoint.sh"]
\ No newline at end of file
+ENTRYPOINT ["./entrypoint.sh"]
diff --git a/docs/guides/develop/build_docker_image.md b/docs/guides/develop/build_docker_image.md
index 2733acfefbd..1a20272d679 100644
--- a/docs/guides/develop/build_docker_image.md
+++ b/docs/guides/develop/build_docker_image.md
@@ -36,17 +36,52 @@ cd ragflow
 
 ### Build the Docker Image
 
-Navigate to the `ragflow` directory where the Dockerfile and other necessary files are located. Now you can build the Docker image using the provided Dockerfile. The command below specifies which Dockerfile to use and tages the image with a name for reference purpose.
+Navigate to the `ragflow` directory where the Dockerfile and other necessary files are located. Now you can build the Docker image using the provided Dockerfile. The command below specifies which Dockerfile to use and tags the image with a name for reference purposes.
 
-#### Build image `ragflow:dev-slim`
+#### Build and push multi-arch image `ragflow:dev-slim`
+
+On a `linux/amd64` host:
+```bash
+docker build -f Dockerfile.slim -t infiniflow/ragflow:dev-slim-amd64 .
+docker push infiniflow/ragflow:dev-slim-amd64
+```
+
+On a `linux/arm64` host:
+```bash
+docker build -f Dockerfile.slim -t infiniflow/ragflow:dev-slim-arm64 .
+docker push infiniflow/ragflow:dev-slim-arm64
+```
+
+On any Linux host:
+```bash
+docker manifest create infiniflow/ragflow:dev-slim --amend infiniflow/ragflow:dev-slim-amd64 --amend infiniflow/ragflow:dev-slim-arm64
+docker manifest push infiniflow/ragflow:dev-slim
+```
+
+This image's size is about 1 GB. It relies on external LLM services since it doesn't contain embedding models.
+
+#### Build and push multi-arch image `ragflow:dev`
+
+On a `linux/amd64` host:
 ```bash
-docker build -f Dockerfile.slim -t infiniflow/ragflow:dev-slim .
+pip3 install huggingface-hub
+python3 download_deps.py
+docker build -f Dockerfile -t infiniflow/ragflow:dev-amd64 .
+docker push infiniflow/ragflow:dev-amd64
 ```
-This image's size is about 1GB. It relies external LLM services since it doesn't contain embedding models.
 
-#### Build image `ragflow:dev`
+On a `linux/arm64` host:
 ```bash
-cd ragflow/
-docker build -f Dockerfile -t infiniflow/ragflow:dev .
+pip3 install huggingface-hub
+python3 download_deps.py
+docker build -f Dockerfile -t infiniflow/ragflow:dev-arm64 .
+docker push infiniflow/ragflow:dev-arm64
 ```
-This image's size is about 11GB. It contains embedding models, and can inference via local CPU/GPU or external LLM services.
+
+On any Linux host:
+```bash
+docker manifest create infiniflow/ragflow:dev --amend infiniflow/ragflow:dev-amd64 --amend infiniflow/ragflow:dev-arm64
+docker manifest push infiniflow/ragflow:dev
+```
+
+This image's size is about 11 GB. It contains embedding models and can run inference on a local CPU/GPU or use external LLM services.
diff --git a/download_deps.py b/download_deps.py
new file mode 100644
index 00000000000..a0e5e87a59f
--- /dev/null
+++ b/download_deps.py
@@ -0,0 +1,26 @@
+#!/usr/bin/env python3
+
+from huggingface_hub import snapshot_download
+import os
+
+repos = [
+    "InfiniFlow/text_concat_xgb_v1.0",
+    "InfiniFlow/deepdoc",
+    "BAAI/bge-large-zh-v1.5",
+    "BAAI/bge-reranker-v2-m3",
+    "jinaai/jina-embeddings-v3",
+    "jinaai/jina-reranker-v2-base-multilingual",
+    "maidalun1020/bce-embedding-base_v1",
+    "maidalun1020/bce-reranker-base_v1",
+]
+
+
+def download_model(repo_id):
+    local_dir = os.path.join("huggingface.io", repo_id)
+    os.makedirs(local_dir, exist_ok=True)
+    snapshot_download(repo_id=repo_id, local_dir=local_dir)
+
+
+if __name__ == "__main__":
+    for repo_id in repos:
+        download_model(repo_id)
diff --git a/download_deps.sh b/download_deps.sh
deleted file mode 100644
index e23ee8460f6..00000000000
--- a/download_deps.sh
+++ /dev/null
@@ -1,38 +0,0 @@
-#!/usr/bin/env bash
-
-download()
-{
-    echo "download $1"
-    # https://stackoverflow.com/questions/3162385/how-to-split-a-string-in-shell-and-get-the-last-field
-    fn=${1##*/}
-    if [ ! -f $fn ] ; then
-        wget --no-check-certificate $1
-    fi
-}
-
-# https://stackoverflow.com/questions/24628076/convert-multiline-string-to-array
-names="https://huggingface.co/InfiniFlow/deepdoc/resolve/main/det.onnx
-https://huggingface.co/InfiniFlow/deepdoc/resolve/main/layout.laws.onnx
-https://huggingface.co/InfiniFlow/deepdoc/resolve/main/layout.manual.onnx
-https://huggingface.co/InfiniFlow/deepdoc/resolve/main/layout.onnx
-https://huggingface.co/InfiniFlow/deepdoc/resolve/main/layout.paper.onnx
-https://huggingface.co/InfiniFlow/deepdoc/resolve/main/ocr.res
-https://huggingface.co/InfiniFlow/deepdoc/resolve/main/rec.onnx
-https://huggingface.co/InfiniFlow/deepdoc/resolve/main/tsr.onnx
-https://huggingface.co/InfiniFlow/text_concat_xgb_v1.0/resolve/main/updown_concat_xgb.model"
-
-SAVEIFS=$IFS   # Save current IFS (Internal Field Separator)
-IFS=$'\n'      # Change IFS to newline char
-names=($names) # split the `names` string into an array by the same name
-IFS=$SAVEIFS   # Restore original IFS
-
-find . -size 0 | xargs rm -f
-# https://stackoverflow.com/questions/15466808/shell-iterate-over-array
-for ((i=0; i<${#names[@]}; i+=1)); do
-    url="${names[$i]}"
-    download $url
-    if [ $? != 0 ]; then
-        exit -1
-    fi
-done
-find . -size 0 | xargs rm -f
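
A note on the model downloads: the Dockerfile bind-mounts `huggingface.io/` from the build context and expects one snapshot directory per repository listed in `download_deps.py`. The script below is a hypothetical pre-build check, not part of this patch; it assumes it is run from the repository root after `python3 download_deps.py` and simply mirrors the repository list from that script.

```python
#!/usr/bin/env python3
# Hypothetical helper (not part of this patch): verify that download_deps.py
# produced every model snapshot the Dockerfile's tar steps expect under
# huggingface.io/ before running `docker build`.

import os
import sys

# Same repository list as download_deps.py.
repos = [
    "InfiniFlow/text_concat_xgb_v1.0",
    "InfiniFlow/deepdoc",
    "BAAI/bge-large-zh-v1.5",
    "BAAI/bge-reranker-v2-m3",
    "jinaai/jina-embeddings-v3",
    "jinaai/jina-reranker-v2-base-multilingual",
    "maidalun1020/bce-embedding-base_v1",
    "maidalun1020/bce-reranker-base_v1",
]

missing = []
for repo_id in repos:
    local_dir = os.path.join("huggingface.io", repo_id)
    # An absent or empty directory means the snapshot was not downloaded.
    if not os.path.isdir(local_dir) or not os.listdir(local_dir):
        missing.append(repo_id)

if missing:
    print("Missing or empty model snapshots:", ", ".join(missing))
    sys.exit(1)

print("All model snapshots are present under huggingface.io/")
```

If anything is reported missing, re-running `python3 download_deps.py` should fetch it; `snapshot_download` generally skips files that are already present locally.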