diff --git a/Dockerfile b/Dockerfile index 682336f58a9..0dc0adae908 100644 --- a/Dockerfile +++ b/Dockerfile @@ -104,6 +104,11 @@ RUN --mount=type=bind,source=huggingface.co,target=/huggingface.co \ # Copy nltk data downloaded via download_deps.py COPY nltk_data /root/nltk_data +# https://github.com/chrismattmann/tika-python +# This is the only way to run python-tika without internet access. Without this set, the default is to check the tika version and pull latest every time from Apache. TIKA_SERVER_JAR must name the exact jar copied below. +COPY tika-server-standard-3.0.0.jar tika-server-standard-3.0.0.jar.md5 / +ENV TIKA_SERVER_JAR="file:////tika-server-standard-3.0.0.jar" + # Copy compiled web pages COPY --from=builder /ragflow/web/dist /ragflow/web/dist diff --git a/Dockerfile.slim b/Dockerfile.slim index e2e35f326e9..96bd3d8bd75 100644 --- a/Dockerfile.slim +++ b/Dockerfile.slim @@ -97,6 +97,11 @@ RUN --mount=type=bind,source=huggingface.co,target=/huggingface.co \ # Copy nltk data downloaded via download_deps.py COPY nltk_data /root/nltk_data +# https://github.com/chrismattmann/tika-python +# This is the only way to run python-tika without internet access. Without this set, the default is to check the tika version and pull latest every time from Apache. TIKA_SERVER_JAR must name the exact jar copied below. +COPY tika-server-standard-3.0.0.jar tika-server-standard-3.0.0.jar.md5 / +ENV TIKA_SERVER_JAR="file:////tika-server-standard-3.0.0.jar" + # Copy compiled web pages COPY --from=builder /ragflow/web/dist /ragflow/web/dist diff --git a/download_deps.py b/download_deps.py index ab8fafb2f93..58399293d95 100644 --- a/download_deps.py +++ b/download_deps.py @@ -7,6 +7,8 @@ urls = [ "http://archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2_amd64.deb", + "https://repo1.maven.org/maven2/org/apache/tika/tika-server-standard/3.0.0/tika-server-standard-3.0.0.jar", + "https://repo1.maven.org/maven2/org/apache/tika/tika-server-standard/3.0.0/tika-server-standard-3.0.0.jar.md5", ] repos = [