-
Notifications
You must be signed in to change notification settings - Fork 21
/
Copy pathDockerfile
133 lines (110 loc) · 5.43 KB
/
Dockerfile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
ARG GOLANG_VERSION=1.23.4
FROM golang:${GOLANG_VERSION}-bullseye AS build
ARG TARGETOS TARGETARCH
RUN apt-get update && apt-get install -y \
build-essential \
libleptonica-dev \
libtesseract-dev \
libsoxr-dev \
&& rm -rf /var/lib/apt/lists/*
# Install FFmpeg Static Build
RUN FFMPEG_ARCH=$([ "$TARGETARCH" = "arm64" ] && echo "arm64" || echo "amd64") && \
wget https://johnvansickle.com/ffmpeg/releases/ffmpeg-release-${FFMPEG_ARCH}-static.tar.xz && \
tar xvf ffmpeg-release-${FFMPEG_ARCH}-static.tar.xz && \
mv ffmpeg-*-static/ffmpeg /usr/local/bin/ && \
mv ffmpeg-*-static/ffprobe /usr/local/bin/ && \
rm -rf ffmpeg-*
# Install ONNX Runtime (latest release)
ENV ONNXRUNTIME_ROOT_PATH=/usr/local/onnxruntime
RUN apt update && \
apt install -y wget jq && \
VERSION=v1.20.1 && \
ONNX_ARCH=$([ "$TARGETARCH" = "arm64" ] && echo "aarch64" || echo "x64") && \
wget https://github.com/microsoft/onnxruntime/releases/download/${VERSION}/onnxruntime-linux-${ONNX_ARCH}-${VERSION#v}.tgz && \
tar -xzf onnxruntime-linux-${ONNX_ARCH}-${VERSION#v}.tgz && \
mv onnxruntime-linux-${ONNX_ARCH}-${VERSION#v} ${ONNXRUNTIME_ROOT_PATH} && \
rm onnxruntime-linux-${ONNX_ARCH}-${VERSION#v}.tgz && \
apt remove -y wget jq && \
apt autoremove -y && \
rm -rf /var/lib/apt/lists/*
# Set environment variables and create symlinks for ONNX Runtime
ENV C_INCLUDE_PATH=${ONNXRUNTIME_ROOT_PATH}/include
ENV LD_RUN_PATH=${ONNXRUNTIME_ROOT_PATH}/lib
ENV LIBRARY_PATH=${ONNXRUNTIME_ROOT_PATH}/lib
WORKDIR /src
COPY go.mod go.sum ./
RUN go mod download
COPY . .
ARG SERVICE_NAME TARGETOS TARGETARCH
RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH go build -tags=ocr,onnx -o /${SERVICE_NAME} ./cmd/main
RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH go build -o /${SERVICE_NAME}-worker ./cmd/worker
RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH go build -o /${SERVICE_NAME}-migrate ./cmd/migration
RUN --mount=target=. --mount=type=cache,target=/root/.cache/go-build --mount=type=cache,target=/go/pkg GOOS=$TARGETOS GOARCH=$TARGETARCH go build -o /${SERVICE_NAME}-init ./cmd/init
FROM debian:bullseye-slim
# Install Python, create virtual environment, install pdfplumber and Docling
RUN apt update && \
apt install -y \
build-essential \
curl \
wget \
xz-utils \
python3 \
python3-venv \
python3-dev \
libgeos++-dev \
poppler-utils \
wv \
unrtf \
tidy \
tesseract-ocr \
libtesseract-dev \
libreoffice \
libsoxr-dev \
chromium \
qpdf && \
python3 -m venv /opt/venv && \
/opt/venv/bin/pip install pdfplumber mistral-common tokenizers docling==2.18.0 && \
rm -rf /var/lib/apt/lists/*
# Copy FFmpeg from build stage
COPY --from=build --chown=nobody:nogroup /usr/local/bin/ffmpeg /usr/local/bin/ffmpeg
COPY --from=build --chown=nobody:nogroup /usr/local/bin/ffprobe /usr/local/bin/ffprobe
# Copy ONNX runtime from build stage
ENV ONNXRUNTIME_ROOT_PATH=/usr/local/onnxruntime
COPY --from=build --chown=nobody:nogroup /usr/local/onnxruntime ${ONNXRUNTIME_ROOT_PATH}
# Set environment variables and create symlinks for ONNX Runtime
ENV C_INCLUDE_PATH=${ONNXRUNTIME_ROOT_PATH}/include
RUN ln -s ${ONNXRUNTIME_ROOT_PATH}/lib/libonnxruntime.so* /usr/lib/
# Docling will need a $HOME-prefixed path to write cache files. We'll also put
# the prefetched model artifacts there.
ENV BASE_DOCLING_PATH=/home/nobody
RUN mkdir -p ${BASE_DOCLING_PATH}/.EasyOCR/model && chown -R nobody:nogroup ${BASE_DOCLING_PATH}
RUN apt update && \
apt install -y wget unzip && \
wget https://github.com/JaidedAI/EasyOCR/releases/download/v1.3/latin_g2.zip && \
unzip latin_g2.zip -d ${BASE_DOCLING_PATH}/.EasyOCR/model/ && \
rm latin_g2.zip && \
wget https://github.com/JaidedAI/EasyOCR/releases/download/pre-v1.1.6/craft_mlt_25k.zip && \
unzip craft_mlt_25k.zip -d ${BASE_DOCLING_PATH}/.EasyOCR/model/ && \
rm craft_mlt_25k.zip && \
apt remove -y wget unzip && \
apt autoremove -y && \
rm -rf /var/lib/apt/lists/*
ENV DOCLING_ARTIFACTS_PATH=${BASE_DOCLING_PATH}/docling-artifacts
RUN echo "from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline" > import_artifacts.py
RUN echo "StandardPdfPipeline.download_models_hf(local_dir='${DOCLING_ARTIFACTS_PATH}')" >> import_artifacts.py
RUN /opt/venv/bin/python import_artifacts.py && rm import_artifacts.py
USER nobody:nogroup
ARG SERVICE_NAME
ENV HOME=${BASE_DOCLING_PATH}
WORKDIR /${SERVICE_NAME}
ENV GODEBUG=tlsrsakex=1
COPY --from=build --chown=nobody:nogroup /src/config ./config
COPY --from=build --chown=nobody:nogroup /src/release-please ./release-please
COPY --from=build --chown=nobody:nogroup /src/pkg/db/migration ./pkg/db/migration
COPY --from=build --chown=nobody:nogroup /${SERVICE_NAME}-migrate ./
COPY --from=build --chown=nobody:nogroup /${SERVICE_NAME}-init ./
COPY --from=build --chown=nobody:nogroup /${SERVICE_NAME}-worker ./
COPY --from=build --chown=nobody:nogroup /${SERVICE_NAME} ./
# Set up ONNX model and environment variable
COPY --chown=nobody:nogroup ./pkg/component/resources/onnx/silero_vad.onnx /${SERVICE_NAME}/pkg/component/resources/onnx/silero_vad.onnx
ENV ONNX_MODEL_FOLDER_PATH=/${SERVICE_NAME}/pkg/component/resources/onnx