Skip to content

Commit

Permalink
refactoring: reimplement Docker strategy (#3162)
Browse files Browse the repository at this point in the history
* setup base images

* add cpu flavor

* use the same Dockerfile for cpu and gpu

* better naming, add docs

* add docker workflow

* add missing image input

* change cwd for bake

* also push api images

* try conditional tagging for releases

* revert testing code

* update docker readme

* document variable override

* use Python 3.10

* allow empty HAYSTACK_EXTRAS

* Apply suggestions from code review

Co-authored-by: Sara Zan <[email protected]>

* remove repo description step, can't make it work so far

* add docs to the last step as it's tricky

* manage tags for the newest images

* tests are passing, checking in the last bit

Co-authored-by: Sara Zan <[email protected]>
  • Loading branch information
masci and ZanSara authored Sep 12, 2022
1 parent 21aedc6 commit 64b0c43
Show file tree
Hide file tree
Showing 5 changed files with 287 additions and 0 deletions.
93 changes: 93 additions & 0 deletions .github/workflows/docker_release.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# Builds the Haystack Docker images with `docker buildx bake` (definitions live
# in docker/docker-bake.hcl) and pushes them to Docker Hub.
# Triggers: manual dispatch, pushes to `main`, and pushes of `v*` tags.
name: Release Docker images

on:
  workflow_dispatch:
  push:
    branches:
      - main
    tags:
      - 'v*'

env:
  DOCKER_REPO_NAME: deepset/haystack

jobs:
  build-and-push:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout
        uses: actions/checkout@v3

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2

      - name: Login to DockerHub
        uses: docker/login-action@v1
        with:
          username: ${{ secrets.DOCKER_HUB_USER }}
          password: ${{ secrets.DOCKER_HUB_TOKEN }}

      # Only the `version` output of this step is consumed below, as the
      # suffix of the image tags.
      - name: Docker meta
        id: meta
        uses: docker/metadata-action@v4
        with:
          # `with:` inputs are not processed by a shell, so a bare
          # `$DOCKER_REPO_NAME` would be passed literally; read the value
          # from the `env` context instead.
          images: ${{ env.DOCKER_REPO_NAME }}

      - name: Build base images
        uses: docker/bake-action@v2
        env:
          IMAGE_TAG_SUFFIX: ${{ steps.meta.outputs.version }}
        with:
          workdir: docker
          targets: base
          push: true

      # The api images are built FROM the base images pushed by the previous
      # step, hence the matching BASE_IMAGE_TAG_SUFFIX.
      - name: Build api images
        uses: docker/bake-action@v2
        env:
          IMAGE_TAG_SUFFIX: ${{ steps.meta.outputs.version }}
          BASE_IMAGE_TAG_SUFFIX: ${{ steps.meta.outputs.version }}
        with:
          workdir: docker
          targets: api
          push: true

      - name: Get latest version of Haystack
        id: latest-version
        uses: pozetroninc/github-action-get-latest-release@master
        if: startsWith(github.ref, 'refs/tags/')
        with:
          repository: ${{ github.repository }}
          excludes: prerelease, draft

      - name: Compare current version with latest
        uses: madhead/semver-utils@latest
        id: version
        if: startsWith(github.ref, 'refs/tags/')
        with:
          # Version being built
          version: ${{ github.ref_name }}
          # Compare to latest
          compare-to: ${{ steps.latest-version.outputs.release }}

      - name: Use latest
        if: steps.version.outputs.comparison-result == '>'
        # The values are handed over as environment variables and quoted:
        # this step only runs when the result is `>`, and pasting that
        # character unquoted into the script yields `echo >;`, which the
        # shell rejects as a syntax error.
        env:
          COMPARISON_RESULT: ${{ steps.version.outputs.comparison-result }}
          LATEST_RELEASE: ${{ steps.latest-version.outputs.release }}
        run: |
          echo "$COMPARISON_RESULT"
          echo "$LATEST_RELEASE"

      # This step should only run when we release a new minor, so
      # that we can tag the most recent image without the version number.
      # For example, if the previous step builds `deepset/haystack:cpu-1.8.0`,
      # this builds `deepset/haystack:cpu`
      - name: Build api images no version in tag
        uses: docker/bake-action@v2
        if: steps.version.outputs.comparison-result == '>'
        env:
          IMAGE_TAG_SUFFIX: ${{ steps.meta.outputs.version }}
          BASE_IMAGE_TAG_SUFFIX: ${{ steps.meta.outputs.version }}
        with:
          workdir: docker
          targets: api-latest
          push: true
13 changes: 13 additions & 0 deletions docker/Dockerfile.api
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Tag of the `deepset/haystack` base image this api image is layered on.
# It has no default here; docker-bake.hcl supplies it for the cpu and gpu targets.
ARG base_image_tag

FROM deepset/haystack:$base_image_tag

# The /file-upload API endpoint needs a directory it is allowed to write to
RUN mkdir -p /opt/file-upload \
    && chmod 777 /opt/file-upload

# Point rest_api at that upload directory
ENV FILE_UPLOAD_PATH=/opt/file-upload

EXPOSE 8000

# Serve the rest_api application through gunicorn, using a single uvicorn
# worker and a 180 second worker timeout
CMD ["gunicorn", "rest_api.application:app", \
     "-b", "0.0.0.0", \
     "-k", "uvicorn.workers.UvicornWorker", \
     "--workers", "1", \
     "--timeout", "180"]
38 changes: 38 additions & 0 deletions docker/Dockerfile.base
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Two-stage build: Haystack is installed into a virtualenv inside a throw-away
# build stage, then the virtualenv is copied into the final image.
#
# build_image - image used for the build stage
# base_immage - image used for the final stage
# NOTE(review): `base_immage` is a misspelling of `base_image`, but
# docker-bake.hcl passes the argument under this exact name, so it is kept.
ARG build_image
ARG base_immage

FROM $build_image AS build-image

# haystack_version - git branch or tag of the Haystack repo to install
# haystack_extras  - pip extras appended to the install target, e.g. "[ocr]"
# torch_scatter    - URL passed to `pip install -f` when installing torch-scatter
ARG haystack_version
ARG haystack_extras
ARG torch_scatter

RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        build-essential gcc git curl \
        tesseract-ocr libtesseract-dev poppler-utils && \
    rm -rf /var/lib/apt/lists/*

# Install PDF converter.
# --fail makes curl exit non-zero on an HTTP error instead of saving the error
# page under the tarball's name; the tarball is removed once unpacked.
RUN curl --fail --location -O https://dl.xpdfreader.com/xpdf-tools-linux-4.04.tar.gz && \
    tar -xvf xpdf-tools-linux-4.04.tar.gz && \
    cp xpdf-tools-linux-4.04/bin64/pdftotext /opt && \
    rm -rf xpdf-tools-linux-4.04 xpdf-tools-linux-4.04.tar.gz

# Shallow clone Haystack repo, we'll install from the local sources
RUN git clone --depth=1 --branch="${haystack_version}" https://github.com/deepset-ai/haystack.git /opt/haystack
WORKDIR /opt/haystack

# Use a virtualenv we can copy over the next build stage
RUN python -m venv --system-site-packages /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

# The install target is quoted because the extras look like "[a,b]", which an
# unquoted shell word would treat as a glob pattern.
RUN pip install --upgrade pip && \
    pip install --no-cache-dir ".${haystack_extras}" && \
    pip install --no-cache-dir ./rest_api && \
    pip install --no-cache-dir torch-scatter -f "$torch_scatter"

FROM $base_immage AS final

# NOTE(review): only the virtualenv and the pdftotext binary are carried over;
# the apt packages installed in the build stage (tesseract-ocr, poppler-utils,
# ...) are not present in this stage - confirm none are needed at runtime.
COPY --from=build-image /opt/venv /opt/venv
COPY --from=build-image /opt/pdftotext /usr/local/bin

ENV PATH="/opt/venv/bin:$PATH"
49 changes: 49 additions & 0 deletions docker/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Haystack Docker image

Haystack is an end-to-end framework that enables you to build powerful and production-ready
pipelines for different search use cases. The Docker image comes with a web service
configured to serve Haystack's `rest_api` to ease pipelines' deployments in containerized
environments.

Start the Docker container binding the TCP port `8000` locally:
```sh
docker run -p 8000:8000 deepset/haystack
```

If you need the container to access other services available in the host:
```sh
docker run -p 8000:8000 --network="host" deepset/haystack
```

## Image variants

The Docker image comes in two variants:
- `haystack:cpu-<version>`: this image is smaller but doesn't support GPU
- `haystack:gpu-<version>`: this image comes with the CUDA runtime and is capable of running on GPUs


## Image development

Images are built with BuildKit and we use `bake` to orchestrate the process.
You can build a specific image by simply running:
```sh
docker buildx bake gpu
```

You can override any `variable` defined in the `docker-bake.hcl` file and build custom
images, for example if you want to use a branch from the Haystack repo:
```sh
HAYSTACK_VERSION=mybranch_or_tag BASE_IMAGE_TAG_SUFFIX=latest docker buildx bake gpu --no-cache
```

# License

View [license information](https://github.com/deepset-ai/haystack/blob/main/LICENSE) for
the software contained in this image.

As with all Docker images, these likely also contain other software which may be under
other licenses (such as Bash, etc. from the base distribution, along with any direct or
indirect dependencies of the primary software being contained).

As for any pre-built image usage, it is the image user's responsibility to ensure that any
use of this image complies with any relevant licenses for all software contained within.
94 changes: 94 additions & 0 deletions docker/docker-bake.hcl
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# Branch or tag of the Haystack repository the base images install from.
variable "HAYSTACK_VERSION" {
  default = "main"
}

# NOTE(review): not referenced by any target in this file - confirm it is still needed.
variable "GITHUB_REF" {
  default = ""
}

# Repository part of every tag produced by the targets below.
variable "IMAGE_NAME" {
  default = "deepset/haystack"
}

# Suffix appended to the tags of the images being built.
variable "IMAGE_TAG_SUFFIX" {
  default = "local"
}

# Suffix of the base image tag that the api images are built FROM.
variable "BASE_IMAGE_TAG_SUFFIX" {
  default = "local"
}

# Pip extras to install; when left empty each base target falls back to its own list.
variable "HAYSTACK_EXTRAS" {
  default = ""
}

# Base images (CPU and GPU) that the api images build on.
group "base" {
  targets = [
    "base",
    "base-gpu",
  ]
}

# Versioned api images.
group "api" {
  targets = [
    "cpu",
    "gpu",
  ]
}

# Api images tagged without a version suffix.
group "api-latest" {
  targets = [
    "cpu-latest",
    "gpu-latest",
  ]
}

# Base images plus versioned api images; the "-latest" targets are not included.
group "all" {
  targets = [
    "base",
    "base-gpu",
    "cpu",
    "gpu",
  ]
}

# Empty target; nothing in this file inherits from it.
# NOTE(review): presumably a hook for the bake file generated by
# docker/metadata-action - confirm whether it is still required.
target "docker-metadata-action" {}

# CPU base image: build stage and final stage both use python:3.10-slim.
target "base" {
  dockerfile = "Dockerfile.base"
  tags       = ["${IMAGE_NAME}:base-${IMAGE_TAG_SUFFIX}"]
  args = {
    build_image      = "python:3.10-slim"
    base_immage      = "python:3.10-slim"
    haystack_version = HAYSTACK_VERSION
    # Use the caller's extras when provided, otherwise the default CPU set
    haystack_extras  = notequal(HAYSTACK_EXTRAS, "") ? HAYSTACK_EXTRAS : "[docstores,crawler,preprocessing,ocr,onnx,beir]"
    torch_scatter    = "https://data.pyg.org/whl/torch-1.12.0+cpu.html"
  }
}

# GPU base image: same Dockerfile, built on the pytorch CUDA runtime image.
target "base-gpu" {
  dockerfile = "Dockerfile.base"
  tags       = ["${IMAGE_NAME}:base-gpu-${IMAGE_TAG_SUFFIX}"]
  args = {
    build_image      = "pytorch/pytorch:1.12.1-cuda11.3-cudnn8-runtime"
    base_immage      = "pytorch/pytorch:1.12.1-cuda11.3-cudnn8-runtime"
    haystack_version = HAYSTACK_VERSION
    # Use the caller's extras when provided, otherwise the default GPU set
    haystack_extras  = notequal(HAYSTACK_EXTRAS, "") ? HAYSTACK_EXTRAS : "[docstores-gpu,crawler,preprocessing,ocr,onnx-gpu,beir]"
    torch_scatter    = "https://data.pyg.org/whl/torch-1.12.1%2Bcu113.html"
  }
}

# Api image layered on the CPU base image.
target "cpu" {
  dockerfile = "Dockerfile.api"
  tags       = ["${IMAGE_NAME}:cpu-${IMAGE_TAG_SUFFIX}"]
  args = {
    base_image_tag = "base-${BASE_IMAGE_TAG_SUFFIX}"
  }
}

# Same build as "cpu", published under the bare `cpu` tag.
target "cpu-latest" {
  inherits = ["cpu"]
  tags     = ["${IMAGE_NAME}:cpu"]
}

# Api image layered on the GPU base image; built for linux/amd64 only.
target "gpu" {
  dockerfile = "Dockerfile.api"
  tags       = ["${IMAGE_NAME}:gpu-${IMAGE_TAG_SUFFIX}"]
  args = {
    base_image_tag = "base-gpu-${BASE_IMAGE_TAG_SUFFIX}"
  }
  platforms = ["linux/amd64"]
}

# Same build as "gpu", published under the bare `gpu` tag.
target "gpu-latest" {
  inherits = ["gpu"]
  tags     = ["${IMAGE_NAME}:gpu"]
}

0 comments on commit 64b0c43

Please sign in to comment.