Skip to content

Commit

Permalink
[SPARK-50477][INFRA] Add a separate docker file for python 3.9 daily …
Browse files Browse the repository at this point in the history
…build

### What changes were proposed in this pull request?
Add a separate docker file for python 3.9 daily build

### Why are the changes needed?
to isolate the environments

### Does this PR introduce _any_ user-facing change?
no, infra-only

### How was this patch tested?
CI; the second and the fourth commits tested this PR against the new image

https://github.com/zhengruifeng/spark/actions/runs/12135050296/job/33835846375

https://github.com/zhengruifeng/spark/actions/runs/12140138335/job/33850700922

### Was this patch authored or co-authored using generative AI tooling?
no

Closes #49042 from zhengruifeng/infra_py_images.

Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
  • Loading branch information
zhengruifeng committed Dec 4, 2024
1 parent 5fc6b71 commit 45da6f6
Show file tree
Hide file tree
Showing 4 changed files with 132 additions and 1 deletion.
36 changes: 35 additions & 1 deletion .github/workflows/build_and_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ jobs:
runs-on: ubuntu-latest
env:
GITHUB_PREV_SHA: ${{ github.event.before }}
PYSPARK_IMAGE_TO_TEST: ''
outputs:
required: ${{ steps.set-outputs.outputs.required }}
image_url: ${{ steps.infra-image-outputs.outputs.image_url }}
Expand All @@ -64,6 +65,8 @@ jobs:
image_lint_url_link: ${{ steps.infra-image-link.outputs.image_lint_url_link }}
image_sparkr_url: ${{ steps.infra-image-sparkr-outputs.outputs.image_sparkr_url }}
image_sparkr_url_link: ${{ steps.infra-image-link.outputs.image_sparkr_url_link }}
image_pyspark_url: ${{ steps.infra-image-pyspark-outputs.outputs.image_pyspark_url }}
image_pyspark_url_link: ${{ steps.infra-image-link.outputs.image_pyspark_url_link }}
steps:
- name: Checkout Spark repository
uses: actions/checkout@v4
Expand Down Expand Up @@ -164,19 +167,36 @@ jobs:
IMG_NAME="apache-spark-ci-image-sparkr:${{ inputs.branch }}-${{ github.run_id }}"
IMG_URL="ghcr.io/$REPO_OWNER/$IMG_NAME"
echo "image_sparkr_url=$IMG_URL" >> $GITHUB_OUTPUT
- name: Generate infra image URL (PySpark ${{ env.PYSPARK_IMAGE_TO_TEST }})
id: infra-image-pyspark-outputs
if: ${{ env.PYSPARK_IMAGE_TO_TEST }}
env: ${{ fromJSON(inputs.envs) }}
run: |
# Convert to lowercase to meet Docker repo name requirement
REPO_OWNER=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]')
IMG_NAME="apache-spark-ci-image-pyspark-${{ env.PYSPARK_IMAGE_TO_TEST }}:${{ inputs.branch }}-${{ github.run_id }}"
IMG_URL="ghcr.io/$REPO_OWNER/$IMG_NAME"
echo "image_pyspark_url=$IMG_URL" >> $GITHUB_OUTPUT
- name: Link the docker images
id: infra-image-link
env: ${{ fromJSON(inputs.envs) }}
run: |
# Set the image URL for job "docs"
# Should delete the link and directly use image_docs_url after SPARK 3.x EOL
if [[ "${{ inputs.branch }}" == 'branch-3.5' ]]; then
echo "image_docs_url_link=${{ steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT
echo "image_lint_url_link=${{ steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT
echo "image_sparkr_url_link=${{ steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT
echo "image_pyspark_url_link=${{ steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT
else
echo "image_docs_url_link=${{ steps.infra-image-docs-outputs.outputs.image_docs_url }}" >> $GITHUB_OUTPUT
echo "image_lint_url_link=${{ steps.infra-image-lint-outputs.outputs.image_lint_url }}" >> $GITHUB_OUTPUT
echo "image_sparkr_url_link=${{ steps.infra-image-sparkr-outputs.outputs.image_sparkr_url }}" >> $GITHUB_OUTPUT
if [[ "${{ env.PYSPARK_IMAGE_TO_TEST }}" != "" ]]; then
echo "image_pyspark_url_link=${{ steps.infra-image-pyspark-outputs.outputs.image_pyspark_url }}" >> $GITHUB_OUTPUT
else
echo "image_pyspark_url_link=${{ steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT
fi
fi
# Build: build Spark and run the tests for specified modules.
Expand Down Expand Up @@ -360,6 +380,8 @@ jobs:
runs-on: ubuntu-latest
permissions:
packages: write
env:
PYSPARK_IMAGE_TO_TEST: ''
steps:
- name: Login to GitHub Container Registry
uses: docker/login-action@v3
Expand Down Expand Up @@ -428,6 +450,18 @@ jobs:
${{ needs.precondition.outputs.image_sparkr_url }}
# Use the infra image cache to speed up
cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-sparkr-cache:${{ inputs.branch }}
- name: Build and push (PySpark ${{ env.PYSPARK_IMAGE_TO_TEST }})
if: ${{ env.PYSPARK_IMAGE_TO_TEST }}
id: docker_build_pyspark
env: ${{ fromJSON(inputs.envs) }}
uses: docker/build-push-action@v6
with:
context: ./dev/spark-test-image/${{ env.PYSPARK_IMAGE_TO_TEST }}/
push: true
tags: |
${{ needs.precondition.outputs.image_pyspark_url }}
# Use the infra image cache to speed up
cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-${{ env.PYSPARK_IMAGE_TO_TEST }}-cache:${{ inputs.branch }}


pyspark:
Expand All @@ -438,7 +472,7 @@ jobs:
runs-on: ubuntu-latest
timeout-minutes: 180
container:
image: ${{ needs.precondition.outputs.image_url }}
image: ${{ needs.precondition.outputs.image_pyspark_url_link }}
strategy:
fail-fast: false
matrix:
Expand Down
14 changes: 14 additions & 0 deletions .github/workflows/build_infra_images_cache.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ on:
- 'dev/spark-test-image/docs/Dockerfile'
- 'dev/spark-test-image/lint/Dockerfile'
- 'dev/spark-test-image/sparkr/Dockerfile'
- 'dev/spark-test-image/python-309/Dockerfile'
- '.github/workflows/build_infra_images_cache.yml'
# Create infra image when cutting down branches/tags
create:
Expand Down Expand Up @@ -102,3 +103,16 @@ jobs:
- name: Image digest (SparkR)
if: hashFiles('dev/spark-test-image/sparkr/Dockerfile') != ''
run: echo ${{ steps.docker_build_sparkr.outputs.digest }}
- name: Build and push (PySpark with Python 3.9)
if: hashFiles('dev/spark-test-image/python-309/Dockerfile') != ''
id: docker_build_pyspark_python_309
uses: docker/build-push-action@v6
with:
context: ./dev/spark-test-image/python-309/
push: true
tags: ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-309-cache:${{ github.ref_name }}-static
cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-309-cache:${{ github.ref_name }}
cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-309-cache:${{ github.ref_name }},mode=max
- name: Image digest (PySpark with Python 3.9)
if: hashFiles('dev/spark-test-image/python-309/Dockerfile') != ''
run: echo ${{ steps.docker_build_pyspark_python_309.outputs.digest }}
1 change: 1 addition & 0 deletions .github/workflows/build_python_3.9.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ jobs:
hadoop: hadoop3
envs: >-
{
"PYSPARK_IMAGE_TO_TEST": "python-309",
"PYTHON_TO_TEST": "python3.9"
}
jobs: >-
Expand Down
82 changes: 82 additions & 0 deletions dev/spark-test-image/python-309/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Image for building and testing Spark branches. Based on Ubuntu 22.04.
# See also in https://hub.docker.com/_/ubuntu
FROM ubuntu:jammy-20240911.1
LABEL org.opencontainers.image.authors="Apache Spark project <[email protected]>"
LABEL org.opencontainers.image.licenses="Apache-2.0"
LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image For PySpark with Python 3.9"
# Overwrite this label to avoid exposing the underlying Ubuntu OS version label
LABEL org.opencontainers.image.version=""

# Bump this date to force a full rebuild of every layer below it.
ENV FULL_REFRESH_DATE 20241119

ENV DEBIAN_FRONTEND noninteractive
ENV DEBCONF_NONINTERACTIVE_SEEN true

# Build/test toolchain and native libraries needed by the PySpark test deps
# (e.g. BLAS/LAPACK for numpy/scipy, image/font libs for matplotlib).
RUN apt-get update && apt-get install -y \
    build-essential \
    ca-certificates \
    curl \
    gfortran \
    git \
    gnupg \
    libcurl4-openssl-dev \
    libfontconfig1-dev \
    libfreetype6-dev \
    libfribidi-dev \
    libgit2-dev \
    libharfbuzz-dev \
    libjpeg-dev \
    liblapack-dev \
    libopenblas-dev \
    libpng-dev \
    libpython3-dev \
    libssl-dev \
    libtiff5-dev \
    libxml2-dev \
    openjdk-17-jdk-headless \
    pandoc \
    pkg-config \
    qpdf \
    software-properties-common \
    wget \
    zlib1g-dev \
    && rm -rf /var/lib/apt/lists/*

ARG BASIC_PIP_PKGS="numpy pyarrow>=18.0.0 six==1.16.0 pandas==2.2.3 scipy plotly>=4.8 mlflow>=2.8.1 coverage matplotlib openpyxl memory-profiler>=0.61.0 scikit-learn>=1.3.2"
# Python deps for Spark Connect
ARG CONNECT_PIP_PKGS="grpcio==1.67.0 grpcio-status==1.67.0 protobuf==5.28.3 googleapis-common-protos==1.65.0 graphviz==0.20.3"

# Install Python 3.9 from the deadsnakes PPA (Ubuntu 22.04 ships 3.10 by default).
RUN add-apt-repository ppa:deadsnakes/ppa
RUN apt-get update && apt-get install -y \
    python3.9 python3.9-distutils \
    && rm -rf /var/lib/apt/lists/*
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.9
# Quote the requirement: an unquoted ">=" is a shell redirection, which would
# install blinker unconstrained and create a stray file named "=1.6.2".
RUN python3.9 -m pip install --ignore-installed 'blinker>=1.6.2' # mlflow needs this
RUN python3.9 -m pip install --force $BASIC_PIP_PKGS unittest-xml-reporting $CONNECT_PIP_PKGS && \
    python3.9 -m pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu && \
    python3.9 -m pip install torcheval && \
    python3.9 -m pip cache purge

0 comments on commit 45da6f6

Please sign in to comment.