diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 3117872e21680..cf49316fafbbf 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -55,6 +55,7 @@ jobs:
     runs-on: ubuntu-latest
     env:
       GITHUB_PREV_SHA: ${{ github.event.before }}
+      PYSPARK_IMAGE_TO_TEST: ''
     outputs:
       required: ${{ steps.set-outputs.outputs.required }}
       image_url: ${{ steps.infra-image-outputs.outputs.image_url }}
@@ -64,6 +65,8 @@ jobs:
       image_lint_url_link: ${{ steps.infra-image-link.outputs.image_lint_url_link }}
       image_sparkr_url: ${{ steps.infra-image-sparkr-outputs.outputs.image_sparkr_url }}
       image_sparkr_url_link: ${{ steps.infra-image-link.outputs.image_sparkr_url_link }}
+      image_pyspark_url: ${{ steps.infra-image-pyspark-outputs.outputs.image_pyspark_url }}
+      image_pyspark_url_link: ${{ steps.infra-image-link.outputs.image_pyspark_url_link }}
     steps:
     - name: Checkout Spark repository
       uses: actions/checkout@v4
@@ -164,8 +167,19 @@ jobs:
         IMG_NAME="apache-spark-ci-image-sparkr:${{ inputs.branch }}-${{ github.run_id }}"
         IMG_URL="ghcr.io/$REPO_OWNER/$IMG_NAME"
         echo "image_sparkr_url=$IMG_URL" >> $GITHUB_OUTPUT
+    - name: Generate infra image URL (PySpark ${{ env.PYSPARK_IMAGE_TO_TEST }})
+      id: infra-image-pyspark-outputs
+      if: ${{ env.PYSPARK_IMAGE_TO_TEST }}
+      env: ${{ fromJSON(inputs.envs) }}
+      run: |
+        # Convert to lowercase to meet Docker repo name requirement
+        REPO_OWNER=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]')
+        IMG_NAME="apache-spark-ci-image-pyspark-${{ env.PYSPARK_IMAGE_TO_TEST }}:${{ inputs.branch }}-${{ github.run_id }}"
+        IMG_URL="ghcr.io/$REPO_OWNER/$IMG_NAME"
+        echo "image_pyspark_url=$IMG_URL" >> $GITHUB_OUTPUT
     - name: Link the docker images
       id: infra-image-link
+      env: ${{ fromJSON(inputs.envs) }}
      run: |
        # Set the image URL for job "docs"
        # Should delete the link and directly use image_docs_url after SPARK 3.x EOL
@@ -173,10 +187,16 @@ jobs:
           echo "image_docs_url_link=${{ steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT
           echo "image_lint_url_link=${{ steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT
           echo "image_sparkr_url_link=${{ steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT
+          echo "image_pyspark_url_link=${{ steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT
         else
           echo "image_docs_url_link=${{ steps.infra-image-docs-outputs.outputs.image_docs_url }}" >> $GITHUB_OUTPUT
           echo "image_lint_url_link=${{ steps.infra-image-lint-outputs.outputs.image_lint_url }}" >> $GITHUB_OUTPUT
           echo "image_sparkr_url_link=${{ steps.infra-image-sparkr-outputs.outputs.image_sparkr_url }}" >> $GITHUB_OUTPUT
+          if [[ "${{ env.PYSPARK_IMAGE_TO_TEST }}" != "" ]]; then
+            echo "image_pyspark_url_link=${{ steps.infra-image-pyspark-outputs.outputs.image_pyspark_url }}" >> $GITHUB_OUTPUT
+          else
+            echo "image_pyspark_url_link=${{ steps.infra-image-outputs.outputs.image_url }}" >> $GITHUB_OUTPUT
+          fi
         fi
 
   # Build: build Spark and run the tests for specified modules.
@@ -360,6 +380,8 @@ jobs:
     runs-on: ubuntu-latest
     permissions:
       packages: write
+    env:
+      PYSPARK_IMAGE_TO_TEST: ''
     steps:
     - name: Login to GitHub Container Registry
       uses: docker/login-action@v3
@@ -428,6 +450,18 @@ jobs:
           ${{ needs.precondition.outputs.image_sparkr_url }}
         # Use the infra image cache to speed up
         cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-sparkr-cache:${{ inputs.branch }}
+    - name: Build and push (PySpark ${{ env.PYSPARK_IMAGE_TO_TEST }})
+      if: ${{ env.PYSPARK_IMAGE_TO_TEST }}
+      id: docker_build_pyspark
+      env: ${{ fromJSON(inputs.envs) }}
+      uses: docker/build-push-action@v6
+      with:
+        context: ./dev/spark-test-image/${{ env.PYSPARK_IMAGE_TO_TEST }}/
+        push: true
+        tags: |
+          ${{ needs.precondition.outputs.image_pyspark_url }}
+        # Use the infra image cache to speed up
+        cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-${{ env.PYSPARK_IMAGE_TO_TEST }}-cache:${{ inputs.branch }}
 
   pyspark:
@@ -438,7 +472,7 @@ jobs:
     runs-on: ubuntu-latest
     timeout-minutes: 180
     container:
-      image: ${{ needs.precondition.outputs.image_url }}
+      image: ${{ needs.precondition.outputs.image_pyspark_url_link }}
     strategy:
       fail-fast: false
       matrix:
diff --git a/.github/workflows/build_infra_images_cache.yml b/.github/workflows/build_infra_images_cache.yml
index a6beacedeebd4..3d5a8306aca80 100644
--- a/.github/workflows/build_infra_images_cache.yml
+++ b/.github/workflows/build_infra_images_cache.yml
@@ -30,6 +30,7 @@ on:
       - 'dev/spark-test-image/docs/Dockerfile'
       - 'dev/spark-test-image/lint/Dockerfile'
       - 'dev/spark-test-image/sparkr/Dockerfile'
+      - 'dev/spark-test-image/python-309/Dockerfile'
       - '.github/workflows/build_infra_images_cache.yml'
   # Create infra image when cutting down branches/tags
   create:
@@ -102,3 +103,16 @@ jobs:
       - name: Image digest (SparkR)
         if: hashFiles('dev/spark-test-image/sparkr/Dockerfile') != ''
         run: echo ${{ steps.docker_build_sparkr.outputs.digest }}
+      - name: Build and push (PySpark with Python 3.9)
+        if: hashFiles('dev/spark-test-image/python-309/Dockerfile') != ''
+        id: docker_build_pyspark_python_309
+        uses: docker/build-push-action@v6
+        with:
+          context: ./dev/spark-test-image/python-309/
+          push: true
+          tags: ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-309-cache:${{ github.ref_name }}-static
+          cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-309-cache:${{ github.ref_name }}
+          cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-309-cache:${{ github.ref_name }},mode=max
+      - name: Image digest (PySpark with Python 3.9)
+        if: hashFiles('dev/spark-test-image/python-309/Dockerfile') != ''
+        run: echo ${{ steps.docker_build_pyspark_python_309.outputs.digest }}
diff --git a/.github/workflows/build_python_3.9.yml b/.github/workflows/build_python_3.9.yml
index b2401fcf2aa14..744e18cc8db39 100644
--- a/.github/workflows/build_python_3.9.yml
+++ b/.github/workflows/build_python_3.9.yml
@@ -36,6 +36,7 @@ jobs:
       hadoop: hadoop3
       envs: >-
         {
+          "PYSPARK_IMAGE_TO_TEST": "python-309",
           "PYTHON_TO_TEST": "python3.9"
         }
       jobs: >-
diff --git a/dev/spark-test-image/python-309/Dockerfile b/dev/spark-test-image/python-309/Dockerfile
new file mode 100644
index 0000000000000..dbab99c1441bc
--- /dev/null
+++ b/dev/spark-test-image/python-309/Dockerfile
@@ -0,0 +1,76 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Image for building and testing Spark branches. Based on Ubuntu 22.04.
+# See also in https://hub.docker.com/_/ubuntu
+FROM ubuntu:jammy-20240911.1
+LABEL org.opencontainers.image.authors="Apache Spark project <dev@spark.apache.org>"
+LABEL org.opencontainers.image.licenses="Apache-2.0"
+LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image For PySpark with Python 3.9"
+# Overwrite this label to avoid exposing the underlying Ubuntu OS version label
+LABEL org.opencontainers.image.version=""
+
+ENV FULL_REFRESH_DATE 20241119
+
+ENV DEBIAN_FRONTEND noninteractive
+ENV DEBCONF_NONINTERACTIVE_SEEN true
+
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    ca-certificates \
+    curl \
+    gfortran \
+    git \
+    gnupg \
+    libcurl4-openssl-dev \
+    libfontconfig1-dev \
+    libfreetype6-dev \
+    libfribidi-dev \
+    libgit2-dev \
+    libharfbuzz-dev \
+    libjpeg-dev \
+    liblapack-dev \
+    libopenblas-dev \
+    libpng-dev \
+    libpython3-dev \
+    libssl-dev \
+    libtiff5-dev \
+    libxml2-dev \
+    openjdk-17-jdk-headless \
+    pandoc \
+    pkg-config \
+    qpdf \
+    software-properties-common \
+    wget \
+    zlib1g-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Python 3.9
+RUN add-apt-repository ppa:deadsnakes/ppa
+RUN apt-get update && apt-get install -y \
+    python3.9 python3.9-distutils \
+    && rm -rf /var/lib/apt/lists/*
+
+ARG BASIC_PIP_PKGS="numpy pyarrow>=18.0.0 six==1.16.0 pandas==2.2.3 scipy plotly>=4.8 mlflow>=2.8.1 coverage matplotlib openpyxl memory-profiler>=0.61.0 scikit-learn>=1.3.2"
+# Python deps for Spark Connect
+ARG CONNECT_PIP_PKGS="grpcio==1.67.0 grpcio-status==1.67.0 protobuf==5.28.3 googleapis-common-protos==1.65.0 graphviz==0.20.3"
+RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.9
+RUN python3.9 -m pip install --ignore-installed 'blinker>=1.6.2' # mlflow needs this
+RUN python3.9 -m pip install --force $BASIC_PIP_PKGS unittest-xml-reporting $CONNECT_PIP_PKGS && \
+    python3.9 -m pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu && \
+    python3.9 -m pip install torcheval && \
+    python3.9 -m pip cache purge
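
To sanity-check the new image outside CI, something like the following works (a sketch, not part of the patch: it assumes Docker with buildx is available, the local tag spark-ci-pyspark-python-309 is illustrative, and the cache ref mirrors the master-branch cache pushed by build_infra_images_cache.yml above):

    # Build the PySpark test image from the repo root, reusing the CI registry cache when reachable.
    docker buildx build \
      --cache-from type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-python-309-cache:master \
      --load -t spark-ci-pyspark-python-309 dev/spark-test-image/python-309/

    # Spot-check that Python 3.9 and the pinned test dependencies are present.
    docker run --rm spark-ci-pyspark-python-309 \
      python3.9 -c "import pandas, pyarrow, grpc; print(pandas.__version__, pyarrow.__version__)"

If the registry cache is unreachable, buildx only warns and builds from scratch, which matches how the cache-from entries above degrade in CI.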