Skip to content

Add rag enrichment and ingestion. #30909

Add rag enrichment and ingestion.

Add rag enrichment and ingestion. #30909

Workflow file for this run

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# To learn more about GitHub Actions in Apache Beam check the CI.md
name: Build python source distribution and wheels
on:
schedule:
- cron: '10 2 * * *'
push:
branches: ['master', 'release-*']
tags: 'v*'
pull_request:
branches: ['master', 'release-*']
tags: 'v*'
paths: ['sdks/python/**', 'model/**', 'release/**']
workflow_dispatch:
# This allows a subsequently queued workflow run to interrupt previous runs
concurrency:
group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.event.pull_request.head.label || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login}}'
cancel-in-progress: true
env:
GCP_PATH: "gs://${{ secrets.GCP_PYTHON_WHEELS_BUCKET }}/${GITHUB_REF##*/}/${GITHUB_SHA}-${GITHUB_RUN_ID}/"
jobs:
check_env_variables:
timeout-minutes: 5
name: "Check environment variables"
runs-on: [self-hosted, ubuntu-20.04, main]
env:
EVENT_NAME: ${{ github.event_name }}
# Keep in sync with py_version matrix value below - if changed, change that as well.
PY_VERSIONS_FULL: "cp39-* cp310-* cp311-* cp312-*"
outputs:
gcp-variables-set: ${{ steps.check_gcp_variables.outputs.gcp-variables-set }}
py-versions-full: ${{ steps.set-py-versions.outputs.py-versions-full }}
py-versions-test: ${{ steps.set-py-versions.outputs.py-versions-test }}
steps:
- uses: actions/checkout@v4
- name: "Check are GCP variables set"
run: "./scripts/ci/ci_check_are_gcp_variables_set.sh"
id: check_gcp_variables
env:
GCP_SA_EMAIL: "not used by self hosted runner"
GCP_SA_KEY: "not used by self hosted runner"
GCP_PYTHON_WHEELS_BUCKET: ${{ secrets.GCP_PYTHON_WHEELS_BUCKET }}
GCP_PROJECT_ID: "not-needed-here"
GCP_REGION: "not-needed-here"
GCP_TESTING_BUCKET: "not-needed-here"
- name: Set Python Versions for different environments
id: set-py-versions
run: |
set -xeu
if [ $EVENT_NAME == "pull_request" ]; then
# run highest supported version on pull request.
echo "py-versions-test=${PY_VERSIONS_FULL##* }" >> $GITHUB_OUTPUT
else
# run full version for push and cron jobs.
echo "py-versions-test=$PY_VERSIONS_FULL" >> $GITHUB_OUTPUT
fi
# Output full set of versions so that we can test all languages on pull requests for certain platforms.
echo "py-versions-full=$PY_VERSIONS_FULL" >> $GITHUB_OUTPUT
build_source:
runs-on: [self-hosted, ubuntu-20.04, main]
name: Build python source distribution
outputs:
is_rc: ${{ steps.is_rc.outputs.is_rc }}
rc_num: ${{ steps.get_rc_version.outputs.RC_NUM }}
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Install python
uses: actions/setup-python@v5
with:
python-version: 3.9
- name: Get tag
id: get_tag
run: |
echo "TAG=${GITHUB_REF#refs/*/}" >> $GITHUB_OUTPUT
- name: Check whether an -RC tag was applied to the commit.
id: is_rc
run: |
echo ${{ steps.get_tag.outputs.TAG }} > temp
OUTPUT=$( if grep -e '-RC.' -q temp; then echo 1; else echo 0; fi)
echo "is_rc=$OUTPUT" >> $GITHUB_OUTPUT
- name: Get RELEASE_VERSION and RC_NUM
if: steps.is_rc.outputs.is_rc == 1
id: get_rc_version
run: |
RC_NUM=$(sed -n "s/^.*-RC\([0-9]*\)/\1/p" temp)
RELEASE_VERSION=$(sed -n "s/^v\(.*\)-RC[0-9]/\1/p" temp)
echo "RC_NUM=$RC_NUM" >> $GITHUB_OUTPUT
echo "RELEASE_VERSION=$RELEASE_VERSION" >> $GITHUB_OUTPUT
- name: Build source
working-directory: ./sdks/python
run: pip install -U build && python -m build --sdist
- name: Add checksums
working-directory: ./sdks/python/dist
run: |
file=$(ls | grep .tar.gz | head -n 1)
sha512sum $file > ${file}.sha512
- name: Unzip source
working-directory: ./sdks/python
run: tar -xzvf dist/$(ls dist | grep .tar.gz | head -n 1)
- name: Rename source directory
working-directory: ./sdks/python
# https://github.com/pypa/setuptools/issues/4300 changed naming. Match both old and new names.
run: mv $(ls | grep "apache-beam-\|apache_beam-") apache-beam-source
- name: Upload source as artifact
uses: actions/upload-artifact@v4
with:
name: source
path: sdks/python/apache-beam-source
- name: Upload compressed sources as artifacts
uses: actions/upload-artifact@v4
with:
name: source_zip
path: sdks/python/dist
- name: Clear dist
if: steps.is_rc.outputs.is_rc == 1
working-directory: ./sdks/python
run: |
rm -r ./dist
rm -rd apache-beam-source
- name: Rewrite SDK version to include RC number
if: steps.is_rc.outputs.is_rc == 1
working-directory: ./sdks/python
run: |
RELEASE_VERSION=${{ steps.get_rc_version.outputs.RELEASE_VERSION }}
RC_NUM=${{ steps.get_rc_version.outputs.RC_NUM }}
sed -i -e "s/${RELEASE_VERSION}/${RELEASE_VERSION}rc${RC_NUM}/g" apache_beam/version.py
- name: Build RC source
if: steps.is_rc.outputs.is_rc == 1
working-directory: ./sdks/python
run: pip install -U build && python -m build --sdist
- name: Add RC checksums
if: steps.is_rc.outputs.is_rc == 1
working-directory: ./sdks/python/dist
run: |
file=$(ls | grep .tar.gz | head -n 1)
sha512sum $file > ${file}.sha512
- name: Unzip RC source
if: steps.is_rc.outputs.is_rc == 1
working-directory: ./sdks/python
run: tar -xzvf dist/$(ls dist | grep .tar.gz | head -n 1)
- name: Rename RC source directory
if: steps.is_rc.outputs.is_rc == 1
working-directory: ./sdks/python
# https://github.com/pypa/setuptools/issues/4300 changed naming. Match both old and new names.
run: mv $(ls | grep "apache-beam-\|apache_beam-") apache-beam-source-rc
- name: Upload RC source as artifact
if: steps.is_rc.outputs.is_rc == 1
uses: actions/upload-artifact@v4
with:
name: source_rc${{ steps.get_rc_version.outputs.RC_NUM }}
path: sdks/python/apache-beam-source-rc
- name: Upload compressed RC sources as artifacts
if: steps.is_rc.outputs.is_rc == 1
uses: actions/upload-artifact@v4
with:
name: source_zip_rc${{ steps.get_rc_version.outputs.RC_NUM }}
path: sdks/python/dist
prepare_gcs:
name: Prepare GCS
needs:
- build_source
- check_env_variables
runs-on: [self-hosted, ubuntu-20.04, main]
if: needs.check_env_variables.outputs.gcp-variables-set == 'true' && github.event_name != 'pull_request'
steps:
- name: Remove existing files on GCS bucket
run: gsutil rm -r ${{ env.GCP_PATH }} || true
upload_source_to_gcs:
name: Upload python source distribution to GCS bucket
needs:
- prepare_gcs
- check_env_variables
runs-on: [self-hosted, ubuntu-20.04, main]
if: needs.check_env_variables.outputs.gcp-variables-set == 'true'
steps:
- name: Download compressed sources from artifacts
uses: actions/[email protected]
with:
name: source_zip
path: source/
- name: Copy sources to GCS bucket
run: gsutil cp -r -a public-read source/* ${{ env.GCP_PATH }}
build_wheels:
name: Build python ${{matrix.py_version}} wheels on ${{matrix.os_python.arch}} for ${{ matrix.os_python.os }}
needs:
- check_env_variables
- build_source
env:
CIBW_ARCHS_LINUX: ${{matrix.os_python.arch}}
runs-on: ${{ matrix.os_python.runner }}
timeout-minutes: 480
strategy:
matrix:
os_python: [
{"os": "ubuntu-20.04", "runner": [self-hosted, ubuntu-20.04, main], "python": "${{ needs.check_env_variables.outputs.py-versions-full }}", arch: "auto" },
# Temporarily pin to macos-13 because macos-latest breaks this build
# TODO(https://github.com/apache/beam/issues/31114)
{"os": "macos-13", "runner": "macos-13", "python": "${{ needs.check_env_variables.outputs.py-versions-test }}", arch: "auto" },
{"os": "windows-latest", "runner": "windows-latest", "python": "${{ needs.check_env_variables.outputs.py-versions-test }}", arch: "auto" },
{"os": "ubuntu-20.04", "runner": [self-hosted, ubuntu-20.04, main], "python": "${{ needs.check_env_variables.outputs.py-versions-test }}", arch: "aarch64" }
]
# Keep in sync (remove asterisks) with PY_VERSIONS_FULL env var above - if changed, change that as well.
py_version: ["cp39-", "cp310-", "cp311-", "cp312-"]
steps:
- name: Download python source distribution from artifacts
uses: actions/[email protected]
with:
name: source
path: apache-beam-source
- name: Download Python SDK RC source distribution from artifacts
if: ${{ needs.build_source.outputs.is_rc == 1 }}
uses: actions/[email protected]
with:
name: source_rc${{ needs.build_source.outputs.rc_num }}
path: apache-beam-source-rc
- name: Install Python
uses: actions/setup-python@v5
with:
python-version: 3.9
- uses: docker/setup-qemu-action@v1
if: ${{matrix.os_python.arch == 'aarch64'}}
name: Set up QEMU
- name: Install cibuildwheel
# note: sync cibuildwheel version with gradle task sdks:python:bdistPy* steps
run: pip install cibuildwheel==2.17.0 setuptools
- name: Build wheel
# Only build wheel if it is one of the target versions for this platform, otherwise no-op
if: ${{ contains(matrix.os_python.python, matrix.py_version) }}
working-directory: apache-beam-source
env:
CIBW_BUILD: ${{ matrix.py_version }}*
# TODO: https://github.com/apache/beam/issues/23048
CIBW_SKIP: "*-musllinux_*"
CIBW_BEFORE_BUILD: pip install cython==0.29.36 numpy --config-settings=setup-args="-Dallow-noblas=true" && pip install --upgrade setuptools
run: cibuildwheel --print-build-identifiers && cibuildwheel --output-dir wheelhouse
shell: bash
- name: install sha512sum on MacOS
if: startsWith(matrix.os_python.os, 'macos')
run: brew install coreutils
- name: Add checksums
if: ${{ contains(matrix.os_python.python, matrix.py_version) }}
working-directory: apache-beam-source/wheelhouse/
run: |
for file in *.whl; do
sha512sum $file > ${file}.sha512
done
shell: bash
- name: Upload wheels as artifacts
if: ${{ contains(matrix.os_python.python, matrix.py_version) }}
uses: actions/upload-artifact@v4
with:
name: wheelhouse-${{ matrix.py_version }}${{ matrix.os_python.os }}${{ (matrix.os_python.arch == 'aarch64' && '-aarch64') || '' }}
path: apache-beam-source/wheelhouse/
- name: Build RC wheels
# Only build wheel if it is one of the target versions for this platform, otherwise no-op
if: ${{ needs.build_source.outputs.is_rc == 1 && contains(matrix.os_python.python, matrix.py_version) }}
working-directory: apache-beam-source-rc
env:
CIBW_BUILD: ${{ matrix.py_version }}*
# TODO: https://github.com/apache/beam/issues/23048
CIBW_SKIP: "*-musllinux_*"
CIBW_BEFORE_BUILD: pip install cython==0.29.36 numpy --config-settings=setup-args="-Dallow-noblas=true" && pip install --upgrade setuptools
run: cibuildwheel --print-build-identifiers && cibuildwheel --output-dir wheelhouse
shell: bash
- name: Add RC checksums
if: ${{ needs.build_source.outputs.is_rc == 1 }}
working-directory: apache-beam-source-rc/wheelhouse/
run: |
for file in *.whl; do
sha512sum $file > ${file}.sha512
done
shell: bash
- name: Upload RC wheels as artifacts
if: ${{ needs.build_source.outputs.is_rc == 1 }}
uses: actions/upload-artifact@v4
with:
name: wheelhouse-rc${{ needs.build_source.outputs.rc_num }}-${{ matrix.py_version }}${{ matrix.os_python.os }}${{ (matrix.os_python.arch == 'aarch64' && '-aarch64') || '' }}
path: apache-beam-source-rc/wheelhouse/
upload_wheels_to_gcs:
name: Upload wheels to GCS bucket
needs:
- build_wheels
- check_env_variables
runs-on: [self-hosted, ubuntu-20.04, main]
if: needs.check_env_variables.outputs.gcp-variables-set == 'true' && github.event_name != 'pull_request'
steps:
- name: Download wheels from artifacts
uses: actions/download-artifact@v4
with:
pattern: wheelhouse-*
merge-multiple: true
path: wheelhouse/
- name: Copy wheels to GCS bucket
run: gsutil cp -r -a public-read wheelhouse/* ${{ env.GCP_PATH }}
- name: Create github action information file on GCS bucket
run: |
cat > github_action_info <<EOF
GITHUB_WORKFLOW=$GITHUB_WORKFLOW
GITHUB_RUN_ID=$GITHUB_RUN_ID
GITHUB_RUN_NUMBER=$GITHUB_RUN_NUMBER
GITHUB_ACTION=$GITHUB_ACTION
GITHUB_ACTOR=$GITHUB_ACTOR
GITHUB_REPOSITORY=$GITHUB_REPOSITORY
GITHUB_EVENT_NAME=$GITHUB_EVENT_NAME
GITHUB_SHA=$GITHUB_SHA
GITHUB_REF=$GITHUB_REF
# only for forked repositiories
GITHUB_HEAD_REF=$GITHUB_HEAD_REF
GITHUB_BASE_REF=$GITHUB_BASE_REF
EOF
echo $(cat github_action_info)
gsutil cp -a public-read github_action_info ${{ env.GCP_PATH }}
- name: Upload GitHub event file to GCS bucket
run: gsutil cp -a public-read ${GITHUB_EVENT_PATH} ${{ env.GCP_PATH }}
list_files_on_gcs:
name: List files on Google Cloud Storage Bucket
needs:
- upload_wheels_to_gcs
- check_env_variables
runs-on: [self-hosted, ubuntu-20.04, main]
if: needs.check_env_variables.outputs.gcp-variables-set == 'true' && github.event_name != 'pull_request'
steps:
- name: List file on Google Cloud Storage Bucket
run: gsutil ls "${{ env.GCP_PATH }}*"
branch_repo_nightly:
permissions:
contents: write
name: Branch repo nightly
needs:
- build_source
- build_wheels
runs-on: [self-hosted, ubuntu-20.04, main]
timeout-minutes: 60
if: github.repository_owner == 'apache' && github.event_name == 'schedule'
steps:
- name: Checkout code on master branch
uses: actions/checkout@v4
with:
persist-credentials: false
submodules: recursive
- name: Branch commit
run: |
BRANCH_NAME=${GITHUB_REF##*/}
echo "Updating nightly-${BRANCH_NAME}"
git branch -f nightly-${BRANCH_NAME} HEAD
- name: Push branch
uses: ./.github/actions/github-push-action
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
force: true
branch: nightly-${{ github.ref }}