From fd6f6f084f94271aa12649f7f19864f49ad867b7 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 29 Sep 2023 10:46:42 +0200 Subject: [PATCH 001/102] Add Github action for integration test --- .github/workflows/integration.yml | 86 +++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 .github/workflows/integration.yml diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml new file mode 100644 index 000000000..75203a1cb --- /dev/null +++ b/.github/workflows/integration.yml @@ -0,0 +1,86 @@ +# **what?** +# Runs integration tests. + +# **why?** +# Ensure code for dbt meets a certain quality standard. + +# **when?** +# This will run for all PRs, when code is pushed to a release +# branch, and when manually triggered. + +name: Integration tests + +on: + push: + branches: + - "main" + - "*.latest" + - "releases/*" + pull_request: + workflow_dispatch: + +# explicitly turn off permissions for `GITHUB_TOKEN` +permissions: read-all + +# will cancel previous workflows triggered by the same event and for the same ref for PRs or same SHA otherwise +concurrency: + group: ${{ github.workflow }}-${{ github.event_name }}-${{ contains(github.event_name, 'pull_request') && github.event.pull_request.head.ref || github.sha }} + cancel-in-progress: true + +defaults: + run: + shell: bash + +jobs: + tests: + name: test with python ${{ matrix.python-version }} + + runs-on: ubuntu-latest + timeout-minutes: 10 + + strategy: + fail-fast: false + matrix: + python-version: + - "3.8" + - "3.9" + - "3.10" + - "3.11" + + env: + TOXENV: "unit" + PYTEST_ADDOPTS: "-v --color=yes --csv test_results.csv" + + steps: + - name: Check out the repository + uses: actions/checkout@v3 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - uses: isbang/compose-action@v1.5.1 + with: + compose-file: "./docker/docker-compose.yml" + + - name: Install tox + run: | + python -m pip install --upgrade pip + python -m pip install tox + + - name: Run tox + run: | + tox -e integration-spark-session + tox -e integration-spark-thrift + + - name: Get current date + if: always() + id: date + run: echo "date=$(date +'%Y-%m-%dT%H_%M_%S')" >> $GITHUB_OUTPUT # Colons are not allowed in artifacts name + + - uses: actions/upload-artifact@v3 + if: always() + with: + name: tests_results_${{ matrix.python-version }}-${{ steps.date.outputs.date }}.csv + path: tests_results.csv From 795e40a01cfb1de47168eb0c8d49c231989d2e08 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 29 Sep 2023 11:54:41 +0200 Subject: [PATCH 002/102] Update tox --- tox.ini | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/tox.ini b/tox.ini index 97017a926..e456d55d0 100644 --- a/tox.ini +++ b/tox.ini @@ -56,10 +56,7 @@ deps = [testenv:integration-spark-thrift] -allowlist_externals = - /bin/bash -basepython = python3.8 -commands = /bin/bash -c '{envpython} -m pytest -v --profile apache_spark {posargs} -n4 tests/functional/adapter/*' +description = run integration tests against a Spark thrift server passenv = DBT_* PYTEST_ADDOPTS @@ -67,12 +64,10 @@ deps = -r{toxinidir}/requirements.txt -r{toxinidir}/dev-requirements.txt -e. 
+commands = pytest -v --profile apache_spark {posargs} -n4 tests/functional/adapter/* [testenv:integration-spark-session] -allowlist_externals = - /bin/bash -basepython = python3.10 -commands = /bin/bash -c '{envpython} -m pytest -v --profile spark_session {posargs} -n4 tests/functional/adapter/*' +description = run integration tests against a Spark session passenv = DBT_* PYTEST_* @@ -81,3 +76,4 @@ deps = -r{toxinidir}/requirements.txt -r{toxinidir}/dev-requirements.txt -e.[session] +commands = pytest -v --profile spark_session {posargs} -n4 tests/functional/adapter/* From ff39c5d065e8b8ec065e5531e29107e35ccfcd6e Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 29 Sep 2023 14:42:11 +0200 Subject: [PATCH 003/102] Fetch spark from https link --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index bb4d378ed..b310fde4d 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -14,7 +14,7 @@ ENV PATH="/usr/spark/bin:/usr/spark/sbin:${PATH}" RUN apt-get update && \ apt-get install -y wget netcat procps libpostgresql-jdbc-java && \ - wget -q "http://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" && \ + wget -q "https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" && \ tar xzf "spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" && \ rm "spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" && \ mv "spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" /usr/spark && \ From 1505fc6fb4d26245e18e65485e73407c867a3ef3 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 29 Sep 2023 14:42:40 +0200 Subject: [PATCH 004/102] Use Spark version 3.1.2 --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index b310fde4d..d1fd5357f 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -2,7 +2,7 @@ ARG OPENJDK_VERSION=8 FROM eclipse-temurin:${OPENJDK_VERSION}-jre ARG BUILD_DATE -ARG SPARK_VERSION=3.3.2 +ARG SPARK_VERSION=3.1.2 ARG HADOOP_VERSION=3 LABEL org.label-schema.name="Apache Spark ${SPARK_VERSION}" \ From 44fe33f4bd233f508c59c527a69590de1ec5f463 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 29 Sep 2023 14:50:13 +0200 Subject: [PATCH 005/102] Seperate running Spark session and thrift --- .github/workflows/integration.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 75203a1cb..d455e804b 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -69,10 +69,11 @@ jobs: python -m pip install --upgrade pip python -m pip install tox - - name: Run tox - run: | - tox -e integration-spark-session - tox -e integration-spark-thrift + - name: Run tox for Spark session + run: tox -e integration-spark-session + + - name: Run tox for Spark thrift + run: tox -e integration-spark-thrift - name: Get current date if: always() From 2655631fa3b6db8a7515f11495710675bca0ba4e Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 29 Sep 2023 14:51:40 +0200 Subject: [PATCH 006/102] Use Spark 3.1.2 and Hadoop 3.2 --- docker/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index d1fd5357f..85d01ba8a 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -2,8 +2,8 @@ ARG OPENJDK_VERSION=8 FROM eclipse-temurin:${OPENJDK_VERSION}-jre 
ARG BUILD_DATE -ARG SPARK_VERSION=3.1.2 -ARG HADOOP_VERSION=3 +ARG SPARK_VERSION=3.1.3 +ARG HADOOP_VERSION=3.2 LABEL org.label-schema.name="Apache Spark ${SPARK_VERSION}" \ org.label-schema.build-date=$BUILD_DATE \ From 915f67e9203dfb891ad4a22f3db7f9251b19ab84 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 29 Sep 2023 14:57:20 +0200 Subject: [PATCH 007/102] Reset tox.ini --- tox.ini | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tox.ini b/tox.ini index e456d55d0..33055a211 100644 --- a/tox.ini +++ b/tox.ini @@ -57,6 +57,9 @@ deps = [testenv:integration-spark-thrift] description = run integration tests against a Spark thrift server +allowlist_externals = + /bin/bash +basepython = python3.8 passenv = DBT_* PYTEST_ADDOPTS @@ -64,10 +67,13 @@ deps = -r{toxinidir}/requirements.txt -r{toxinidir}/dev-requirements.txt -e. -commands = pytest -v --profile apache_spark {posargs} -n4 tests/functional/adapter/* +commands = /bin/bash -c '{envpython} -m pytest -v --profile apache_spark {posargs} -n4 tests/functional/adapter/*' [testenv:integration-spark-session] description = run integration tests against a Spark session +allowlist_externals = + /bin/bash +basepython = python3.10 passenv = DBT_* PYTEST_* @@ -76,4 +82,4 @@ deps = -r{toxinidir}/requirements.txt -r{toxinidir}/dev-requirements.txt -e.[session] -commands = pytest -v --profile spark_session {posargs} -n4 tests/functional/adapter/* +commands = /bin/bash -c '{envpython} -m pytest -v --profile spark_session {posargs} -n4 tests/functional/adapter/*' From f0ef215e1c8186cf4270e695ec8663a5d745d127 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 29 Sep 2023 15:08:48 +0200 Subject: [PATCH 008/102] Remove base pythons in tox.ini --- tox.ini | 2 -- 1 file changed, 2 deletions(-) diff --git a/tox.ini b/tox.ini index 33055a211..31396b5ef 100644 --- a/tox.ini +++ b/tox.ini @@ -59,7 +59,6 @@ deps = description = run integration tests against a Spark thrift server allowlist_externals = /bin/bash -basepython = python3.8 passenv = DBT_* PYTEST_ADDOPTS @@ -73,7 +72,6 @@ commands = /bin/bash -c '{envpython} -m pytest -v --profile apache_spark {posarg description = run integration tests against a Spark session allowlist_externals = /bin/bash -basepython = python3.10 passenv = DBT_* PYTEST_* From e8457df87d636324aae416c4a8eea363779f0156 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 29 Sep 2023 15:19:19 +0200 Subject: [PATCH 009/102] Fix reference to Docker compose file --- .github/workflows/integration.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index d455e804b..517815e27 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -62,7 +62,7 @@ jobs: - uses: isbang/compose-action@v1.5.1 with: - compose-file: "./docker/docker-compose.yml" + compose-file: "./docker-compose.yml" - name: Install tox run: | From 842466a2883efd3a13826410f1477a0ff84c5e8f Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 29 Sep 2023 15:42:11 +0200 Subject: [PATCH 010/102] Remove timeout --- .github/workflows/integration.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 517815e27..8eafa5c72 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -36,7 +36,6 @@ jobs: name: test with python ${{ matrix.python-version }} runs-on: ubuntu-latest - timeout-minutes: 10 strategy: fail-fast: false From 
0738f2d0bcc5f30eab1cc92b4c82720ce99e3265 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 29 Sep 2023 15:55:55 +0200 Subject: [PATCH 011/102] Remove artifact steps --- .github/workflows/integration.yml | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 8eafa5c72..9f26bd2be 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -73,14 +73,3 @@ jobs: - name: Run tox for Spark thrift run: tox -e integration-spark-thrift - - - name: Get current date - if: always() - id: date - run: echo "date=$(date +'%Y-%m-%dT%H_%M_%S')" >> $GITHUB_OUTPUT # Colons are not allowed in artifacts name - - - uses: actions/upload-artifact@v3 - if: always() - with: - name: tests_results_${{ matrix.python-version }}-${{ steps.date.outputs.date }}.csv - path: tests_results.csv From 277bef1a2a4368d54b2b1ce41b7894c51d4f7ef1 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 29 Sep 2023 15:56:33 +0200 Subject: [PATCH 012/102] Bump Spark and Hadoop versions --- docker/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 85d01ba8a..a9b9e0a2c 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -2,8 +2,8 @@ ARG OPENJDK_VERSION=8 FROM eclipse-temurin:${OPENJDK_VERSION}-jre ARG BUILD_DATE -ARG SPARK_VERSION=3.1.3 -ARG HADOOP_VERSION=3.2 +ARG SPARK_VERSION=3.4.1 +ARG HADOOP_VERSION=3 LABEL org.label-schema.name="Apache Spark ${SPARK_VERSION}" \ org.label-schema.build-date=$BUILD_DATE \ From 8d5853d3049c5e299ab7d824ab33fc374a9894ff Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 29 Sep 2023 16:08:16 +0200 Subject: [PATCH 013/102] Reset Spark and Hadoop version --- docker/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index a9b9e0a2c..85d01ba8a 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -2,8 +2,8 @@ ARG OPENJDK_VERSION=8 FROM eclipse-temurin:${OPENJDK_VERSION}-jre ARG BUILD_DATE -ARG SPARK_VERSION=3.4.1 -ARG HADOOP_VERSION=3 +ARG SPARK_VERSION=3.1.3 +ARG HADOOP_VERSION=3.2 LABEL org.label-schema.name="Apache Spark ${SPARK_VERSION}" \ org.label-schema.build-date=$BUILD_DATE \ From 919528ab14dd731f9efa913d37b051bda8922e44 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 29 Sep 2023 16:09:09 +0200 Subject: [PATCH 014/102] Update comment --- .github/workflows/integration.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 9f26bd2be..f4c34c5fb 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -2,7 +2,7 @@ # Runs integration tests. # **why?** -# Ensure code for dbt meets a certain quality standard. +# Ensure code runs as expected. 
# **when?** # This will run for all PRs, when code is pushed to a release From 15e48fd3f1f8d421f7f079a20ca8ba5fd5995d69 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 29 Sep 2023 16:12:25 +0200 Subject: [PATCH 015/102] Add changie --- .changes/unreleased/Under the Hood-20230929-161218.yaml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 .changes/unreleased/Under the Hood-20230929-161218.yaml diff --git a/.changes/unreleased/Under the Hood-20230929-161218.yaml b/.changes/unreleased/Under the Hood-20230929-161218.yaml new file mode 100644 index 000000000..c82e8252e --- /dev/null +++ b/.changes/unreleased/Under the Hood-20230929-161218.yaml @@ -0,0 +1,6 @@ +kind: Under the Hood +body: Add Github action for integration testing +time: 2023-09-29T16:12:18.968755+02:00 +custom: + Author: JCZuurmond + Issue: "719" From 31cb05e7d7dc6e5e63b3027a66428f22d40f86ce Mon Sep 17 00:00:00 2001 From: Colin Date: Wed, 18 Oct 2023 16:54:42 -0700 Subject: [PATCH 016/102] add databricks and PR execution protections --- .github/scripts/update_dbt_core_branch.sh | 20 +++ .github/scripts/update_release_branch.sh | 11 ++ .github/workflows/integration.yml | 193 +++++++++++++++++++++- 3 files changed, 215 insertions(+), 9 deletions(-) create mode 100755 .github/scripts/update_dbt_core_branch.sh create mode 100644 .github/scripts/update_release_branch.sh diff --git a/.github/scripts/update_dbt_core_branch.sh b/.github/scripts/update_dbt_core_branch.sh new file mode 100755 index 000000000..d28a40c35 --- /dev/null +++ b/.github/scripts/update_dbt_core_branch.sh @@ -0,0 +1,20 @@ +#!/bin/bash -e +set -e + +git_branch=$1 +target_req_file="dev-requirements.txt" +core_req_sed_pattern="s|dbt-core.git.*#egg=dbt-core|dbt-core.git@${git_branch}#egg=dbt-core|g" +postgres_req_sed_pattern="s|dbt-core.git.*#egg=dbt-postgres|dbt-core.git@${git_branch}#egg=dbt-postgres|g" +tests_req_sed_pattern="s|dbt-core.git.*#egg=dbt-tests|dbt-core.git@${git_branch}#egg=dbt-tests|g" +if [[ "$OSTYPE" == darwin* ]]; then + # mac ships with a different version of sed that requires a delimiter arg + sed -i "" "$core_req_sed_pattern" $target_req_file + sed -i "" "$postgres_req_sed_pattern" $target_req_file + sed -i "" "$tests_req_sed_pattern" $target_req_file +else + sed -i "$core_req_sed_pattern" $target_req_file + sed -i "$postgres_req_sed_pattern" $target_req_file + sed -i "$tests_req_sed_pattern" $target_req_file +fi +core_version=$(curl "https://raw.githubusercontent.com/dbt-labs/dbt-core/${git_branch}/core/dbt/version.py" | grep "__version__ = *"|cut -d'=' -f2) +bumpversion --allow-dirty --new-version "$core_version" major diff --git a/.github/scripts/update_release_branch.sh b/.github/scripts/update_release_branch.sh new file mode 100644 index 000000000..75b9ccef6 --- /dev/null +++ b/.github/scripts/update_release_branch.sh @@ -0,0 +1,11 @@ +#!/bin/bash -e +set -e + +release_branch=$1 +target_req_file=".github/workflows/nightly-release.yml" +if [[ "$OSTYPE" == darwin* ]]; then + # mac ships with a different version of sed that requires a delimiter arg + sed -i "" "s|[0-9].[0-9].latest|$release_branch|" $target_req_file +else + sed -i "s|[0-9].[0-9].latest|$release_branch|" $target_req_file +fi diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index f4c34c5fb..684bcfab5 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -18,6 +18,11 @@ on: - "releases/*" pull_request: workflow_dispatch: + inputs: + dbt-core-branch: + description: "branch of dbt-core to use 
in dev-requirements.txt" + required: false + type: string # explicitly turn off permissions for `GITHUB_TOKEN` permissions: read-all @@ -32,8 +37,60 @@ defaults: shell: bash jobs: - tests: - name: test with python ${{ matrix.python-version }} + # generate test metadata about what files changed and the testing matrix to use + test-metadata: + # run if not a PR from a forked repository or has a label to mark as safe to test + if: >- + github.event_name != 'pull_request_target' || + github.event.pull_request.head.repo.full_name == github.repository || + contains(github.event.pull_request.labels.*.name, 'ok to test') + runs-on: ubuntu-latest + + outputs: + matrix: ${{ steps.generate-matrix.outputs.result }} + run-python-tests: ${{ steps.filter.outputs.bigquery-python }} + + steps: + - name: Check out the repository (non-PR) + if: github.event_name != 'pull_request_target' + uses: actions/checkout@v3 + with: + persist-credentials: false + + - name: Check out the repository (PR) + if: github.event_name == 'pull_request_target' + uses: actions/checkout@v3 + with: + persist-credentials: false + ref: ${{ github.event.pull_request.head.sha }} + - name: Check if relevant files changed + if: github.event_name == 'pull_request_target' + # https://github.com/marketplace/actions/paths-changes-filter + # For each filter, it sets output variable named by the filter to the text: + # 'true' - if any of changed files matches any of filter rules + # 'false' - if none of changed files matches any of filter rules + # also, returns: + # `changes` - JSON array with names of all filters matching any of the changed files + uses: dorny/paths-filter@v2 + id: get-changes + with: + token: ${{ secrets.GITHUB_TOKEN }} + filters: | + spark: + - 'dbt/**' + - 'tests/**' + - 'dev-requirements.txt' + local-tests: + name: test spark local against python ${{ matrix.python-version }} + + # run if not a PR from a forked repository or has a label to mark as safe to test + # also checks that the matrix generated is not empty + if: >- + ( + github.event_name != 'pull_request_target' || + github.event.pull_request.head.repo.full_name == github.repository || + contains(github.event.pull_request.labels.*.name, 'ok to test') + ) runs-on: ubuntu-latest @@ -45,31 +102,149 @@ jobs: - "3.9" - "3.10" - "3.11" + test: + - "spark-thrift" + - "spark-session" env: - TOXENV: "unit" PYTEST_ADDOPTS: "-v --color=yes --csv test_results.csv" + DBT_INVOCATION_ENV: github-actions + DD_CIVISIBILITY_AGENTLESS_ENABLED: true + DD_API_KEY: ${{ secrets.DATADOG_API_KEY }} + DD_SITE: datadoghq.com + DD_ENV: ci + DD_SERVICE: ${{ github.event.repository.name }} steps: - name: Check out the repository + if: github.event_name != 'pull_request_target' uses: actions/checkout@v3 + with: + persist-credentials: false + + # explicity checkout the branch for the PR, + # this is necessary for the `pull_request_target` event + - name: Check out the repository (PR) + if: github.event_name == 'pull_request_target' + uses: actions/checkout@v3 + with: + persist-credentials: false + ref: ${{ github.event.pull_request.head.sha }} - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} + - name: Install python dependencies + run: | + python -m pip install --user --upgrade pip + python -m pip install tox + python -m pip --version + tox --version + + - name: Update dev_requirements.txt + if: inputs.dbt-core-branch != '' + run: | + pip install bumpversion + ./.github/scripts/update_dbt_core_branch.sh 
${{ inputs.dbt-core-branch }} + - uses: isbang/compose-action@v1.5.1 + if: ${{ matrix.test == 'spark-thrift'}} with: compose-file: "./docker-compose.yml" - - name: Install tox + - name: Run tox for Spark ${{ matrix.test }} + run: tox -e integration-${{ matrix.test }} + + databricks-tests: + name: test spark databricks against python ${{ matrix.python-version }} + # run if not a PR from a forked repository or has a label to mark as safe to test + # also checks that the matrix generated is not empty + if: >- + ( + github.event_name != 'pull_request_target' || + github.event.pull_request.head.repo.full_name == github.repository || + contains(github.event.pull_request.labels.*.name, 'ok to test') + ) + + runs-on: ubuntu-latest + container: + image: "fishtownanalytics/test-container:10" + strategy: + fail-fast: false + matrix: + python-version: + - "3.8" + - "3.9" + - "3.10" + - "3.11" + test: + - "databricks-odbc-sql-endpoint" + - "databricks-odbc-cluster" + - "spark-databricks-http" + + env: + PYTEST_ADDOPTS: "-v --color=yes --csv test_results.csv" + DBT_INVOCATION_ENV: github-actions + DD_CIVISIBILITY_AGENTLESS_ENABLED: true + DD_API_KEY: ${{ secrets.DATADOG_API_KEY }} + DD_SITE: datadoghq.com + DD_ENV: ci + DD_SERVICE: ${{ github.event.repository.name }} + DBT_DATABRICKS_CLUSTER_NAME: ${{ secrets.DBT_DATABRICKS_CLUSTER_NAME }} + DBT_DATABRICKS_HOSTNAME: ${{ secrets.DBT_DATABRICKS_HOST }} + DBT_DATABRICKS_ENDPOINT: ${{ secrets.DBT_DATABRICKS_ENDPOINT }} + DBT_DATABRICKS_TOKEN: ${{ secrets.DBT_DATABRICKS_TOKEN }} + DBT_DATABRICS_USER: ${{ secrets.DBT_DATABRICKS_USER }} + + steps: + - name: Check out the repository + if: github.event_name != 'pull_request_target' + uses: actions/checkout@v3 + with: + persist-credentials: false + + # explicity checkout the branch for the PR, + # this is necessary for the `pull_request_target` event + - name: Check out the repository (PR) + if: github.event_name == 'pull_request_target' + uses: actions/checkout@v3 + with: + persist-credentials: false + ref: ${{ github.event.pull_request.head.sha }} + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Install python dependencies run: | - python -m pip install --upgrade pip + python -m pip install --user --upgrade pip python -m pip install tox + python -m pip --version + tox --version - - name: Run tox for Spark session - run: tox -e integration-spark-session + - name: Update dev_requirements.txt + if: inputs.dbt-core-branch != '' + run: | + pip install bumpversion + ./.github/scripts/update_dbt_core_branch.sh ${{ inputs.dbt-core-branch }} - - name: Run tox for Spark thrift - run: tox -e integration-spark-thrift + - name: Configure ODBC + if: ${{ matrix.test != "spark-databricks-http" }} + run: | + apt-get update && apt-get install -y --no-install-recommends \ + g++ \ + unixodbc-dev \ + unzip + + unzip /tmp/simba_odbc.zip -d /tmp/ \ + && dpkg -i /tmp/SimbaSparkODBC-*/*.deb \ + && echo "[Simba]\nDriver = /opt/simba/spark/lib/64/libsparkodbc_sb64.so" >> /etc/odbcinst.ini \ + && rm /tmp/simba_odbc.zip \ + && rm -rf /tmp/SimbaSparkODBC* + + - name: Run tox for Spark ${{ matrix.test }} + run: tox -e integration-${{ matrix.test }} \ No newline at end of file From fd54d7f78ccc3b42ac12d7b3f95b99992996e606 Mon Sep 17 00:00:00 2001 From: Colin Date: Mon, 23 Oct 2023 09:47:53 -0700 Subject: [PATCH 017/102] use single quotes --- .github/workflows/integration.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 684bcfab5..a37744ca2 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -233,7 +233,7 @@ jobs: ./.github/scripts/update_dbt_core_branch.sh ${{ inputs.dbt-core-branch }} - name: Configure ODBC - if: ${{ matrix.test != "spark-databricks-http" }} + if: ${{ matrix.test != 'spark-databricks-http' }} run: | apt-get update && apt-get install -y --no-install-recommends \ g++ \ From 8de83390c8b7c4a169df33982cc61b59337e1dc2 Mon Sep 17 00:00:00 2001 From: Colin Date: Mon, 23 Oct 2023 09:53:06 -0700 Subject: [PATCH 018/102] remove `_target` suffix --- .github/workflows/integration.yml | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index a37744ca2..f33ade986 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -41,7 +41,7 @@ jobs: test-metadata: # run if not a PR from a forked repository or has a label to mark as safe to test if: >- - github.event_name != 'pull_request_target' || + github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository || contains(github.event.pull_request.labels.*.name, 'ok to test') runs-on: ubuntu-latest @@ -52,19 +52,19 @@ jobs: steps: - name: Check out the repository (non-PR) - if: github.event_name != 'pull_request_target' + if: github.event_name != 'pull_request' uses: actions/checkout@v3 with: persist-credentials: false - name: Check out the repository (PR) - if: github.event_name == 'pull_request_target' + if: github.event_name == 'pull_request' uses: actions/checkout@v3 with: persist-credentials: false ref: ${{ github.event.pull_request.head.sha }} - name: Check if relevant files changed - if: github.event_name == 'pull_request_target' + if: github.event_name == 'pull_request' # https://github.com/marketplace/actions/paths-changes-filter # For each filter, it sets output variable named by the filter to the text: # 'true' - if any of changed files matches any of filter rules @@ -87,7 +87,7 @@ jobs: # also checks that the matrix generated is not empty if: >- ( - github.event_name != 'pull_request_target' || + github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository || contains(github.event.pull_request.labels.*.name, 'ok to test') ) @@ -99,9 +99,6 @@ jobs: matrix: python-version: - "3.8" - - "3.9" - - "3.10" - - "3.11" test: - "spark-thrift" - "spark-session" @@ -117,15 +114,15 @@ jobs: steps: - name: Check out the repository - if: github.event_name != 'pull_request_target' + if: github.event_name != 'pull_request' uses: actions/checkout@v3 with: persist-credentials: false # explicity checkout the branch for the PR, - # this is necessary for the `pull_request_target` event + # this is necessary for the `pull_request` event - name: Check out the repository (PR) - if: github.event_name == 'pull_request_target' + if: github.event_name == 'pull_request' uses: actions/checkout@v3 with: persist-credentials: false @@ -163,7 +160,7 @@ jobs: # also checks that the matrix generated is not empty if: >- ( - github.event_name != 'pull_request_target' || + github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository || contains(github.event.pull_request.labels.*.name, 'ok to test') ) @@ -176,9 +173,6 @@ jobs: matrix: python-version: - "3.8" - - "3.9" - - "3.10" - - 
"3.11" test: - "databricks-odbc-sql-endpoint" - "databricks-odbc-cluster" @@ -200,15 +194,15 @@ jobs: steps: - name: Check out the repository - if: github.event_name != 'pull_request_target' + if: github.event_name != 'pull_request' uses: actions/checkout@v3 with: persist-credentials: false # explicity checkout the branch for the PR, - # this is necessary for the `pull_request_target` event + # this is necessary for the `pull_request` event - name: Check out the repository (PR) - if: github.event_name == 'pull_request_target' + if: github.event_name == 'pull_request' uses: actions/checkout@v3 with: persist-credentials: false From e85232f3e476f4f80dfe188f3395612589245f7b Mon Sep 17 00:00:00 2001 From: Colin Date: Mon, 23 Oct 2023 10:33:28 -0700 Subject: [PATCH 019/102] add comment to test --- .github/workflows/integration.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index f33ade986..b4f78a1c9 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -63,6 +63,7 @@ jobs: with: persist-credentials: false ref: ${{ github.event.pull_request.head.sha }} + - name: Check if relevant files changed if: github.event_name == 'pull_request' # https://github.com/marketplace/actions/paths-changes-filter @@ -80,6 +81,7 @@ jobs: - 'dbt/**' - 'tests/**' - 'dev-requirements.txt' + local-tests: name: test spark local against python ${{ matrix.python-version }} From fe3300e22b830b4f78c6e9877ff8521ccc838019 Mon Sep 17 00:00:00 2001 From: Colin Date: Mon, 23 Oct 2023 10:58:39 -0700 Subject: [PATCH 020/102] specify container user as root --- .github/workflows/integration.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index b4f78a1c9..4f45fc6ae 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -170,6 +170,7 @@ jobs: runs-on: ubuntu-latest container: image: "fishtownanalytics/test-container:10" + options: --user root strategy: fail-fast: false matrix: From b37e14b9dc2c0279d669c2a8fcb8b098834cd27b Mon Sep 17 00:00:00 2001 From: Colin Date: Mon, 23 Oct 2023 10:59:08 -0700 Subject: [PATCH 021/102] formatting --- .github/workflows/integration.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 4f45fc6ae..72a86c92e 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -236,7 +236,7 @@ jobs: g++ \ unixodbc-dev \ unzip - + unzip /tmp/simba_odbc.zip -d /tmp/ \ && dpkg -i /tmp/SimbaSparkODBC-*/*.deb \ && echo "[Simba]\nDriver = /opt/simba/spark/lib/64/libsparkodbc_sb64.so" >> /etc/odbcinst.ini \ @@ -244,4 +244,4 @@ jobs: && rm -rf /tmp/SimbaSparkODBC* - name: Run tox for Spark ${{ matrix.test }} - run: tox -e integration-${{ matrix.test }} \ No newline at end of file + run: tox -e integration-${{ matrix.test }} From 51511ecfee08958080dbb0a9c8dbe881bec7c9b3 Mon Sep 17 00:00:00 2001 From: Colin Date: Mon, 23 Oct 2023 11:01:03 -0700 Subject: [PATCH 022/102] remove python setup for pre-existing container --- .github/workflows/integration.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 72a86c92e..288c7ea18 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -211,11 +211,6 @@ jobs: persist-credentials: false ref: ${{ 
github.event.pull_request.head.sha }} - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - - name: Install python dependencies run: | python -m pip install --user --upgrade pip From 98607b61458199d006ce8526e763bcc89f5426a6 Mon Sep 17 00:00:00 2001 From: Colin Date: Mon, 23 Oct 2023 11:21:38 -0700 Subject: [PATCH 023/102] download simba --- .github/workflows/integration.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 288c7ea18..5f6e4b45b 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -230,8 +230,13 @@ jobs: apt-get update && apt-get install -y --no-install-recommends \ g++ \ unixodbc-dev \ + curl \ unzip + curl -OL \ + https://databricks.com/wp-content/uploads/drivers-2020/SimbaSparkODBC-2.6.16.1019-Debian-64bit.zip \ + /tmp/simba_odbc.zip + unzip /tmp/simba_odbc.zip -d /tmp/ \ && dpkg -i /tmp/SimbaSparkODBC-*/*.deb \ && echo "[Simba]\nDriver = /opt/simba/spark/lib/64/libsparkodbc_sb64.so" >> /etc/odbcinst.ini \ From e6ec41460d986cc552fa46024be471147f152920 Mon Sep 17 00:00:00 2001 From: Colin Date: Mon, 23 Oct 2023 11:31:45 -0700 Subject: [PATCH 024/102] fix curl call --- .github/workflows/integration.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 5f6e4b45b..764038394 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -233,9 +233,9 @@ jobs: curl \ unzip - curl -OL \ - https://databricks.com/wp-content/uploads/drivers-2020/SimbaSparkODBC-2.6.16.1019-Debian-64bit.zip \ - /tmp/simba_odbc.zip + curl --create-dirs -OL \ + --output-dir "/tmp/simba_odbc.zip" \ + "https://databricks.com/wp-content/uploads/drivers-2020/SimbaSparkODBC-2.6.16.1019-Debian-64bit.zip" unzip /tmp/simba_odbc.zip -d /tmp/ \ && dpkg -i /tmp/SimbaSparkODBC-*/*.deb \ From 05a2c0858434686ecc5f64ac4dd3d0bc3344c325 Mon Sep 17 00:00:00 2001 From: Colin Date: Mon, 23 Oct 2023 11:33:31 -0700 Subject: [PATCH 025/102] fix curl call --- .github/workflows/integration.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 764038394..9fcd701fe 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -230,8 +230,9 @@ jobs: apt-get update && apt-get install -y --no-install-recommends \ g++ \ unixodbc-dev \ - curl \ unzip + + apt-get install curl curl --create-dirs -OL \ --output-dir "/tmp/simba_odbc.zip" \ From a89ec581eff88b1c24a1da3cebd19c8981b6cd88 Mon Sep 17 00:00:00 2001 From: Colin Date: Mon, 23 Oct 2023 11:35:13 -0700 Subject: [PATCH 026/102] fix curl call --- .github/workflows/integration.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 9fcd701fe..f8fa81ceb 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -232,7 +232,7 @@ jobs: unixodbc-dev \ unzip - apt-get install curl + apt-get install -y curl curl --create-dirs -OL \ --output-dir "/tmp/simba_odbc.zip" \ From 2a18fad185a748cb9ac82198653d97b7f3a5b597 Mon Sep 17 00:00:00 2001 From: Colin Date: Mon, 23 Oct 2023 11:39:53 -0700 Subject: [PATCH 027/102] fix curl call --- .github/workflows/integration.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/.github/workflows/integration.yml b/.github/workflows/integration.yml index f8fa81ceb..be6443a13 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -226,6 +226,7 @@ jobs: - name: Configure ODBC if: ${{ matrix.test != 'spark-databricks-http' }} + shell: bash run: | apt-get update && apt-get install -y --no-install-recommends \ g++ \ @@ -235,7 +236,7 @@ jobs: apt-get install -y curl curl --create-dirs -OL \ - --output-dir "/tmp/simba_odbc.zip" \ + --output "/tmp/simba_odbc.zip" \ "https://databricks.com/wp-content/uploads/drivers-2020/SimbaSparkODBC-2.6.16.1019-Debian-64bit.zip" unzip /tmp/simba_odbc.zip -d /tmp/ \ From 1481396d6307b93f0b21aed722a6299bb50d29ba Mon Sep 17 00:00:00 2001 From: Colin Date: Mon, 23 Oct 2023 11:44:48 -0700 Subject: [PATCH 028/102] fix curl call --- .github/workflows/integration.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index be6443a13..a47d5271f 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -235,7 +235,7 @@ jobs: apt-get install -y curl - curl --create-dirs -OL \ + curl --create-dirs \ --output "/tmp/simba_odbc.zip" \ "https://databricks.com/wp-content/uploads/drivers-2020/SimbaSparkODBC-2.6.16.1019-Debian-64bit.zip" From 31b427c47b6c064ba284b91818d964b3b03eff3a Mon Sep 17 00:00:00 2001 From: Colin Date: Mon, 23 Oct 2023 11:50:51 -0700 Subject: [PATCH 029/102] fix curl call --- .github/workflows/integration.yml | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index a47d5271f..d9e71d5e6 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -234,16 +234,14 @@ jobs: unzip apt-get install -y curl + rm -rf /tmp && mkdir /tmp + + curl -OL "https://databricks.com/wp-content/uploads/drivers-2020/SimbaSparkODBC-2.6.16.1019-Debian-64bit.zip" \ - curl --create-dirs \ - --output "/tmp/simba_odbc.zip" \ - "https://databricks.com/wp-content/uploads/drivers-2020/SimbaSparkODBC-2.6.16.1019-Debian-64bit.zip" - - unzip /tmp/simba_odbc.zip -d /tmp/ \ + unzip SimbaSparkODBC-2.6.16.1019-Debian-64bit.zip -d /tmp/ \ && dpkg -i /tmp/SimbaSparkODBC-*/*.deb \ && echo "[Simba]\nDriver = /opt/simba/spark/lib/64/libsparkodbc_sb64.so" >> /etc/odbcinst.ini \ - && rm /tmp/simba_odbc.zip \ - && rm -rf /tmp/SimbaSparkODBC* + && rm -rf /tmp - name: Run tox for Spark ${{ matrix.test }} run: tox -e integration-${{ matrix.test }} From 15ba1da4adcb33dedec541dcdda6e0e1de1728a2 Mon Sep 17 00:00:00 2001 From: Colin Date: Mon, 23 Oct 2023 12:00:02 -0700 Subject: [PATCH 030/102] fix db test naming --- .github/workflows/integration.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index d9e71d5e6..ff48a9b30 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -177,8 +177,8 @@ jobs: python-version: - "3.8" test: - - "databricks-odbc-sql-endpoint" - - "databricks-odbc-cluster" + - "spark-databricks-odbc-sql-endpoint" + - "spark-databricks-odbc-cluster" - "spark-databricks-http" env: @@ -190,7 +190,7 @@ jobs: DD_ENV: ci DD_SERVICE: ${{ github.event.repository.name }} DBT_DATABRICKS_CLUSTER_NAME: ${{ secrets.DBT_DATABRICKS_CLUSTER_NAME }} - DBT_DATABRICKS_HOSTNAME: ${{ secrets.DBT_DATABRICKS_HOST }} + DBT_DATABRICKS_HOST_NAME: ${{ secrets.DBT_DATABRICKS_HOST }} 
DBT_DATABRICKS_ENDPOINT: ${{ secrets.DBT_DATABRICKS_ENDPOINT }} DBT_DATABRICKS_TOKEN: ${{ secrets.DBT_DATABRICKS_TOKEN }} DBT_DATABRICS_USER: ${{ secrets.DBT_DATABRICKS_USER }} From ca33a236ebbdd9fa9cef5b1a703b0002b03257fe Mon Sep 17 00:00:00 2001 From: Colin Date: Mon, 23 Oct 2023 13:52:37 -0700 Subject: [PATCH 031/102] confirm ODBC driver installed --- .github/workflows/integration.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index ff48a9b30..a8a131a61 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -241,7 +241,9 @@ jobs: unzip SimbaSparkODBC-2.6.16.1019-Debian-64bit.zip -d /tmp/ \ && dpkg -i /tmp/SimbaSparkODBC-*/*.deb \ && echo "[Simba]\nDriver = /opt/simba/spark/lib/64/libsparkodbc_sb64.so" >> /etc/odbcinst.ini \ - && rm -rf /tmp + && rm -rf /tmp \ + && dpkg -l | grep Simba # confirm that the driver is installed + - name: Run tox for Spark ${{ matrix.test }} run: tox -e integration-${{ matrix.test }} From 6274d77151ba32cb4b45abddb300603d88d860c6 Mon Sep 17 00:00:00 2001 From: Colin Date: Mon, 23 Oct 2023 14:17:52 -0700 Subject: [PATCH 032/102] add odbc driver env var --- .github/workflows/integration.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index a8a131a61..27f5d6bda 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -194,7 +194,7 @@ jobs: DBT_DATABRICKS_ENDPOINT: ${{ secrets.DBT_DATABRICKS_ENDPOINT }} DBT_DATABRICKS_TOKEN: ${{ secrets.DBT_DATABRICKS_TOKEN }} DBT_DATABRICS_USER: ${{ secrets.DBT_DATABRICKS_USER }} - + ODBC_DRIVER: "Simba" steps: - name: Check out the repository if: github.event_name != 'pull_request' @@ -240,7 +240,7 @@ jobs: unzip SimbaSparkODBC-2.6.16.1019-Debian-64bit.zip -d /tmp/ \ && dpkg -i /tmp/SimbaSparkODBC-*/*.deb \ - && echo "[Simba]\nDriver = /opt/simba/spark/lib/64/libsparkodbc_sb64.so" >> /etc/odbcinst.ini \ + && echo "[Simba]\nDriver = $HOME/opt/simba/spark/lib/64/libsparkodbc_sb64.so" >> /etc/odbcinst.ini \ && rm -rf /tmp \ && dpkg -l | grep Simba # confirm that the driver is installed From 0ba91a2ebc553e322fd20ff3ebb49c9aa810e656 Mon Sep 17 00:00:00 2001 From: Colin Date: Mon, 23 Oct 2023 14:40:02 -0700 Subject: [PATCH 033/102] add odbc driver env var --- .github/workflows/integration.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 27f5d6bda..1dd657085 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -240,7 +240,7 @@ jobs: unzip SimbaSparkODBC-2.6.16.1019-Debian-64bit.zip -d /tmp/ \ && dpkg -i /tmp/SimbaSparkODBC-*/*.deb \ - && echo "[Simba]\nDriver = $HOME/opt/simba/spark/lib/64/libsparkodbc_sb64.so" >> /etc/odbcinst.ini \ + && echo "[Simba]\nDriver = /opt/simba/spark/lib/64/libsparkodbc_sb64.so" >> /etc/odbcinst.ini \ && rm -rf /tmp \ && dpkg -l | grep Simba # confirm that the driver is installed From f09202681f49ac144508d4bc4c0f72460767455c Mon Sep 17 00:00:00 2001 From: Colin Date: Mon, 23 Oct 2023 15:11:32 -0700 Subject: [PATCH 034/102] specify platform --- .github/workflows/integration.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 1dd657085..38b8faa35 100644 --- a/.github/workflows/integration.yml +++ 
b/.github/workflows/integration.yml @@ -170,7 +170,7 @@ jobs: runs-on: ubuntu-latest container: image: "fishtownanalytics/test-container:10" - options: --user root + options: --user root --platform linux/amd64 strategy: fail-fast: false matrix: From b968985be43080580252b9ac38e410248103e4e6 Mon Sep 17 00:00:00 2001 From: Colin Date: Mon, 23 Oct 2023 15:47:48 -0700 Subject: [PATCH 035/102] check odbc driver integrity --- .github/workflows/integration.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 38b8faa35..61cf5a634 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -157,7 +157,7 @@ jobs: run: tox -e integration-${{ matrix.test }} databricks-tests: - name: test spark databricks against python ${{ matrix.python-version }} + name: run ${{ matrix.test }} against python ${{ matrix.python-version }} # run if not a PR from a forked repository or has a label to mark as safe to test # also checks that the matrix generated is not empty if: >- @@ -193,7 +193,6 @@ jobs: DBT_DATABRICKS_HOST_NAME: ${{ secrets.DBT_DATABRICKS_HOST }} DBT_DATABRICKS_ENDPOINT: ${{ secrets.DBT_DATABRICKS_ENDPOINT }} DBT_DATABRICKS_TOKEN: ${{ secrets.DBT_DATABRICKS_TOKEN }} - DBT_DATABRICS_USER: ${{ secrets.DBT_DATABRICKS_USER }} ODBC_DRIVER: "Simba" steps: - name: Check out the repository @@ -244,6 +243,8 @@ jobs: && rm -rf /tmp \ && dpkg -l | grep Simba # confirm that the driver is installed + ldd /opt/simba/spark/lib/64/libsparkodbc_sb64.so + - name: Run tox for Spark ${{ matrix.test }} run: tox -e integration-${{ matrix.test }} From 8a49567fcf3c9748dd75e6ff9c629759b92a4bbd Mon Sep 17 00:00:00 2001 From: Colin Date: Mon, 23 Oct 2023 15:53:42 -0700 Subject: [PATCH 036/102] add dbt user env var --- .github/workflows/integration.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 61cf5a634..41177f054 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -193,6 +193,10 @@ jobs: DBT_DATABRICKS_HOST_NAME: ${{ secrets.DBT_DATABRICKS_HOST }} DBT_DATABRICKS_ENDPOINT: ${{ secrets.DBT_DATABRICKS_ENDPOINT }} DBT_DATABRICKS_TOKEN: ${{ secrets.DBT_DATABRICKS_TOKEN }} + DBT_DATABRICKS_USERNAME: ${{ secrets.DBT_DATABRICKS_USERNAME }} + DBT_TEST_USER_1: "buildbot+dbt_test_user_1@dbtlabs.com" + DBT_TEST_USER_2: "buildbot+dbt_test_user_2@dbtlabs.com" + DBT_TEST_USER_3: "buildbot+dbt_test_user_3@dbtlabs.com" ODBC_DRIVER: "Simba" steps: - name: Check out the repository From 7723e8d90e7af6c2513b8e435ca40805591fcedc Mon Sep 17 00:00:00 2001 From: Colin Date: Mon, 23 Oct 2023 15:58:29 -0700 Subject: [PATCH 037/102] add dbt user env var --- .github/workflows/integration.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 41177f054..c91dc9bbb 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -193,7 +193,7 @@ jobs: DBT_DATABRICKS_HOST_NAME: ${{ secrets.DBT_DATABRICKS_HOST }} DBT_DATABRICKS_ENDPOINT: ${{ secrets.DBT_DATABRICKS_ENDPOINT }} DBT_DATABRICKS_TOKEN: ${{ secrets.DBT_DATABRICKS_TOKEN }} - DBT_DATABRICKS_USERNAME: ${{ secrets.DBT_DATABRICKS_USERNAME }} + DBT_DATABRICKS_USER: ${{ secrets.DBT_DATABRICKS_USERNAME }} DBT_TEST_USER_1: "buildbot+dbt_test_user_1@dbtlabs.com" DBT_TEST_USER_2: "buildbot+dbt_test_user_2@dbtlabs.com" DBT_TEST_USER_3: 
"buildbot+dbt_test_user_3@dbtlabs.com" From ea5ebfa32a90c909cbbc87e79bd094eb16030a1d Mon Sep 17 00:00:00 2001 From: Colin Date: Mon, 23 Oct 2023 16:42:13 -0700 Subject: [PATCH 038/102] fix host_name env var --- .github/workflows/integration.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index c91dc9bbb..5ee981c45 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -190,7 +190,7 @@ jobs: DD_ENV: ci DD_SERVICE: ${{ github.event.repository.name }} DBT_DATABRICKS_CLUSTER_NAME: ${{ secrets.DBT_DATABRICKS_CLUSTER_NAME }} - DBT_DATABRICKS_HOST_NAME: ${{ secrets.DBT_DATABRICKS_HOST }} + DBT_DATABRICKS_HOST_NAME: ${{ secrets.DBT_DATABRICKS_HOST_NAME }} DBT_DATABRICKS_ENDPOINT: ${{ secrets.DBT_DATABRICKS_ENDPOINT }} DBT_DATABRICKS_TOKEN: ${{ secrets.DBT_DATABRICKS_TOKEN }} DBT_DATABRICKS_USER: ${{ secrets.DBT_DATABRICKS_USERNAME }} From 610e5e912bebdcf105fcd64f777a035983fbffcb Mon Sep 17 00:00:00 2001 From: Colin Date: Tue, 24 Oct 2023 09:55:55 -0700 Subject: [PATCH 039/102] try removing architecture arg --- .github/workflows/integration.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 5ee981c45..631e8a6de 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -170,7 +170,7 @@ jobs: runs-on: ubuntu-latest container: image: "fishtownanalytics/test-container:10" - options: --user root --platform linux/amd64 + options: --user root strategy: fail-fast: false matrix: From b4411ab011bb285cf2d07bf0be2ff90ee185f682 Mon Sep 17 00:00:00 2001 From: Colin Date: Tue, 24 Oct 2023 10:01:01 -0700 Subject: [PATCH 040/102] swap back to pull_request_target --- .github/workflows/integration.yml | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 631e8a6de..62e276cc1 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -16,7 +16,7 @@ on: - "main" - "*.latest" - "releases/*" - pull_request: + pull_request_target: workflow_dispatch: inputs: dbt-core-branch: @@ -29,7 +29,7 @@ permissions: read-all # will cancel previous workflows triggered by the same event and for the same ref for PRs or same SHA otherwise concurrency: - group: ${{ github.workflow }}-${{ github.event_name }}-${{ contains(github.event_name, 'pull_request') && github.event.pull_request.head.ref || github.sha }} + group: ${{ github.workflow }}-${{ github.event_name }}-${{ contains(github.event_name, 'pull_request_target') && github.event.pull_request.head.ref || github.sha }} cancel-in-progress: true defaults: @@ -41,7 +41,7 @@ jobs: test-metadata: # run if not a PR from a forked repository or has a label to mark as safe to test if: >- - github.event_name != 'pull_request' || + github.event_name != 'pull_request_target' || github.event.pull_request.head.repo.full_name == github.repository || contains(github.event.pull_request.labels.*.name, 'ok to test') runs-on: ubuntu-latest @@ -52,20 +52,20 @@ jobs: steps: - name: Check out the repository (non-PR) - if: github.event_name != 'pull_request' + if: github.event_name != 'pull_request_target' uses: actions/checkout@v3 with: persist-credentials: false - name: Check out the repository (PR) - if: github.event_name == 'pull_request' + if: github.event_name == 'pull_request_target' uses: actions/checkout@v3 
with: persist-credentials: false ref: ${{ github.event.pull_request.head.sha }} - name: Check if relevant files changed - if: github.event_name == 'pull_request' + if: github.event_name == 'pull_request_target' # https://github.com/marketplace/actions/paths-changes-filter # For each filter, it sets output variable named by the filter to the text: # 'true' - if any of changed files matches any of filter rules @@ -89,7 +89,7 @@ jobs: # also checks that the matrix generated is not empty if: >- ( - github.event_name != 'pull_request' || + github.event_name != 'pull_request_target' || github.event.pull_request.head.repo.full_name == github.repository || contains(github.event.pull_request.labels.*.name, 'ok to test') ) @@ -116,7 +116,7 @@ jobs: steps: - name: Check out the repository - if: github.event_name != 'pull_request' + if: github.event_name != 'pull_request_target' uses: actions/checkout@v3 with: persist-credentials: false @@ -124,7 +124,7 @@ jobs: # explicity checkout the branch for the PR, # this is necessary for the `pull_request` event - name: Check out the repository (PR) - if: github.event_name == 'pull_request' + if: github.event_name == 'pull_request_target' uses: actions/checkout@v3 with: persist-credentials: false @@ -162,7 +162,7 @@ jobs: # also checks that the matrix generated is not empty if: >- ( - github.event_name != 'pull_request' || + github.event_name != 'pull_request_target' || github.event.pull_request.head.repo.full_name == github.repository || contains(github.event.pull_request.labels.*.name, 'ok to test') ) @@ -200,7 +200,7 @@ jobs: ODBC_DRIVER: "Simba" steps: - name: Check out the repository - if: github.event_name != 'pull_request' + if: github.event_name != 'pull_request_target' uses: actions/checkout@v3 with: persist-credentials: false @@ -208,7 +208,7 @@ jobs: # explicity checkout the branch for the PR, # this is necessary for the `pull_request` event - name: Check out the repository (PR) - if: github.event_name == 'pull_request' + if: github.event_name == 'pull_request_target' uses: actions/checkout@v3 with: persist-credentials: false From cae6c8abc0abfc57d9a17dba3c0abb0495841249 Mon Sep 17 00:00:00 2001 From: Colin Date: Tue, 24 Oct 2023 12:13:18 -0700 Subject: [PATCH 041/102] try running on host instead of container --- .github/workflows/integration.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 62e276cc1..10f9ce6f0 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -168,9 +168,6 @@ jobs: ) runs-on: ubuntu-latest - container: - image: "fishtownanalytics/test-container:10" - options: --user root strategy: fail-fast: false matrix: @@ -214,6 +211,11 @@ jobs: persist-credentials: false ref: ${{ github.event.pull_request.head.sha }} + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Install python dependencies run: | python -m pip install --user --upgrade pip From 0c689720b96d592ff2f8e8267bb5ef0e1e0a9736 Mon Sep 17 00:00:00 2001 From: colin-rogers-dbt <111200756+colin-rogers-dbt@users.noreply.github.com> Date: Tue, 24 Oct 2023 12:13:43 -0700 Subject: [PATCH 042/102] Update .github/workflows/integration.yml Co-authored-by: Emily Rockman --- .github/workflows/integration.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 
10f9ce6f0..d1829197b 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -203,7 +203,7 @@ jobs: persist-credentials: false # explicity checkout the branch for the PR, - # this is necessary for the `pull_request` event + # this is necessary for the `pull_request_target` event - name: Check out the repository (PR) if: github.event_name == 'pull_request_target' uses: actions/checkout@v3 From b2f63bd09fb59ba9f751bc425f81242afeef8bd6 Mon Sep 17 00:00:00 2001 From: Colin Date: Tue, 24 Oct 2023 14:37:41 -0700 Subject: [PATCH 043/102] try running odbcinst -j --- .github/workflows/integration.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index d1829197b..f3368d11a 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -250,7 +250,8 @@ jobs: && dpkg -l | grep Simba # confirm that the driver is installed ldd /opt/simba/spark/lib/64/libsparkodbc_sb64.so - + echo "--------------------------------------------" + odbcinst -j - name: Run tox for Spark ${{ matrix.test }} run: tox -e integration-${{ matrix.test }} From 80eb7e45e25316dfa539786975c34b6655d77e88 Mon Sep 17 00:00:00 2001 From: Colin Date: Tue, 24 Oct 2023 14:51:32 -0700 Subject: [PATCH 044/102] remove bash --- .github/workflows/integration.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index f3368d11a..45e313482 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -17,6 +17,10 @@ on: - "*.latest" - "releases/*" pull_request_target: + types: + - opened + - synchronize + - labeled workflow_dispatch: inputs: dbt-core-branch: @@ -231,7 +235,6 @@ jobs: - name: Configure ODBC if: ${{ matrix.test != 'spark-databricks-http' }} - shell: bash run: | apt-get update && apt-get install -y --no-install-recommends \ g++ \ From 4bbfa71b2c80f056a1e67c1587dbe06ac8fa3613 Mon Sep 17 00:00:00 2001 From: Colin Date: Tue, 24 Oct 2023 14:54:33 -0700 Subject: [PATCH 045/102] add sudo --- .github/workflows/integration.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 45e313482..90e2782a8 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -236,12 +236,12 @@ jobs: - name: Configure ODBC if: ${{ matrix.test != 'spark-databricks-http' }} run: | - apt-get update && apt-get install -y --no-install-recommends \ + sudo apt-get update && sudo apt-get install -y --no-install-recommends \ g++ \ unixodbc-dev \ unzip - apt-get install -y curl + sudo apt-get install -y curl rm -rf /tmp && mkdir /tmp curl -OL "https://databricks.com/wp-content/uploads/drivers-2020/SimbaSparkODBC-2.6.16.1019-Debian-64bit.zip" \ From b1d202023f10aaeb5b7742996ddcdf7ca4bc7abf Mon Sep 17 00:00:00 2001 From: Colin Date: Tue, 24 Oct 2023 14:55:46 -0700 Subject: [PATCH 046/102] add sudo --- .github/workflows/integration.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 90e2782a8..142752b66 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -247,12 +247,12 @@ jobs: curl -OL "https://databricks.com/wp-content/uploads/drivers-2020/SimbaSparkODBC-2.6.16.1019-Debian-64bit.zip" \ unzip SimbaSparkODBC-2.6.16.1019-Debian-64bit.zip -d /tmp/ \ - 
&& dpkg -i /tmp/SimbaSparkODBC-*/*.deb \ + && sudo dpkg -i /tmp/SimbaSparkODBC-*/*.deb \ && echo "[Simba]\nDriver = /opt/simba/spark/lib/64/libsparkodbc_sb64.so" >> /etc/odbcinst.ini \ && rm -rf /tmp \ - && dpkg -l | grep Simba # confirm that the driver is installed + && sudo dpkg -l | grep Simba # confirm that the driver is installed - ldd /opt/simba/spark/lib/64/libsparkodbc_sb64.so + sudo ldd /opt/simba/spark/lib/64/libsparkodbc_sb64.so echo "--------------------------------------------" odbcinst -j From 38fda3d22f8103c07ce0091a1b3b530c5d36d26f Mon Sep 17 00:00:00 2001 From: Colin Date: Tue, 24 Oct 2023 14:59:11 -0700 Subject: [PATCH 047/102] update odbc.ini --- .github/workflows/integration.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 142752b66..08f55f848 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -248,6 +248,7 @@ jobs: unzip SimbaSparkODBC-2.6.16.1019-Debian-64bit.zip -d /tmp/ \ && sudo dpkg -i /tmp/SimbaSparkODBC-*/*.deb \ + && echo "[ODBC Data Sources]\nSimba=Databricks ODBC Connector" >> /etc/odbc.ini \ && echo "[Simba]\nDriver = /opt/simba/spark/lib/64/libsparkodbc_sb64.so" >> /etc/odbcinst.ini \ && rm -rf /tmp \ && sudo dpkg -l | grep Simba # confirm that the driver is installed From 6b599a1eceb755a5ef5b91d95760b01a364f648c Mon Sep 17 00:00:00 2001 From: Colin Date: Tue, 24 Oct 2023 15:02:54 -0700 Subject: [PATCH 048/102] install libsasl2-modules-gssapi-mit --- .github/workflows/integration.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 08f55f848..6dfd716b2 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -239,6 +239,7 @@ jobs: sudo apt-get update && sudo apt-get install -y --no-install-recommends \ g++ \ unixodbc-dev \ + libsasl2-modules-gssapi-mit \ unzip sudo apt-get install -y curl From 0976c4f70fe8e36169dfb34b922c4e5cdc1f2238 Mon Sep 17 00:00:00 2001 From: Colin Date: Tue, 24 Oct 2023 15:08:07 -0700 Subject: [PATCH 049/102] install libsasl2-modules-gssapi-mit --- .github/workflows/integration.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 6dfd716b2..6807507df 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -249,8 +249,8 @@ jobs: unzip SimbaSparkODBC-2.6.16.1019-Debian-64bit.zip -d /tmp/ \ && sudo dpkg -i /tmp/SimbaSparkODBC-*/*.deb \ - && echo "[ODBC Data Sources]\nSimba=Databricks ODBC Connector" >> /etc/odbc.ini \ - && echo "[Simba]\nDriver = /opt/simba/spark/lib/64/libsparkodbc_sb64.so" >> /etc/odbcinst.ini \ + && sudo echo "[ODBC Data Sources]\nSimba=Databricks ODBC Connector" >> /etc/odbc.ini \ + && sudo echo "[Simba]\nDriver = /opt/simba/spark/lib/64/libsparkodbc_sb64.so" >> /etc/odbcinst.ini \ && rm -rf /tmp \ && sudo dpkg -l | grep Simba # confirm that the driver is installed From 42f2784210514349c14b54dcba673139f0226470 Mon Sep 17 00:00:00 2001 From: Colin Date: Tue, 24 Oct 2023 15:09:23 -0700 Subject: [PATCH 050/102] set -e on odbc install --- .github/workflows/integration.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 6807507df..235fb49e2 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -236,6 +236,7 @@ jobs: - name: Configure ODBC if: ${{ 
matrix.test != 'spark-databricks-http' }} run: | + set -e sudo apt-get update && sudo apt-get install -y --no-install-recommends \ g++ \ unixodbc-dev \ From 4f11291045081be0c2975772475b917ee24e4173 Mon Sep 17 00:00:00 2001 From: Colin Date: Tue, 24 Oct 2023 15:13:07 -0700 Subject: [PATCH 051/102] set -e on odbc install --- .github/workflows/integration.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 235fb49e2..92794b427 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -248,12 +248,12 @@ jobs: curl -OL "https://databricks.com/wp-content/uploads/drivers-2020/SimbaSparkODBC-2.6.16.1019-Debian-64bit.zip" \ - unzip SimbaSparkODBC-2.6.16.1019-Debian-64bit.zip -d /tmp/ \ - && sudo dpkg -i /tmp/SimbaSparkODBC-*/*.deb \ - && sudo echo "[ODBC Data Sources]\nSimba=Databricks ODBC Connector" >> /etc/odbc.ini \ - && sudo echo "[Simba]\nDriver = /opt/simba/spark/lib/64/libsparkodbc_sb64.so" >> /etc/odbcinst.ini \ - && rm -rf /tmp \ - && sudo dpkg -l | grep Simba # confirm that the driver is installed + unzip SimbaSparkODBC-2.6.16.1019-Debian-64bit.zip -d /tmp/ + sudo dpkg -i /tmp/SimbaSparkODBC-*/*.deb \ + echo "--------------------------------------------" + sudo echo "[Simba]\nDriver = /opt/simba/spark/lib/64/libsparkodbc_sb64.so" >> /etc/odbcinst.ini \ + rm -rf /tmp \ + sudo dpkg -l | grep Simba # confirm that the driver is installed sudo ldd /opt/simba/spark/lib/64/libsparkodbc_sb64.so echo "--------------------------------------------" From 1384084e4d08c3b3c9b449229192685eb90c96e0 Mon Sep 17 00:00:00 2001 From: Colin Date: Tue, 24 Oct 2023 15:14:20 -0700 Subject: [PATCH 052/102] set -e on odbc install --- .github/workflows/integration.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 92794b427..e76a5d9ac 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -249,7 +249,7 @@ jobs: curl -OL "https://databricks.com/wp-content/uploads/drivers-2020/SimbaSparkODBC-2.6.16.1019-Debian-64bit.zip" \ unzip SimbaSparkODBC-2.6.16.1019-Debian-64bit.zip -d /tmp/ - sudo dpkg -i /tmp/SimbaSparkODBC-*/*.deb \ + sudo dpkg -i /tmp/SimbaSparkODBC-*/*.deb echo "--------------------------------------------" sudo echo "[Simba]\nDriver = /opt/simba/spark/lib/64/libsparkodbc_sb64.so" >> /etc/odbcinst.ini \ rm -rf /tmp \ From 543e321077ed193d05e60a3c3acaba7aca2c0e37 Mon Sep 17 00:00:00 2001 From: Colin Date: Tue, 24 Oct 2023 15:21:08 -0700 Subject: [PATCH 053/102] sudo echo odbc.inst --- .github/workflows/integration.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index e76a5d9ac..da40dde86 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -251,8 +251,9 @@ jobs: unzip SimbaSparkODBC-2.6.16.1019-Debian-64bit.zip -d /tmp/ sudo dpkg -i /tmp/SimbaSparkODBC-*/*.deb echo "--------------------------------------------" - sudo echo "[Simba]\nDriver = /opt/simba/spark/lib/64/libsparkodbc_sb64.so" >> /etc/odbcinst.ini \ - rm -rf /tmp \ + sudo sh -c echo "[Simba]\nDriver = /opt/simba/spark/lib/64/libsparkodbc_sb64.so" >> /etc/odbcinst.ini \ + + rm -rf /tmp sudo dpkg -l | grep Simba # confirm that the driver is installed sudo ldd /opt/simba/spark/lib/64/libsparkodbc_sb64.so From f380d46a99205051d1bac84d4741009fb5f1de77 Mon 
Sep 17 00:00:00 2001 From: Mike Alfare Date: Wed, 1 Nov 2023 20:19:17 -0400 Subject: [PATCH 054/102] remove postgres components --- .github/scripts/update_dbt_core_branch.sh | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/scripts/update_dbt_core_branch.sh b/.github/scripts/update_dbt_core_branch.sh index d28a40c35..1a5a5c2d7 100755 --- a/.github/scripts/update_dbt_core_branch.sh +++ b/.github/scripts/update_dbt_core_branch.sh @@ -4,16 +4,13 @@ set -e git_branch=$1 target_req_file="dev-requirements.txt" core_req_sed_pattern="s|dbt-core.git.*#egg=dbt-core|dbt-core.git@${git_branch}#egg=dbt-core|g" -postgres_req_sed_pattern="s|dbt-core.git.*#egg=dbt-postgres|dbt-core.git@${git_branch}#egg=dbt-postgres|g" tests_req_sed_pattern="s|dbt-core.git.*#egg=dbt-tests|dbt-core.git@${git_branch}#egg=dbt-tests|g" if [[ "$OSTYPE" == darwin* ]]; then # mac ships with a different version of sed that requires a delimiter arg sed -i "" "$core_req_sed_pattern" $target_req_file - sed -i "" "$postgres_req_sed_pattern" $target_req_file sed -i "" "$tests_req_sed_pattern" $target_req_file else sed -i "$core_req_sed_pattern" $target_req_file - sed -i "$postgres_req_sed_pattern" $target_req_file sed -i "$tests_req_sed_pattern" $target_req_file fi core_version=$(curl "https://raw.githubusercontent.com/dbt-labs/dbt-core/${git_branch}/core/dbt/version.py" | grep "__version__ = *"|cut -d'=' -f2) From c334f3273bd7dda434d9bb4dac0f57579c2117d7 Mon Sep 17 00:00:00 2001 From: Mike Alfare Date: Wed, 1 Nov 2023 20:23:18 -0400 Subject: [PATCH 055/102] remove release related items --- .github/scripts/update_release_branch.sh | 11 ----------- 1 file changed, 11 deletions(-) delete mode 100644 .github/scripts/update_release_branch.sh diff --git a/.github/scripts/update_release_branch.sh b/.github/scripts/update_release_branch.sh deleted file mode 100644 index 75b9ccef6..000000000 --- a/.github/scripts/update_release_branch.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash -e -set -e - -release_branch=$1 -target_req_file=".github/workflows/nightly-release.yml" -if [[ "$OSTYPE" == darwin* ]]; then - # mac ships with a different version of sed that requires a delimiter arg - sed -i "" "s|[0-9].[0-9].latest|$release_branch|" $target_req_file -else - sed -i "s|[0-9].[0-9].latest|$release_branch|" $target_req_file -fi From 19dcff3f4f44c99ab4c4e3ad8872597a5185cefa Mon Sep 17 00:00:00 2001 From: Mike Alfare Date: Wed, 1 Nov 2023 20:33:12 -0400 Subject: [PATCH 056/102] remove irrelevant output --- .github/workflows/integration.yml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index da40dde86..b85e058e2 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -52,7 +52,6 @@ jobs: outputs: matrix: ${{ steps.generate-matrix.outputs.result }} - run-python-tests: ${{ steps.filter.outputs.bigquery-python }} steps: - name: Check out the repository (non-PR) @@ -242,20 +241,20 @@ jobs: unixodbc-dev \ libsasl2-modules-gssapi-mit \ unzip - + sudo apt-get install -y curl rm -rf /tmp && mkdir /tmp - + curl -OL "https://databricks.com/wp-content/uploads/drivers-2020/SimbaSparkODBC-2.6.16.1019-Debian-64bit.zip" \ unzip SimbaSparkODBC-2.6.16.1019-Debian-64bit.zip -d /tmp/ sudo dpkg -i /tmp/SimbaSparkODBC-*/*.deb echo "--------------------------------------------" sudo sh -c echo "[Simba]\nDriver = /opt/simba/spark/lib/64/libsparkodbc_sb64.so" >> /etc/odbcinst.ini \ - + rm -rf /tmp sudo dpkg -l | grep Simba # confirm 
that the driver is installed - + sudo ldd /opt/simba/spark/lib/64/libsparkodbc_sb64.so echo "--------------------------------------------" odbcinst -j From 01b0c0cdd74b88e92c7f44d58e092e356ed01b00 Mon Sep 17 00:00:00 2001 From: Mike Alfare Date: Wed, 1 Nov 2023 20:37:19 -0400 Subject: [PATCH 057/102] move long bash script into its own file --- .github/scripts/configure_odbc.sh | 23 +++++++++++++++++++++++ .github/workflows/integration.yml | 24 +----------------------- 2 files changed, 24 insertions(+), 23 deletions(-) create mode 100644 .github/scripts/configure_odbc.sh diff --git a/.github/scripts/configure_odbc.sh b/.github/scripts/configure_odbc.sh new file mode 100644 index 000000000..e2bad8886 --- /dev/null +++ b/.github/scripts/configure_odbc.sh @@ -0,0 +1,23 @@ +set -e +sudo apt-get update && sudo apt-get install -y --no-install-recommends \ + g++ \ + unixodbc-dev \ + libsasl2-modules-gssapi-mit \ + unzip + +sudo apt-get install -y curl +rm -rf /tmp && mkdir /tmp + +curl -OL "https://databricks.com/wp-content/uploads/drivers-2020/SimbaSparkODBC-2.6.16.1019-Debian-64bit.zip" + +unzip SimbaSparkODBC-2.6.16.1019-Debian-64bit.zip -d /tmp/ +sudo dpkg -i /tmp/SimbaSparkODBC-*/*.deb +echo "--------------------------------------------" +sudo sh -c echo "[Simba]\nDriver = /opt/simba/spark/lib/64/libsparkodbc_sb64.so" >> /etc/odbcinst.ini + +rm -rf /tmp +sudo dpkg -l | grep Simba # confirm that the driver is installed + +sudo ldd /opt/simba/spark/lib/64/libsparkodbc_sb64.so +echo "--------------------------------------------" +odbcinst -j diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index b85e058e2..b9d6ddcbe 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -235,29 +235,7 @@ jobs: - name: Configure ODBC if: ${{ matrix.test != 'spark-databricks-http' }} run: | - set -e - sudo apt-get update && sudo apt-get install -y --no-install-recommends \ - g++ \ - unixodbc-dev \ - libsasl2-modules-gssapi-mit \ - unzip - - sudo apt-get install -y curl - rm -rf /tmp && mkdir /tmp - - curl -OL "https://databricks.com/wp-content/uploads/drivers-2020/SimbaSparkODBC-2.6.16.1019-Debian-64bit.zip" \ - - unzip SimbaSparkODBC-2.6.16.1019-Debian-64bit.zip -d /tmp/ - sudo dpkg -i /tmp/SimbaSparkODBC-*/*.deb - echo "--------------------------------------------" - sudo sh -c echo "[Simba]\nDriver = /opt/simba/spark/lib/64/libsparkodbc_sb64.so" >> /etc/odbcinst.ini \ - - rm -rf /tmp - sudo dpkg -l | grep Simba # confirm that the driver is installed - - sudo ldd /opt/simba/spark/lib/64/libsparkodbc_sb64.so - echo "--------------------------------------------" - odbcinst -j + ./.github/scripts/configure_odbc.sh - name: Run tox for Spark ${{ matrix.test }} run: tox -e integration-${{ matrix.test }} From d3d28446b87595580380136c4cc42a369e38a069 Mon Sep 17 00:00:00 2001 From: Mike Alfare Date: Wed, 1 Nov 2023 20:58:25 -0400 Subject: [PATCH 058/102] update integration.yml to align with other adapters --- .github/workflows/integration.yml | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index b9d6ddcbe..6bdee8c32 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -8,7 +8,7 @@ # This will run for all PRs, when code is pushed to a release # branch, and when manually triggered. 
-name: Integration tests +name: Adapter Integration Tests on: push: @@ -49,7 +49,6 @@ jobs: github.event.pull_request.head.repo.full_name == github.repository || contains(github.event.pull_request.labels.*.name, 'ok to test') runs-on: ubuntu-latest - outputs: matrix: ${{ steps.generate-matrix.outputs.result }} @@ -86,7 +85,7 @@ jobs: - 'dev-requirements.txt' local-tests: - name: test spark local against python ${{ matrix.python-version }} + name: ${{ matrix.test }} / python ${{ matrix.python-version }} / ubuntu-latest # run if not a PR from a forked repository or has a label to mark as safe to test # also checks that the matrix generated is not empty @@ -96,7 +95,6 @@ jobs: github.event.pull_request.head.repo.full_name == github.repository || contains(github.event.pull_request.labels.*.name, 'ok to test') ) - runs-on: ubuntu-latest strategy: @@ -109,7 +107,8 @@ jobs: - "spark-session" env: - PYTEST_ADDOPTS: "-v --color=yes --csv test_results.csv" + TOXENV: integration-${{ matrix.test }} + PYTEST_ADDOPTS: "-v --color=yes --csv integration_results.csv" DBT_INVOCATION_ENV: github-actions DD_CIVISIBILITY_AGENTLESS_ENABLED: true DD_API_KEY: ${{ secrets.DATADOG_API_KEY }} @@ -124,7 +123,7 @@ jobs: with: persist-credentials: false - # explicity checkout the branch for the PR, + # explicitly checkout the branch for the PR, # this is necessary for the `pull_request` event - name: Check out the repository (PR) if: github.event_name == 'pull_request_target' @@ -156,11 +155,12 @@ jobs: with: compose-file: "./docker-compose.yml" - - name: Run tox for Spark ${{ matrix.test }} - run: tox -e integration-${{ matrix.test }} + - name: Run tox for ${{ matrix.test }} + run: tox -- --ddtrace databricks-tests: - name: run ${{ matrix.test }} against python ${{ matrix.python-version }} + name: ${{ matrix.test }} / python ${{ matrix.python-version }} / ubuntu-latest + # run if not a PR from a forked repository or has a label to mark as safe to test # also checks that the matrix generated is not empty if: >- @@ -182,6 +182,7 @@ jobs: - "spark-databricks-http" env: + TOXENV: integration-${{ matrix.test }} PYTEST_ADDOPTS: "-v --color=yes --csv test_results.csv" DBT_INVOCATION_ENV: github-actions DD_CIVISIBILITY_AGENTLESS_ENABLED: true @@ -205,7 +206,7 @@ jobs: with: persist-credentials: false - # explicity checkout the branch for the PR, + # explicitly checkout the branch for the PR, # this is necessary for the `pull_request_target` event - name: Check out the repository (PR) if: github.event_name == 'pull_request_target' @@ -237,5 +238,5 @@ jobs: run: | ./.github/scripts/configure_odbc.sh - - name: Run tox for Spark ${{ matrix.test }} - run: tox -e integration-${{ matrix.test }} + - name: Run tox for ${{ matrix.test }} + run: tox -- --ddtrace From 72daf90d0a5a20534e2b9c5b97f79cb50ca7742c Mon Sep 17 00:00:00 2001 From: Mike Alfare Date: Wed, 1 Nov 2023 21:08:36 -0400 Subject: [PATCH 059/102] revert name change --- .github/workflows/integration.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 6bdee8c32..37449d892 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -8,7 +8,7 @@ # This will run for all PRs, when code is pushed to a release # branch, and when manually triggered. 
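A note on the `TOXENV: integration-${{ matrix.test }}` / `tox -- --ddtrace` pattern the workflow adopts above: tox falls back to the `TOXENV` environment variable when no `-e` flag is given, and anything after `--` is forwarded into `{posargs}` of the selected environment's `commands`. A rough local equivalent of one matrix leg (using `integration-spark-thrift` from the existing tox config purely as an example) would be:

```sh
# env selection via TOXENV instead of -e; --ddtrace lands in {posargs}
TOXENV=integration-spark-thrift tox -- --ddtrace

# approximately the same invocation spelled out explicitly
tox -e integration-spark-thrift -- --ddtrace
```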
-name: Adapter Integration Tests +name: Integration tests on: push: From b43c9d1a2e7a97ed1c59e28a74e36769de69616c Mon Sep 17 00:00:00 2001 From: Mike Alfare Date: Wed, 1 Nov 2023 21:11:00 -0400 Subject: [PATCH 060/102] revert name change --- .github/workflows/integration.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 37449d892..6bdee8c32 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -8,7 +8,7 @@ # This will run for all PRs, when code is pushed to a release # branch, and when manually triggered. -name: Integration tests +name: Adapter Integration Tests on: push: From 91715d23a01f0a1039d961b2c24790c8f1ded30e Mon Sep 17 00:00:00 2001 From: Mike Alfare Date: Wed, 1 Nov 2023 21:23:31 -0400 Subject: [PATCH 061/102] combine databricks and spark tests --- .github/workflows/integration.yml | 86 ++++--------------------------- 1 file changed, 10 insertions(+), 76 deletions(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 6bdee8c32..1e60aee1b 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -84,7 +84,7 @@ jobs: - 'tests/**' - 'dev-requirements.txt' - local-tests: + test: name: ${{ matrix.test }} / python ${{ matrix.python-version }} / ubuntu-latest # run if not a PR from a forked repository or has a label to mark as safe to test @@ -105,85 +105,13 @@ jobs: test: - "spark-thrift" - "spark-session" - - env: - TOXENV: integration-${{ matrix.test }} - PYTEST_ADDOPTS: "-v --color=yes --csv integration_results.csv" - DBT_INVOCATION_ENV: github-actions - DD_CIVISIBILITY_AGENTLESS_ENABLED: true - DD_API_KEY: ${{ secrets.DATADOG_API_KEY }} - DD_SITE: datadoghq.com - DD_ENV: ci - DD_SERVICE: ${{ github.event.repository.name }} - - steps: - - name: Check out the repository - if: github.event_name != 'pull_request_target' - uses: actions/checkout@v3 - with: - persist-credentials: false - - # explicitly checkout the branch for the PR, - # this is necessary for the `pull_request` event - - name: Check out the repository (PR) - if: github.event_name == 'pull_request_target' - uses: actions/checkout@v3 - with: - persist-credentials: false - ref: ${{ github.event.pull_request.head.sha }} - - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - - - name: Install python dependencies - run: | - python -m pip install --user --upgrade pip - python -m pip install tox - python -m pip --version - tox --version - - - name: Update dev_requirements.txt - if: inputs.dbt-core-branch != '' - run: | - pip install bumpversion - ./.github/scripts/update_dbt_core_branch.sh ${{ inputs.dbt-core-branch }} - - - uses: isbang/compose-action@v1.5.1 - if: ${{ matrix.test == 'spark-thrift'}} - with: - compose-file: "./docker-compose.yml" - - - name: Run tox for ${{ matrix.test }} - run: tox -- --ddtrace - - databricks-tests: - name: ${{ matrix.test }} / python ${{ matrix.python-version }} / ubuntu-latest - - # run if not a PR from a forked repository or has a label to mark as safe to test - # also checks that the matrix generated is not empty - if: >- - ( - github.event_name != 'pull_request_target' || - github.event.pull_request.head.repo.full_name == github.repository || - contains(github.event.pull_request.labels.*.name, 'ok to test') - ) - - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - python-version: - - 
"3.8" - test: - "spark-databricks-odbc-sql-endpoint" - "spark-databricks-odbc-cluster" - "spark-databricks-http" env: TOXENV: integration-${{ matrix.test }} - PYTEST_ADDOPTS: "-v --color=yes --csv test_results.csv" + PYTEST_ADDOPTS: "-v --color=yes --csv integration_results.csv" DBT_INVOCATION_ENV: github-actions DD_CIVISIBILITY_AGENTLESS_ENABLED: true DD_API_KEY: ${{ secrets.DATADOG_API_KEY }} @@ -199,6 +127,7 @@ jobs: DBT_TEST_USER_2: "buildbot+dbt_test_user_2@dbtlabs.com" DBT_TEST_USER_3: "buildbot+dbt_test_user_3@dbtlabs.com" ODBC_DRIVER: "Simba" + steps: - name: Check out the repository if: github.event_name != 'pull_request_target' @@ -207,7 +136,7 @@ jobs: persist-credentials: false # explicitly checkout the branch for the PR, - # this is necessary for the `pull_request_target` event + # this is necessary for the `pull_request` event - name: Check out the repository (PR) if: github.event_name == 'pull_request_target' uses: actions/checkout@v3 @@ -233,8 +162,13 @@ jobs: pip install bumpversion ./.github/scripts/update_dbt_core_branch.sh ${{ inputs.dbt-core-branch }} + - uses: isbang/compose-action@v1.5.1 + if: ${{ matrix.test == 'spark-thrift'}} + with: + compose-file: "./docker-compose.yml" + - name: Configure ODBC - if: ${{ matrix.test != 'spark-databricks-http' }} + if: ${{ matrix.test == 'spark-databricks-odbc-sql-endpoint' || matrix.test == 'spark-databricks-odbc-cluster' }} run: | ./.github/scripts/configure_odbc.sh From 943a8dc3030a4fbff9a1f401133a1ef382bb538a Mon Sep 17 00:00:00 2001 From: Mike Alfare Date: Wed, 1 Nov 2023 21:26:27 -0400 Subject: [PATCH 062/102] combine databricks and spark tests --- .github/workflows/integration.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 1e60aee1b..1389550a2 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -17,10 +17,6 @@ on: - "*.latest" - "releases/*" pull_request_target: - types: - - opened - - synchronize - - labeled workflow_dispatch: inputs: dbt-core-branch: From 3d0decefb6a2a453c6a806cc467a2763f02a9ade Mon Sep 17 00:00:00 2001 From: Colin Date: Thu, 30 Nov 2023 15:14:17 -0800 Subject: [PATCH 063/102] Add dagger --- .github/scripts/configure_odbc.sh | 23 ------------------- dagger/configure_odbc.sh | 20 ++++++++++++++++ dagger/run_dbt_spark_tests.py | 38 +++++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+), 23 deletions(-) delete mode 100644 .github/scripts/configure_odbc.sh create mode 100755 dagger/configure_odbc.sh create mode 100644 dagger/run_dbt_spark_tests.py diff --git a/.github/scripts/configure_odbc.sh b/.github/scripts/configure_odbc.sh deleted file mode 100644 index e2bad8886..000000000 --- a/.github/scripts/configure_odbc.sh +++ /dev/null @@ -1,23 +0,0 @@ -set -e -sudo apt-get update && sudo apt-get install -y --no-install-recommends \ - g++ \ - unixodbc-dev \ - libsasl2-modules-gssapi-mit \ - unzip - -sudo apt-get install -y curl -rm -rf /tmp && mkdir /tmp - -curl -OL "https://databricks.com/wp-content/uploads/drivers-2020/SimbaSparkODBC-2.6.16.1019-Debian-64bit.zip" - -unzip SimbaSparkODBC-2.6.16.1019-Debian-64bit.zip -d /tmp/ -sudo dpkg -i /tmp/SimbaSparkODBC-*/*.deb -echo "--------------------------------------------" -sudo sh -c echo "[Simba]\nDriver = /opt/simba/spark/lib/64/libsparkodbc_sb64.so" >> /etc/odbcinst.ini - -rm -rf /tmp -sudo dpkg -l | grep Simba # confirm that the driver is installed - -sudo ldd /opt/simba/spark/lib/64/libsparkodbc_sb64.so -echo 
"--------------------------------------------" -odbcinst -j diff --git a/dagger/configure_odbc.sh b/dagger/configure_odbc.sh new file mode 100755 index 000000000..7126298c0 --- /dev/null +++ b/dagger/configure_odbc.sh @@ -0,0 +1,20 @@ +#!/bin/bash +set -e +apt update && apt install -y --no-install-recommends \ + g++ \ + git \ + curl \ + unixodbc-dev \ + libsasl2-modules-gssapi-mit \ + unzip + +rm -rf /tmp && mkdir /tmp + +curl -OL "https://databricks.com/wp-content/uploads/drivers-2020/SimbaSparkODBC-2.6.16.1019-Debian-64bit.zip" + +unzip SimbaSparkODBC-2.6.16.1019-Debian-64bit.zip -d /tmp/ +dpkg -i /tmp/*/simbaspark_2.6.16.1019-2_amd64.deb +echo "--------------------------------------------" +echo sh -c echo "[Simba]\nDriver = /opt/simba/spark/lib/64/libsparkodbc_sb64.so" >> /etc/odbcinst.ini +dpkg -l | grep Simba # confirm that the driver is installed +rm -rf /tmp diff --git a/dagger/run_dbt_spark_tests.py b/dagger/run_dbt_spark_tests.py new file mode 100644 index 000000000..85a3b8246 --- /dev/null +++ b/dagger/run_dbt_spark_tests.py @@ -0,0 +1,38 @@ +import argparse +import sys + +import anyio as anyio +import dagger as dagger + + +async def test_spark(test_args): + async with dagger.Connection(dagger.Config(log_output=sys.stderr)) as client: + install_dir = client.host().directory("./", exclude=["\\.pytest_cache/*", ".idea/*"]) + platform = dagger.Platform("linux/amd64") + tst_container = ( + client.container() + .from_("python:3.8-slim") + .with_directory("/dbt_spark", install_dir) + .with_workdir("/dbt_spark") + .with_exec("./dagger/configure_odbc.sh") + .with_exec(["pip", "install", "-r", "requirements.txt"]) + .with_exec(["pip", "install", "-r", "dev-requirements.txt"]) + ) + + result = await (tst_container + .with_workdir("/dbt_spark") + .with_exec(["python", '-m', 'pytest', '-v', + '--profile', test_args.profile, + '-n', 'auto', + 'tests/functional/'] + ) + ).stdout() + + return result + + +parser = argparse.ArgumentParser() +parser.add_argument("--profile", required=True, type=str) +args = parser.parse_args() + +anyio.run(test_spark, args) From 080b816731708bc2bdae8f648588799b358b939c Mon Sep 17 00:00:00 2001 From: Colin Date: Thu, 30 Nov 2023 15:15:07 -0800 Subject: [PATCH 064/102] remove platform --- dagger/run_dbt_spark_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dagger/run_dbt_spark_tests.py b/dagger/run_dbt_spark_tests.py index 85a3b8246..a2125a310 100644 --- a/dagger/run_dbt_spark_tests.py +++ b/dagger/run_dbt_spark_tests.py @@ -8,7 +8,7 @@ async def test_spark(test_args): async with dagger.Connection(dagger.Config(log_output=sys.stderr)) as client: install_dir = client.host().directory("./", exclude=["\\.pytest_cache/*", ".idea/*"]) - platform = dagger.Platform("linux/amd64") + tst_container = ( client.container() .from_("python:3.8-slim") From c8477ced3779879a40db2beca2135de38d9c3a87 Mon Sep 17 00:00:00 2001 From: Colin Date: Mon, 8 Jan 2024 11:14:35 -0800 Subject: [PATCH 065/102] add dagger setup --- .github/workflows/integration.yml | 30 +++---- .gitignore | 2 + dagger/run_dbt_spark_tests.py | 105 +++++++++++++++++++++---- dagger/{ => scripts}/configure_odbc.sh | 16 +--- dev-requirements.txt | 4 +- tests/conftest.py | 2 +- 6 files changed, 109 insertions(+), 50 deletions(-) rename dagger/{ => scripts}/configure_odbc.sh (51%) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 1389550a2..88a73884f 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -96,14 +96,12 @@ 
jobs: strategy: fail-fast: false matrix: - python-version: - - "3.8" test: - - "spark-thrift" - - "spark-session" - - "spark-databricks-odbc-sql-endpoint" - - "spark-databricks-odbc-cluster" - - "spark-databricks-http" + - "apache_spark" + - "spark_session" + - "databricks_sql_endpoint" + - "databricks_cluster" + - "databricks_http_cluster" env: TOXENV: integration-${{ matrix.test }} @@ -143,14 +141,13 @@ jobs: - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 with: - python-version: ${{ matrix.python-version }} + python-version: "3.11" - name: Install python dependencies run: | python -m pip install --user --upgrade pip - python -m pip install tox python -m pip --version - tox --version + python -m pip install dagger-io~=0.8.0 - name: Update dev_requirements.txt if: inputs.dbt-core-branch != '' @@ -158,15 +155,6 @@ jobs: pip install bumpversion ./.github/scripts/update_dbt_core_branch.sh ${{ inputs.dbt-core-branch }} - - uses: isbang/compose-action@v1.5.1 - if: ${{ matrix.test == 'spark-thrift'}} - with: - compose-file: "./docker-compose.yml" - - - name: Configure ODBC - if: ${{ matrix.test == 'spark-databricks-odbc-sql-endpoint' || matrix.test == 'spark-databricks-odbc-cluster' }} - run: | - ./.github/scripts/configure_odbc.sh - - name: Run tox for ${{ matrix.test }} - run: tox -- --ddtrace + - name: Run tests for ${{ matrix.test }} + run: python dagger/run_dbt_spark_tests.py --profile ${{ matrix.test }} diff --git a/.gitignore b/.gitignore index 33a83848c..1e8ff7411 100644 --- a/.gitignore +++ b/.gitignore @@ -44,3 +44,5 @@ test.env .hive-metastore/ .spark-warehouse/ dbt-integration-tests +/.tool-versions +/.hypothesis/* diff --git a/dagger/run_dbt_spark_tests.py b/dagger/run_dbt_spark_tests.py index a2125a310..a5be95dd4 100644 --- a/dagger/run_dbt_spark_tests.py +++ b/dagger/run_dbt_spark_tests.py @@ -4,29 +4,106 @@ import anyio as anyio import dagger as dagger +PG_PORT = 5432 + + +async def get_postgres_container(client: dagger.Client) -> (dagger.Container, str): + ctr = await ( + client.container() + .from_("postgres:13") + .with_env_variable("POSTGRES_PASSWORD", "postgres") + .with_exposed_port(PG_PORT) + ) + + return ctr, "postgres_db" + + +async def get_spark_container(client: dagger.Client) -> (dagger.Container, str): + spark_dir = client.host().directory("./dagger/spark-container") + spark_ctr = ( + client.container() + .from_("eclipse-temurin:8-jre") + .with_directory("/spark_setup", spark_dir) + .with_env_variable("SPARK_HOME", "/usr/spark") + .with_env_variable("PATH", "/usr/spark/bin:/usr/spark/sbin:$PATH", expand=True) + .with_file( + "/scripts/entrypoint.sh", + client.host().file("./dagger/spark-container/entrypoint.sh"), + permissions=755, + ) + .with_file( + "/scripts/install_spark.sh", + client.host().file("./dagger/spark-container/install_spark.sh"), + permissions=755, + ) + .with_exec(["./spark_setup/install_spark.sh"]) + .with_file("/usr/spark/conf/hive-site.xml", spark_dir.file("/hive-site.xml")) + .with_file("/usr/spark/conf/spark-defaults.conf", spark_dir.file("spark-defaults.conf")) + ) + + # postgres is the metastore here + pg_ctr, pg_host = await get_postgres_container(client) + + spark_ctr = ( + spark_ctr.with_service_binding(alias=pg_host, service=pg_ctr) + .with_exec( + [ + "/scripts/entrypoint.sh", + "--class", + "org.apache.spark.sql.hive.thriftserver.HiveThriftServer2", + "--name", + "Thrift JDBC/ODBC Server", + ] + ) + .with_exposed_port(10000) + ) + + return spark_ctr, "spark_db" + async def test_spark(test_args): async 
with dagger.Connection(dagger.Config(log_output=sys.stderr)) as client: - install_dir = client.host().directory("./", exclude=["\\.pytest_cache/*", ".idea/*"]) - + req_files = client.host().directory("./", include=["*.txt", "*.env", "*.ini"]) + dbt_spark_dir = client.host().directory("./dbt") + test_dir = client.host().directory("./tests") + scripts = client.host().directory("./dagger/scripts") + platform = dagger.Platform("linux/amd64") tst_container = ( - client.container() + client.container(platform=platform) .from_("python:3.8-slim") - .with_directory("/dbt_spark", install_dir) - .with_workdir("/dbt_spark") - .with_exec("./dagger/configure_odbc.sh") + .with_directory("/.", req_files) + .with_directory("/dbt", dbt_spark_dir) + .with_directory("/tests", test_dir) + .with_directory("/scripts", scripts) + .with_exec("./scripts/install_os_reqs.sh") .with_exec(["pip", "install", "-r", "requirements.txt"]) .with_exec(["pip", "install", "-r", "dev-requirements.txt"]) ) - result = await (tst_container - .with_workdir("/dbt_spark") - .with_exec(["python", '-m', 'pytest', '-v', - '--profile', test_args.profile, - '-n', 'auto', - 'tests/functional/'] - ) - ).stdout() + if test_args.profile == "apache_spark": + spark_ctr, spark_host = await get_spark_container(client) + tst_container = tst_container.with_service_binding(alias=spark_host, service=spark_ctr) + + elif test_args.profile in ["databricks_cluster", "databricks_sql_endpoint"]: + tst_container = tst_container.with_exec("./scripts/configure_odbc.sh") + + elif test_args.profile == "spark_session": + tst_container = tst_container.with_exec(["pip", "install", "pyspark"]) + tst_container = tst_container.with_exec(["apt-get", "install", "openjdk-17-jre", "-y"]) + + result = await tst_container.with_exec( + [ + "python", + "-m", + "pytest", + "-v", + "--profile", + test_args.profile, + "-n", + "auto", + "tests/functional/", + ] + ).stdout() return result diff --git a/dagger/configure_odbc.sh b/dagger/scripts/configure_odbc.sh similarity index 51% rename from dagger/configure_odbc.sh rename to dagger/scripts/configure_odbc.sh index 7126298c0..50e80914d 100755 --- a/dagger/configure_odbc.sh +++ b/dagger/scripts/configure_odbc.sh @@ -1,20 +1,12 @@ #!/bin/bash -set -e -apt update && apt install -y --no-install-recommends \ - g++ \ - git \ - curl \ - unixodbc-dev \ - libsasl2-modules-gssapi-mit \ - unzip - +set -eo rm -rf /tmp && mkdir /tmp curl -OL "https://databricks.com/wp-content/uploads/drivers-2020/SimbaSparkODBC-2.6.16.1019-Debian-64bit.zip" - unzip SimbaSparkODBC-2.6.16.1019-Debian-64bit.zip -d /tmp/ -dpkg -i /tmp/*/simbaspark_2.6.16.1019-2_amd64.deb +dpkg -i /tmp/SimbaSparkODBC-2.6.16.1019-Debian-64bit/simbaspark_2.6.16.1019-2_amd64.deb echo "--------------------------------------------" -echo sh -c echo "[Simba]\nDriver = /opt/simba/spark/lib/64/libsparkodbc_sb64.so" >> /etc/odbcinst.ini +echo "[Simba]\nDriver = /opt/simba/spark/lib/64/libsparkodbc_sb64.so" >> /etc/odbcinst.ini dpkg -l | grep Simba # confirm that the driver is installed +export ODBC_DRIVER="/opt/simba/spark/lib/64/libsparkodbc_sb64.so" rm -rf /tmp diff --git a/dev-requirements.txt b/dev-requirements.txt index 8f94d509d..89c55d3f9 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,7 +1,7 @@ # install latest changes in dbt-core # TODO: how to automate switching from develop to version branches? 
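To make the dagger pipeline in `test_spark` above easier to follow, here is a hand-written sketch of roughly what the `with_exec` chain amounts to inside the `python:3.8-slim` container. It is an approximation for orientation only, not a script this patch adds; `apache_spark` stands in for whichever `--profile` was passed:

```sh
# approximate shell equivalent of the container build above
./scripts/install_os_reqs.sh          # g++, git, curl, unixODBC, SASL modules, unzip
pip install -r requirements.txt
pip install -r dev-requirements.txt
# profile-specific setup follows (Thrift service binding, ODBC driver, or pyspark)
python -m pytest -v --profile apache_spark -n auto tests/functional/
```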
-git+https://github.com/dbt-labs/dbt-core.git#egg=dbt-core&subdirectory=core -git+https://github.com/dbt-labs/dbt-core.git#egg=dbt-tests-adapter&subdirectory=tests/adapter +git+https://github.com/dbt-labs/dbt-core.git@c2bc2f009bbeeb46b3c69d082ab4d485597898af#egg=dbt-core&subdirectory=core +git+https://github.com/dbt-labs/dbt-core.git@c2bc2f009bbeeb46b3c69d082ab4d485597898af#egg=dbt-tests-adapter&subdirectory=tests/adapter # if version 1.x or greater -> pin to major version # if version 0.x -> pin to minor diff --git a/tests/conftest.py b/tests/conftest.py index 94969e406..700ade4d3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -38,7 +38,7 @@ def dbt_profile_target(request): def apache_spark_target(): return { "type": "spark", - "host": "localhost", + "host": "spark_db", "user": "dbt", "method": "thrift", "port": 10000, From c0a37aeff43c549131299ea4b5a487baf06634ae Mon Sep 17 00:00:00 2001 From: Colin Date: Mon, 8 Jan 2024 11:15:00 -0800 Subject: [PATCH 066/102] add dagger setup --- dagger/scripts/install_os_reqs.sh | 10 +++++ dagger/spark-container/entrypoint.sh | 15 +++++++ dagger/spark-container/hive-site.xml | 46 ++++++++++++++++++++++ dagger/spark-container/install_spark.sh | 15 +++++++ dagger/spark-container/spark-defaults.conf | 9 +++++ 5 files changed, 95 insertions(+) create mode 100755 dagger/scripts/install_os_reqs.sh create mode 100644 dagger/spark-container/entrypoint.sh create mode 100644 dagger/spark-container/hive-site.xml create mode 100755 dagger/spark-container/install_spark.sh create mode 100644 dagger/spark-container/spark-defaults.conf diff --git a/dagger/scripts/install_os_reqs.sh b/dagger/scripts/install_os_reqs.sh new file mode 100755 index 000000000..47457b8d6 --- /dev/null +++ b/dagger/scripts/install_os_reqs.sh @@ -0,0 +1,10 @@ +#!/bin/bash +set -eo +apt-get update && apt-get install -y --no-install-recommends \ + g++ \ + git \ + curl \ + unixodbc \ + unixodbc-dev \ + libsasl2-modules-gssapi-mit \ + unzip \ No newline at end of file diff --git a/dagger/spark-container/entrypoint.sh b/dagger/spark-container/entrypoint.sh new file mode 100644 index 000000000..4b15cab61 --- /dev/null +++ b/dagger/spark-container/entrypoint.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +if [ -n "$WAIT_FOR" ]; then + IFS=';' read -a HOSTPORT_ARRAY <<< "$WAIT_FOR" + for HOSTPORT in "${HOSTPORT_ARRAY[@]}" + do + WAIT_FOR_HOST=${HOSTPORT%:*} + WAIT_FOR_PORT=${HOSTPORT#*:} + + echo Waiting for $WAIT_FOR_HOST to listen on $WAIT_FOR_PORT... + while ! 
nc -z $WAIT_FOR_HOST $WAIT_FOR_PORT; do echo sleeping; sleep 2; done + done +fi +echo "$PATH" +exec spark-submit "$@" diff --git a/dagger/spark-container/hive-site.xml b/dagger/spark-container/hive-site.xml new file mode 100644 index 000000000..93e966fb7 --- /dev/null +++ b/dagger/spark-container/hive-site.xml @@ -0,0 +1,46 @@ + + + + + + + + javax.jdo.option.ConnectionURL + jdbc:postgresql://postgres_db/postgres + + + + javax.jdo.option.ConnectionDriverName + org.postgresql.Driver + + + + javax.jdo.option.ConnectionUserName + postgres + + + + javax.jdo.option.ConnectionPassword + postgres + + + + hive.metastore.schema.verification + false + + diff --git a/dagger/spark-container/install_spark.sh b/dagger/spark-container/install_spark.sh new file mode 100755 index 000000000..476f362a9 --- /dev/null +++ b/dagger/spark-container/install_spark.sh @@ -0,0 +1,15 @@ +set -e + +SPARK_VERSION=3.1.3 +HADOOP_VERSION=3.2 + +apt-get update && \ +apt-get install -y wget netcat procps libpostgresql-jdbc-java && \ +wget -q "https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" && \ +tar xzf "spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" && \ +rm "spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" && \ +mv "spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" /usr/spark && \ +ln -s /usr/share/java/postgresql-jdbc4.jar /usr/spark/jars/postgresql-jdbc4.jar && \ +apt-get remove -y wget && \ +apt-get autoremove -y && \ +apt-get clean diff --git a/dagger/spark-container/spark-defaults.conf b/dagger/spark-container/spark-defaults.conf new file mode 100644 index 000000000..30ec59591 --- /dev/null +++ b/dagger/spark-container/spark-defaults.conf @@ -0,0 +1,9 @@ +spark.driver.memory 2g +spark.executor.memory 2g +spark.hadoop.datanucleus.autoCreateTables true +spark.hadoop.datanucleus.schema.autoCreateTables true +spark.hadoop.datanucleus.fixedDatastore false +spark.serializer org.apache.spark.serializer.KryoSerializer +spark.jars.packages org.apache.hudi:hudi-spark3-bundle_2.12:0.10.0 +spark.sql.extensions org.apache.spark.sql.hudi.HoodieSparkSessionExtension +spark.driver.userClassPathFirst true From 8c6a7455a411d8573005ff555491ef438c0aea3d Mon Sep 17 00:00:00 2001 From: Colin Date: Mon, 8 Jan 2024 11:39:10 -0800 Subject: [PATCH 067/102] set env vars --- dagger/run_dbt_spark_tests.py | 38 ++++++++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/dagger/run_dbt_spark_tests.py b/dagger/run_dbt_spark_tests.py index a5be95dd4..ca7cffd3b 100644 --- a/dagger/run_dbt_spark_tests.py +++ b/dagger/run_dbt_spark_tests.py @@ -1,10 +1,38 @@ +import os + import argparse import sys import anyio as anyio import dagger as dagger +from dotenv import find_dotenv, load_dotenv PG_PORT = 5432 +load_dotenv(find_dotenv("test.env")) +DEFAULT_ENV_VARS = { +"DBT_TEST_USER_1": "buildbot+dbt_test_user_1@dbtlabs.com", +"DBT_TEST_USER_2":"buildbot+dbt_test_user_2@dbtlabs.com", +"DBT_TEST_USER_3": "buildbot+dbt_test_user_3@dbtlabs.com", +} + +def env_variables(envs: dict[str, str]): + def env_variables_inner(ctr: dagger.Container): + for key, value in envs.items(): + ctr = ctr.with_env_variable(key, value) + return ctr + + return env_variables_inner + + +def get_databricks_env_vars(): + + return { + "DBT_DATABRICKS_TOKEN": os.environ["DBT_DATABRICKS_TOKEN"], + "DBT_DATABRICKS_HOST_NAME": os.environ["DBT_DATABRICKS_HOST_NAME"], + "DBT_DATABRICKS_ENDPOINT": os.environ["DBT_DATABRICKS_ENDPOINT"], + "DBT_DATABRICKS_CLUSTER_NAME": 
os.environ["DBT_DATABRICKS_CLUSTER_NAME"], + "ODBC_DRIVER": "/opt/simba/spark/lib/64/libsparkodbc_sb64.so", + } async def get_postgres_container(client: dagger.Client) -> (dagger.Container, str): @@ -63,6 +91,7 @@ async def get_spark_container(client: dagger.Client) -> (dagger.Container, str): async def test_spark(test_args): async with dagger.Connection(dagger.Config(log_output=sys.stderr)) as client: + test_profile = test_args.profile req_files = client.host().directory("./", include=["*.txt", "*.env", "*.ini"]) dbt_spark_dir = client.host().directory("./dbt") test_dir = client.host().directory("./tests") @@ -80,17 +109,20 @@ async def test_spark(test_args): .with_exec(["pip", "install", "-r", "dev-requirements.txt"]) ) - if test_args.profile == "apache_spark": + if test_profile == "apache_spark": spark_ctr, spark_host = await get_spark_container(client) tst_container = tst_container.with_service_binding(alias=spark_host, service=spark_ctr) - elif test_args.profile in ["databricks_cluster", "databricks_sql_endpoint"]: + elif test_profile in ["databricks_cluster", "databricks_sql_endpoint"]: tst_container = tst_container.with_exec("./scripts/configure_odbc.sh") - elif test_args.profile == "spark_session": + elif test_profile == "spark_session": tst_container = tst_container.with_exec(["pip", "install", "pyspark"]) tst_container = tst_container.with_exec(["apt-get", "install", "openjdk-17-jre", "-y"]) + if "databricks" in test_profile: + tst_container = tst_container.with_(env_variables(get_databricks_env_vars())) + tst_container = tst_container.with_(env_variables(DEFAULT_ENV_VARS)) result = await tst_container.with_exec( [ "python", From 1ae321a264a1ebefa76ce1cb777ed2c9732bedc6 Mon Sep 17 00:00:00 2001 From: Colin Date: Mon, 8 Jan 2024 11:41:10 -0800 Subject: [PATCH 068/102] install requirements --- .github/workflows/integration.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 88a73884f..67b6ed8e3 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -147,7 +147,7 @@ jobs: run: | python -m pip install --user --upgrade pip python -m pip --version - python -m pip install dagger-io~=0.8.0 + python -m pip install -r dagger/requirements.txt - name: Update dev_requirements.txt if: inputs.dbt-core-branch != '' From 6361429e44b7e8bb0182a629850ca2db922e0ab6 Mon Sep 17 00:00:00 2001 From: Colin Date: Mon, 8 Jan 2024 11:41:18 -0800 Subject: [PATCH 069/102] install requirements --- dagger/requirements.txt | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 dagger/requirements.txt diff --git a/dagger/requirements.txt b/dagger/requirements.txt new file mode 100644 index 000000000..3634ceeb7 --- /dev/null +++ b/dagger/requirements.txt @@ -0,0 +1,2 @@ +dagger-io~=0.8.0 +python-dotenv \ No newline at end of file From 6bca5dc715f7b142bc35c6e64c8bef7a89edbdee Mon Sep 17 00:00:00 2001 From: Colin Date: Mon, 8 Jan 2024 14:51:42 -0800 Subject: [PATCH 070/102] add DEFAULT_ENV_VARS and test_path arg --- dagger/run_dbt_spark_tests.py | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/dagger/run_dbt_spark_tests.py b/dagger/run_dbt_spark_tests.py index ca7cffd3b..864d9cad6 100644 --- a/dagger/run_dbt_spark_tests.py +++ b/dagger/run_dbt_spark_tests.py @@ -10,11 +10,12 @@ PG_PORT = 5432 load_dotenv(find_dotenv("test.env")) DEFAULT_ENV_VARS = { -"DBT_TEST_USER_1": "buildbot+dbt_test_user_1@dbtlabs.com", 
-"DBT_TEST_USER_2":"buildbot+dbt_test_user_2@dbtlabs.com", -"DBT_TEST_USER_3": "buildbot+dbt_test_user_3@dbtlabs.com", + "DBT_TEST_USER_1": os.getenv("DBT_TEST_USER_1", "buildbot+dbt_test_user_1@dbtlabs.com"), + "DBT_TEST_USER_2": os.getenv("DBT_TEST_USER_2","buildbot+dbt_test_user_2@dbtlabs.com"), + "DBT_TEST_USER_3": os.getenv("DBT_TEST_USER_3", "buildbot+dbt_test_user_3@dbtlabs.com"), } + def env_variables(envs: dict[str, str]): def env_variables_inner(ctr: dagger.Container): for key, value in envs.items(): @@ -25,7 +26,6 @@ def env_variables_inner(ctr: dagger.Container): def get_databricks_env_vars(): - return { "DBT_DATABRICKS_TOKEN": os.environ["DBT_DATABRICKS_TOKEN"], "DBT_DATABRICKS_HOST_NAME": os.environ["DBT_DATABRICKS_HOST_NAME"], @@ -123,18 +123,14 @@ async def test_spark(test_args): if "databricks" in test_profile: tst_container = tst_container.with_(env_variables(get_databricks_env_vars())) tst_container = tst_container.with_(env_variables(DEFAULT_ENV_VARS)) + test_path = test_args.test_path if test_args.test_path else "tests/functional/adapter" result = await tst_container.with_exec( - [ - "python", - "-m", - "pytest", - "-v", - "--profile", - test_args.profile, - "-n", - "auto", - "tests/functional/", - ] + ["python", "-m", "pytest", + "-v", + "--profile", test_args.profile, + "-n", "auto", + test_path, + ] ).stdout() return result @@ -142,6 +138,7 @@ async def test_spark(test_args): parser = argparse.ArgumentParser() parser.add_argument("--profile", required=True, type=str) +parser.add_argument("--test-path", required=False, type=str) args = parser.parse_args() anyio.run(test_spark, args) From f4293e0999276393d7ce4e288dbd87c58d3adc32 Mon Sep 17 00:00:00 2001 From: Colin Date: Mon, 8 Jan 2024 15:00:54 -0800 Subject: [PATCH 071/102] remove circle ci --- .circleci/config.yml | 136 ------------------------------------------- README.md | 3 - 2 files changed, 139 deletions(-) delete mode 100644 .circleci/config.yml diff --git a/.circleci/config.yml b/.circleci/config.yml deleted file mode 100644 index f2a3b6357..000000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,136 +0,0 @@ -version: 2.1 - -jobs: - unit: - environment: - DBT_INVOCATION_ENV: circle - docker: - - image: fishtownanalytics/test-container:10 - steps: - - checkout - - run: tox -e flake8,unit - -# Turning off for now due to flaky runs of tests will turn back on at later date. 
- integration-spark-session: - environment: - DBT_INVOCATION_ENV: circle - docker: - - image: godatadriven/pyspark:3.1 - steps: - - checkout - - run: apt-get update - - run: conda install python=3.10 - - run: python3 -m pip install --upgrade pip - - run: apt-get install -y git gcc g++ unixodbc-dev libsasl2-dev libxml2-dev libxslt-dev - - run: python3 -m pip install tox - - run: - name: Run integration tests - command: tox -e integration-spark-session - no_output_timeout: 1h - - store_artifacts: - path: ./logs - - integration-spark-thrift: - environment: - DBT_INVOCATION_ENV: circle - docker: - - image: fishtownanalytics/test-container:10 - - image: godatadriven/spark:3.1.1 - environment: - WAIT_FOR: localhost:5432 - command: > - --class org.apache.spark.sql.hive.thriftserver.HiveThriftServer2 - --name Thrift JDBC/ODBC Server - - image: postgres:9.6.17-alpine - environment: - POSTGRES_USER: dbt - POSTGRES_PASSWORD: dbt - POSTGRES_DB: metastore - - steps: - - checkout - - - run: - name: Wait for Spark-Thrift - command: dockerize -wait tcp://localhost:10000 -timeout 15m -wait-retry-interval 5s - - - run: - name: Run integration tests - command: tox -e integration-spark-thrift - no_output_timeout: 1h - - store_artifacts: - path: ./logs - - integration-spark-databricks-http: - environment: - DBT_INVOCATION_ENV: circle - DBT_DATABRICKS_RETRY_ALL: True - DBT_TEST_USER_1: "buildbot+dbt_test_user_1@dbtlabs.com" - DBT_TEST_USER_2: "buildbot+dbt_test_user_2@dbtlabs.com" - DBT_TEST_USER_3: "buildbot+dbt_test_user_3@dbtlabs.com" - docker: - - image: fishtownanalytics/test-container:10 - steps: - - checkout - - run: - name: Run integration tests - command: tox -e integration-spark-databricks-http - no_output_timeout: 1h - - store_artifacts: - path: ./logs - - integration-spark-databricks-odbc-cluster: &databricks-odbc - environment: - DBT_INVOCATION_ENV: circle - ODBC_DRIVER: Simba # TODO: move env var to Docker image - DBT_TEST_USER_1: "buildbot+dbt_test_user_1@dbtlabs.com" - DBT_TEST_USER_2: "buildbot+dbt_test_user_2@dbtlabs.com" - DBT_TEST_USER_3: "buildbot+dbt_test_user_3@dbtlabs.com" - docker: - # image based on `fishtownanalytics/test-container` w/ Simba ODBC Spark driver installed - - image: 828731156495.dkr.ecr.us-east-1.amazonaws.com/dbt-spark-odbc-test-container:latest - aws_auth: - aws_access_key_id: $AWS_ACCESS_KEY_ID_STAGING - aws_secret_access_key: $AWS_SECRET_ACCESS_KEY_STAGING - steps: - - checkout - - run: - name: Run integration tests - command: tox -e integration-spark-databricks-odbc-cluster - no_output_timeout: 1h - - store_artifacts: - path: ./logs - - integration-spark-databricks-odbc-endpoint: - <<: *databricks-odbc - steps: - - checkout - - run: - name: Run integration tests - command: tox -e integration-spark-databricks-odbc-sql-endpoint - no_output_timeout: 1h - - store_artifacts: - path: ./logs - -workflows: - version: 2 - test-everything: - jobs: - - unit - - integration-spark-session: - requires: - - unit - - integration-spark-thrift: - requires: - - unit - - integration-spark-databricks-http: - requires: - - integration-spark-thrift - - integration-spark-databricks-odbc-cluster: - context: aws-credentials - requires: - - integration-spark-thrift - - integration-spark-databricks-odbc-endpoint: - context: aws-credentials - requires: - - integration-spark-thrift diff --git a/README.md b/README.md index 2d2586795..7e95b1fc3 100644 --- a/README.md +++ b/README.md @@ -5,9 +5,6 @@ Unit Tests Badge - - Integration Tests Badge -

**[dbt](https://www.getdbt.com/)** enables data analysts and engineers to transform their data using the same practices that software engineers use to build applications. From d39806558844a5babd6c1c0ad8e4712be7b89a4f Mon Sep 17 00:00:00 2001 From: Colin Date: Mon, 8 Jan 2024 19:45:29 -0800 Subject: [PATCH 072/102] formatting --- dagger/requirements.txt | 2 +- dagger/run_dbt_spark_tests.py | 9 ++------- dagger/scripts/install_os_reqs.sh | 2 +- 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/dagger/requirements.txt b/dagger/requirements.txt index 3634ceeb7..df36543c2 100644 --- a/dagger/requirements.txt +++ b/dagger/requirements.txt @@ -1,2 +1,2 @@ dagger-io~=0.8.0 -python-dotenv \ No newline at end of file +python-dotenv diff --git a/dagger/run_dbt_spark_tests.py b/dagger/run_dbt_spark_tests.py index 864d9cad6..c9455bdde 100644 --- a/dagger/run_dbt_spark_tests.py +++ b/dagger/run_dbt_spark_tests.py @@ -11,7 +11,7 @@ load_dotenv(find_dotenv("test.env")) DEFAULT_ENV_VARS = { "DBT_TEST_USER_1": os.getenv("DBT_TEST_USER_1", "buildbot+dbt_test_user_1@dbtlabs.com"), - "DBT_TEST_USER_2": os.getenv("DBT_TEST_USER_2","buildbot+dbt_test_user_2@dbtlabs.com"), + "DBT_TEST_USER_2": os.getenv("DBT_TEST_USER_2", "buildbot+dbt_test_user_2@dbtlabs.com"), "DBT_TEST_USER_3": os.getenv("DBT_TEST_USER_3", "buildbot+dbt_test_user_3@dbtlabs.com"), } @@ -125,12 +125,7 @@ async def test_spark(test_args): tst_container = tst_container.with_(env_variables(DEFAULT_ENV_VARS)) test_path = test_args.test_path if test_args.test_path else "tests/functional/adapter" result = await tst_container.with_exec( - ["python", "-m", "pytest", - "-v", - "--profile", test_args.profile, - "-n", "auto", - test_path, - ] + ["pytest", "-v", "--profile", test_profile, "-n", "auto", test_path] ).stdout() return result diff --git a/dagger/scripts/install_os_reqs.sh b/dagger/scripts/install_os_reqs.sh index 47457b8d6..b50027f52 100755 --- a/dagger/scripts/install_os_reqs.sh +++ b/dagger/scripts/install_os_reqs.sh @@ -7,4 +7,4 @@ apt-get update && apt-get install -y --no-install-recommends \ unixodbc \ unixodbc-dev \ libsasl2-modules-gssapi-mit \ - unzip \ No newline at end of file + unzip From 6108d4405630639022346d48f5e8a9e39286757e Mon Sep 17 00:00:00 2001 From: Colin Date: Mon, 8 Jan 2024 19:52:59 -0800 Subject: [PATCH 073/102] update changie --- .changes/unreleased/Under the Hood-20230929-161218.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.changes/unreleased/Under the Hood-20230929-161218.yaml b/.changes/unreleased/Under the Hood-20230929-161218.yaml index c82e8252e..4dc54ae5c 100644 --- a/.changes/unreleased/Under the Hood-20230929-161218.yaml +++ b/.changes/unreleased/Under the Hood-20230929-161218.yaml @@ -1,6 +1,6 @@ kind: Under the Hood -body: Add Github action for integration testing +body: Add Github action for integration testing, use dagger-io to run tests. Remove circle ci workflow. 
time: 2023-09-29T16:12:18.968755+02:00 custom: - Author: JCZuurmond + Author: JCZuurmond, colin-rogers-dbt Issue: "719" From d472f3b61a4d84bc93323431638869a8ed1687b5 Mon Sep 17 00:00:00 2001 From: colin-rogers-dbt <111200756+colin-rogers-dbt@users.noreply.github.com> Date: Tue, 9 Jan 2024 09:33:57 -0800 Subject: [PATCH 074/102] Update .changes/unreleased/Under the Hood-20230929-161218.yaml Co-authored-by: Emily Rockman --- .changes/unreleased/Under the Hood-20230929-161218.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.changes/unreleased/Under the Hood-20230929-161218.yaml b/.changes/unreleased/Under the Hood-20230929-161218.yaml index 4dc54ae5c..9b5c6818b 100644 --- a/.changes/unreleased/Under the Hood-20230929-161218.yaml +++ b/.changes/unreleased/Under the Hood-20230929-161218.yaml @@ -1,5 +1,5 @@ kind: Under the Hood -body: Add Github action for integration testing, use dagger-io to run tests. Remove circle ci workflow. +body: Add GitHub action for integration testing and use dagger-io to run tests. Remove CircleCI workflow. time: 2023-09-29T16:12:18.968755+02:00 custom: Author: JCZuurmond, colin-rogers-dbt From ce92bcf4a9063d75beed734d9009a3e8f4be1dd0 Mon Sep 17 00:00:00 2001 From: Colin Date: Tue, 9 Jan 2024 09:50:03 -0800 Subject: [PATCH 075/102] formatting fixes and simplify env_var handling --- dagger/run_dbt_spark_tests.py | 26 ++++++++------------------ dagger/scripts/configure_odbc.sh | 1 - docker/Dockerfile | 6 +++--- 3 files changed, 11 insertions(+), 22 deletions(-) diff --git a/dagger/run_dbt_spark_tests.py b/dagger/run_dbt_spark_tests.py index c9455bdde..3e4c8347f 100644 --- a/dagger/run_dbt_spark_tests.py +++ b/dagger/run_dbt_spark_tests.py @@ -9,11 +9,13 @@ PG_PORT = 5432 load_dotenv(find_dotenv("test.env")) -DEFAULT_ENV_VARS = { - "DBT_TEST_USER_1": os.getenv("DBT_TEST_USER_1", "buildbot+dbt_test_user_1@dbtlabs.com"), - "DBT_TEST_USER_2": os.getenv("DBT_TEST_USER_2", "buildbot+dbt_test_user_2@dbtlabs.com"), - "DBT_TEST_USER_3": os.getenv("DBT_TEST_USER_3", "buildbot+dbt_test_user_3@dbtlabs.com"), -} +# if env vars aren't specified in test.env (i.e. 
in github actions worker), use the ones from the host +TESTING_ENV_VARS = {env_name: os.environ[env_name] for env_name in os.environ + if env_name.startswith(("DD_", "DBT_"))} + +TESTING_ENV_VARS.update({ + "ODBC_DRIVER": "/opt/simba/spark/lib/64/libsparkodbc_sb64.so", +}) def env_variables(envs: dict[str, str]): @@ -25,16 +27,6 @@ def env_variables_inner(ctr: dagger.Container): return env_variables_inner -def get_databricks_env_vars(): - return { - "DBT_DATABRICKS_TOKEN": os.environ["DBT_DATABRICKS_TOKEN"], - "DBT_DATABRICKS_HOST_NAME": os.environ["DBT_DATABRICKS_HOST_NAME"], - "DBT_DATABRICKS_ENDPOINT": os.environ["DBT_DATABRICKS_ENDPOINT"], - "DBT_DATABRICKS_CLUSTER_NAME": os.environ["DBT_DATABRICKS_CLUSTER_NAME"], - "ODBC_DRIVER": "/opt/simba/spark/lib/64/libsparkodbc_sb64.so", - } - - async def get_postgres_container(client: dagger.Client) -> (dagger.Container, str): ctr = await ( client.container() @@ -120,9 +112,7 @@ async def test_spark(test_args): tst_container = tst_container.with_exec(["pip", "install", "pyspark"]) tst_container = tst_container.with_exec(["apt-get", "install", "openjdk-17-jre", "-y"]) - if "databricks" in test_profile: - tst_container = tst_container.with_(env_variables(get_databricks_env_vars())) - tst_container = tst_container.with_(env_variables(DEFAULT_ENV_VARS)) + tst_container = tst_container.with_(env_variables(TESTING_ENV_VARS)) test_path = test_args.test_path if test_args.test_path else "tests/functional/adapter" result = await tst_container.with_exec( ["pytest", "-v", "--profile", test_profile, "-n", "auto", test_path] diff --git a/dagger/scripts/configure_odbc.sh b/dagger/scripts/configure_odbc.sh index 50e80914d..ddf020ad2 100755 --- a/dagger/scripts/configure_odbc.sh +++ b/dagger/scripts/configure_odbc.sh @@ -8,5 +8,4 @@ dpkg -i /tmp/SimbaSparkODBC-2.6.16.1019-Debian-64bit/simbaspark_2.6.16.1019-2_am echo "--------------------------------------------" echo "[Simba]\nDriver = /opt/simba/spark/lib/64/libsparkodbc_sb64.so" >> /etc/odbcinst.ini dpkg -l | grep Simba # confirm that the driver is installed -export ODBC_DRIVER="/opt/simba/spark/lib/64/libsparkodbc_sb64.so" rm -rf /tmp diff --git a/docker/Dockerfile b/docker/Dockerfile index 85d01ba8a..bb4d378ed 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -2,8 +2,8 @@ ARG OPENJDK_VERSION=8 FROM eclipse-temurin:${OPENJDK_VERSION}-jre ARG BUILD_DATE -ARG SPARK_VERSION=3.1.3 -ARG HADOOP_VERSION=3.2 +ARG SPARK_VERSION=3.3.2 +ARG HADOOP_VERSION=3 LABEL org.label-schema.name="Apache Spark ${SPARK_VERSION}" \ org.label-schema.build-date=$BUILD_DATE \ @@ -14,7 +14,7 @@ ENV PATH="/usr/spark/bin:/usr/spark/sbin:${PATH}" RUN apt-get update && \ apt-get install -y wget netcat procps libpostgresql-jdbc-java && \ - wget -q "https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" && \ + wget -q "http://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" && \ tar xzf "spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" && \ rm "spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" && \ mv "spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" /usr/spark && \ From 56b14bcd3702cfe85de73d3c8bdf6b794aeb1664 Mon Sep 17 00:00:00 2001 From: Colin Date: Tue, 9 Jan 2024 13:30:42 -0800 Subject: [PATCH 076/102] remove tox, update CONTRIBUTING.md and cleanup GHA workflows --- .github/workflows/integration.yml | 66 ++++-------------------- .github/workflows/main.yml | 12 ++--- CONTRIBUTING.md | 24 
+++++++-- dagger/run_dbt_spark_tests.py | 2 +- tox.ini | 83 ------------------------------- 5 files changed, 33 insertions(+), 154 deletions(-) delete mode 100644 tox.ini diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 67b6ed8e3..53fb9c2ac 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -15,8 +15,14 @@ on: branches: - "main" - "*.latest" - - "releases/*" + pull_request_target: + paths-ignore: + - ".changes/**" + - ".flake8" + - ".gitignore" + - "**.md" + workflow_dispatch: inputs: dbt-core-branch: @@ -37,60 +43,9 @@ defaults: shell: bash jobs: - # generate test metadata about what files changed and the testing matrix to use - test-metadata: - # run if not a PR from a forked repository or has a label to mark as safe to test - if: >- - github.event_name != 'pull_request_target' || - github.event.pull_request.head.repo.full_name == github.repository || - contains(github.event.pull_request.labels.*.name, 'ok to test') - runs-on: ubuntu-latest - outputs: - matrix: ${{ steps.generate-matrix.outputs.result }} - - steps: - - name: Check out the repository (non-PR) - if: github.event_name != 'pull_request_target' - uses: actions/checkout@v3 - with: - persist-credentials: false - - - name: Check out the repository (PR) - if: github.event_name == 'pull_request_target' - uses: actions/checkout@v3 - with: - persist-credentials: false - ref: ${{ github.event.pull_request.head.sha }} - - - name: Check if relevant files changed - if: github.event_name == 'pull_request_target' - # https://github.com/marketplace/actions/paths-changes-filter - # For each filter, it sets output variable named by the filter to the text: - # 'true' - if any of changed files matches any of filter rules - # 'false' - if none of changed files matches any of filter rules - # also, returns: - # `changes` - JSON array with names of all filters matching any of the changed files - uses: dorny/paths-filter@v2 - id: get-changes - with: - token: ${{ secrets.GITHUB_TOKEN }} - filters: | - spark: - - 'dbt/**' - - 'tests/**' - - 'dev-requirements.txt' test: - name: ${{ matrix.test }} / python ${{ matrix.python-version }} / ubuntu-latest - - # run if not a PR from a forked repository or has a label to mark as safe to test - # also checks that the matrix generated is not empty - if: >- - ( - github.event_name != 'pull_request_target' || - github.event.pull_request.head.repo.full_name == github.repository || - contains(github.event.pull_request.labels.*.name, 'ok to test') - ) + name: ${{ matrix.test }} runs-on: ubuntu-latest strategy: @@ -104,8 +59,6 @@ jobs: - "databricks_http_cluster" env: - TOXENV: integration-${{ matrix.test }} - PYTEST_ADDOPTS: "-v --color=yes --csv integration_results.csv" DBT_INVOCATION_ENV: github-actions DD_CIVISIBILITY_AGENTLESS_ENABLED: true DD_API_KEY: ${{ secrets.DATADOG_API_KEY }} @@ -138,7 +91,7 @@ jobs: persist-credentials: false ref: ${{ github.event.pull_request.head.sha }} - - name: Set up Python ${{ matrix.python-version }} + - name: Set up Python uses: actions/setup-python@v4 with: python-version: "3.11" @@ -155,6 +108,5 @@ jobs: pip install bumpversion ./.github/scripts/update_dbt_core_branch.sh ${{ inputs.dbt-core-branch }} - - name: Run tests for ${{ matrix.test }} run: python dagger/run_dbt_spark_tests.py --profile ${{ matrix.test }} diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 30126325e..338413116 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -19,7 
+19,6 @@ on: branches: - "main" - "*.latest" - - "releases/*" pull_request: workflow_dispatch: @@ -81,10 +80,6 @@ jobs: matrix: python-version: ["3.8", "3.9", "3.10", "3.11"] - env: - TOXENV: "unit" - PYTEST_ADDOPTS: "-v --color=yes --csv unit_results.csv" - steps: - name: Check out the repository uses: actions/checkout@v3 @@ -100,10 +95,9 @@ jobs: sudo apt-get install libsasl2-dev python -m pip install --user --upgrade pip python -m pip --version - python -m pip install tox - tox --version - - name: Run tox - run: tox + + - name: Run unit tests + run: python -m pytest --color=yes --csv unit_results.csv -v tests/unit - name: Get current date if: always() diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a61306ea5..9145436b6 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -65,11 +65,27 @@ $EDITOR test.env ### Test commands There are a few methods for running tests locally. -#### `tox` -`tox` takes care of managing Python virtualenvs and installing dependencies in order to run tests. You can also run tests in parallel, for example you can run unit tests for Python 3.8, Python 3.9, and `flake8` checks in parallel with `tox -p`. Also, you can run unit tests for specific python versions with `tox -e py38`. The configuration of these tests are located in `tox.ini`. +#### dagger +To run functional tests we rely on [dagger](https://dagger.io/). This launches a virtual container or containers to test against. -#### `pytest` -Finally, you can also run a specific test or group of tests using `pytest` directly. With a Python virtualenv active and dev dependencies installed you can do things like: +```sh +pip install -r dagger/requirements.txt +python dagger/run_dbt_spark_tests.py --profile databricks_sql_endpoint --test-path tests/functional/adapter/test_basic.py::TestSimpleMaterializationsSpark::test_base +``` + +`--profile`: required, this is the kind of spark connection to test against + +_options_: + - "apache_spark" + - "spark_session" + - "databricks_sql_endpoint" + - "databricks_cluster" + - "databricks_http_cluster" + +`--test-path`: optional, this is the path to the test file you want to run. If not specified, all tests will be run. + +#### pytest +Finally, you can also run a specific test or group of tests using `pytest` directly (if you have all the dependencies set up on your machine). 
With a Python virtualenv active and dev dependencies installed you can do things like: ```sh # run all functional tests diff --git a/dagger/run_dbt_spark_tests.py b/dagger/run_dbt_spark_tests.py index 3e4c8347f..4cb16f7a0 100644 --- a/dagger/run_dbt_spark_tests.py +++ b/dagger/run_dbt_spark_tests.py @@ -14,7 +14,7 @@ if env_name.startswith(("DD_", "DBT_"))} TESTING_ENV_VARS.update({ - "ODBC_DRIVER": "/opt/simba/spark/lib/64/libsparkodbc_sb64.so", + "ODBC_DRIVER": "Simba", }) diff --git a/tox.ini b/tox.ini deleted file mode 100644 index 31396b5ef..000000000 --- a/tox.ini +++ /dev/null @@ -1,83 +0,0 @@ -[tox] -skipsdist = True -envlist = unit, flake8, integration-spark-thrift - -[testenv:{unit,py38,py39,py310,py}] -allowlist_externals = - /bin/bash -commands = /bin/bash -c '{envpython} -m pytest -v {posargs} tests/unit' -passenv = - DBT_* - PYTEST_ADDOPTS -deps = - -r{toxinidir}/requirements.txt - -r{toxinidir}/dev-requirements.txt - -[testenv:integration-spark-databricks-http] -allowlist_externals = - /bin/bash -basepython = python3.8 -commands = /bin/bash -c '{envpython} -m pytest -v --profile databricks_http_cluster {posargs} -n4 tests/functional/adapter/*' -passenv = - DBT_* - PYTEST_ADDOPTS -deps = - -r{toxinidir}/requirements.txt - -r{toxinidir}/dev-requirements.txt - -e. - -[testenv:integration-spark-databricks-odbc-cluster] -allowlist_externals = - /bin/bash -basepython = python3.8 -commands = /bin/bash -c '{envpython} -m pytest -v --profile databricks_cluster {posargs} -n4 tests/functional/adapter/*' -passenv = - DBT_* - PYTEST_ADDOPTS - ODBC_DRIVER -deps = - -r{toxinidir}/requirements.txt - -r{toxinidir}/dev-requirements.txt - -e. - -[testenv:integration-spark-databricks-odbc-sql-endpoint] -allowlist_externals = - /bin/bash -basepython = python3.8 -commands = /bin/bash -c '{envpython} -m pytest -v --profile databricks_sql_endpoint {posargs} -n4 tests/functional/adapter/*' -passenv = - DBT_* - PYTEST_ADDOPTS - ODBC_DRIVER -deps = - -r{toxinidir}/requirements.txt - -r{toxinidir}/dev-requirements.txt - -e. - - -[testenv:integration-spark-thrift] -description = run integration tests against a Spark thrift server -allowlist_externals = - /bin/bash -passenv = - DBT_* - PYTEST_ADDOPTS -deps = - -r{toxinidir}/requirements.txt - -r{toxinidir}/dev-requirements.txt - -e. 
-commands = /bin/bash -c '{envpython} -m pytest -v --profile apache_spark {posargs} -n4 tests/functional/adapter/*' - -[testenv:integration-spark-session] -description = run integration tests against a Spark session -allowlist_externals = - /bin/bash -passenv = - DBT_* - PYTEST_* - PIP_CACHE_DIR -deps = - -r{toxinidir}/requirements.txt - -r{toxinidir}/dev-requirements.txt - -e.[session] -commands = /bin/bash -c '{envpython} -m pytest -v --profile spark_session {posargs} -n4 tests/functional/adapter/*' From 9849c1c2b4e3c14a772ef59b5f331e0b5785d673 Mon Sep 17 00:00:00 2001 From: Colin Date: Tue, 9 Jan 2024 13:34:17 -0800 Subject: [PATCH 077/102] remove tox, update CONTRIBUTING.md and cleanup GHA workflows --- .github/workflows/integration.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 53fb9c2ac..e2f0dcfdc 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -73,7 +73,6 @@ jobs: DBT_TEST_USER_1: "buildbot+dbt_test_user_1@dbtlabs.com" DBT_TEST_USER_2: "buildbot+dbt_test_user_2@dbtlabs.com" DBT_TEST_USER_3: "buildbot+dbt_test_user_3@dbtlabs.com" - ODBC_DRIVER: "Simba" steps: - name: Check out the repository From f9a4c585a263d0a76c009ba1c9c7acc30f3bf462 Mon Sep 17 00:00:00 2001 From: Colin Date: Tue, 9 Jan 2024 13:42:48 -0800 Subject: [PATCH 078/102] install test reqs in main.yml --- .github/workflows/main.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 338413116..c16a16206 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -95,6 +95,8 @@ jobs: sudo apt-get install libsasl2-dev python -m pip install --user --upgrade pip python -m pip --version + python -m pip install -e . + python -m pip install -r dev-requirements.txt - name: Run unit tests run: python -m pytest --color=yes --csv unit_results.csv -v tests/unit From bbe17a8fa8a2c181d5d98aafdf12eba9c371d96e Mon Sep 17 00:00:00 2001 From: Colin Date: Tue, 9 Jan 2024 13:45:37 -0800 Subject: [PATCH 079/102] install test reqs in main.yml --- .github/workflows/main.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index c16a16206..20f3f88f4 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -95,8 +95,9 @@ jobs: sudo apt-get install libsasl2-dev python -m pip install --user --upgrade pip python -m pip --version - python -m pip install -e . + python -m pip install -r requirements.txt python -m pip install -r dev-requirements.txt + python -m pip install -e . - name: Run unit tests run: python -m pytest --color=yes --csv unit_results.csv -v tests/unit From 3f44e9663db6606a9fe0c5d5208ab2c2d31a791b Mon Sep 17 00:00:00 2001 From: Colin Date: Tue, 9 Jan 2024 13:51:23 -0800 Subject: [PATCH 080/102] formatting --- CONTRIBUTING.md | 4 ++-- dagger/run_dbt_spark_tests.py | 11 ++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 9145436b6..6fcaacea8 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -66,14 +66,14 @@ $EDITOR test.env There are a few methods for running tests locally. #### dagger -To run functional tests we rely on [dagger](https://dagger.io/). This launches a virtual container or containers to test against. +To run functional tests we rely on [dagger](https://dagger.io/). This launches a virtual container or containers to test against. 
```sh pip install -r dagger/requirements.txt python dagger/run_dbt_spark_tests.py --profile databricks_sql_endpoint --test-path tests/functional/adapter/test_basic.py::TestSimpleMaterializationsSpark::test_base ``` -`--profile`: required, this is the kind of spark connection to test against +`--profile`: required, this is the kind of spark connection to test against _options_: - "apache_spark" diff --git a/dagger/run_dbt_spark_tests.py b/dagger/run_dbt_spark_tests.py index 4cb16f7a0..dd1a4395d 100644 --- a/dagger/run_dbt_spark_tests.py +++ b/dagger/run_dbt_spark_tests.py @@ -10,12 +10,13 @@ PG_PORT = 5432 load_dotenv(find_dotenv("test.env")) # if env vars aren't specified in test.env (i.e. in github actions worker), use the ones from the host -TESTING_ENV_VARS = {env_name: os.environ[env_name] for env_name in os.environ - if env_name.startswith(("DD_", "DBT_"))} +TESTING_ENV_VARS = { + env_name: os.environ[env_name] + for env_name in os.environ + if env_name.startswith(("DD_", "DBT_")) +} -TESTING_ENV_VARS.update({ - "ODBC_DRIVER": "Simba", -}) +TESTING_ENV_VARS.update({"ODBC_DRIVER": "Simba"}) def env_variables(envs: dict[str, str]): From afd3866a4b39c0df0999bbcbc333d78eff9927eb Mon Sep 17 00:00:00 2001 From: Colin Date: Wed, 10 Jan 2024 09:59:30 -0800 Subject: [PATCH 081/102] remove tox from dev-requirements.txt and Makefile --- Makefile | 7 ++++--- dev-requirements.txt | 1 - 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index cc1d9f75d..2bd1055fa 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ .PHONY: dev dev: ## Installs adapter in develop mode along with development dependencies @\ - pip install -e . -r requirements.txt -r dev-requirements.txt && pre-commit install + pip install -e . -r requirements.txt -r dev-requirements.txt -r dagger/requirements.txt && pre-commit install .PHONY: dev-uninstall dev-uninstall: ## Uninstalls all packages while maintaining the virtual environment @@ -40,12 +40,13 @@ linecheck: ## Checks for all Python lines 100 characters or more .PHONY: unit unit: ## Runs unit tests with py38. @\ - tox -e py38 + python -m pytest tests/unit .PHONY: test test: ## Runs unit tests with py38 and code checks against staged changes. 
@\ - tox -p -e py38; \ + python -m pytest tests/unit; \ + python dagger/run_dbt_spark_tests.py --profile spark_session \ pre-commit run black-check --hook-stage manual | grep -v "INFO"; \ pre-commit run flake8-check --hook-stage manual | grep -v "INFO"; \ pre-commit run mypy-check --hook-stage manual | grep -v "INFO" diff --git a/dev-requirements.txt b/dev-requirements.txt index bb3282b44..765482e25 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -22,7 +22,6 @@ pytest-dotenv~=0.5.2 pytest-logbook~=1.2 pytest-xdist~=3.5 pytz~=2023.3 -tox~=4.11 types-pytz~=2023.3 types-requests~=2.31 twine~=4.0 From 259ebc7cbe75a7f22bff8075e7c7bba0581cd585 Mon Sep 17 00:00:00 2001 From: Colin Date: Wed, 10 Jan 2024 10:33:50 -0800 Subject: [PATCH 082/102] clarify spark crt instantiation --- dagger/run_dbt_spark_tests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dagger/run_dbt_spark_tests.py b/dagger/run_dbt_spark_tests.py index dd1a4395d..718519909 100644 --- a/dagger/run_dbt_spark_tests.py +++ b/dagger/run_dbt_spark_tests.py @@ -41,7 +41,7 @@ async def get_postgres_container(client: dagger.Client) -> (dagger.Container, st async def get_spark_container(client: dagger.Client) -> (dagger.Container, str): spark_dir = client.host().directory("./dagger/spark-container") - spark_ctr = ( + spark_ctr_base = ( client.container() .from_("eclipse-temurin:8-jre") .with_directory("/spark_setup", spark_dir) @@ -66,7 +66,7 @@ async def get_spark_container(client: dagger.Client) -> (dagger.Container, str): pg_ctr, pg_host = await get_postgres_container(client) spark_ctr = ( - spark_ctr.with_service_binding(alias=pg_host, service=pg_ctr) + spark_ctr_base.with_service_binding(alias=pg_host, service=pg_ctr) .with_exec( [ "/scripts/entrypoint.sh", From a8a7010d934c951512cd66f8b8cbf13d71c45176 Mon Sep 17 00:00:00 2001 From: Colin Date: Wed, 10 Jan 2024 11:52:57 -0800 Subject: [PATCH 083/102] add comments on python-version --- .github/workflows/integration.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index e2f0dcfdc..94dece350 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -90,7 +90,8 @@ jobs: persist-credentials: false ref: ${{ github.event.pull_request.head.sha }} - - name: Set up Python + # the python version used here is not what is used in the tests themselves + - name: Set up Python for dagger uses: actions/setup-python@v4 with: python-version: "3.11" From fcf074b0510b163523b8ac998c605a39e6ead7bd Mon Sep 17 00:00:00 2001 From: Colin Date: Wed, 10 Jan 2024 13:41:29 -0800 Subject: [PATCH 084/102] initial migration changes --- dbt/adapters/spark/column.py | 2 +- dbt/adapters/spark/connections.py | 9 +++++---- dbt/adapters/spark/impl.py | 13 +++++++------ dbt/adapters/spark/relation.py | 2 +- dbt/adapters/spark/session.py | 2 +- dev-requirements.txt | 4 ++-- 6 files changed, 17 insertions(+), 15 deletions(-) diff --git a/dbt/adapters/spark/column.py b/dbt/adapters/spark/column.py index a57fa0565..dbc872051 100644 --- a/dbt/adapters/spark/column.py +++ b/dbt/adapters/spark/column.py @@ -2,7 +2,7 @@ from typing import Any, Dict, Optional, TypeVar, Union from dbt.adapters.base.column import Column -from dbt.dataclass_schema import dbtClassMixin +from dbt.common.dataclass_schema import dbtClassMixin Self = TypeVar("Self", bound="SparkColumn") diff --git a/dbt/adapters/spark/connections.py b/dbt/adapters/spark/connections.py index 966f5584e..76390a2bc 
100644 --- a/dbt/adapters/spark/connections.py +++ b/dbt/adapters/spark/connections.py @@ -2,11 +2,13 @@ import dbt.exceptions from dbt.adapters.base import Credentials +from dbt.adapters.contracts.connection import AdapterResponse, ConnectionState +from dbt.adapters.events.logging import AdapterLogger from dbt.adapters.sql import SQLConnectionManager -from dbt.contracts.connection import ConnectionState, AdapterResponse -from dbt.events import AdapterLogger + from dbt.utils import DECIMALS from dbt.adapters.spark import __version__ +from dbt.adapters.spark.session import Connection try: from TCLIService.ttypes import TOperationState as ThriftState @@ -22,8 +24,7 @@ pyodbc = None from datetime import datetime import sqlparams -from dbt.contracts.connection import Connection -from dbt.dataclass_schema import StrEnum +from dbt.common.dataclass_schema import StrEnum from dataclasses import dataclass, field from typing import Any, Dict, Optional, Union, Tuple, List, Generator, Iterable, Sequence diff --git a/dbt/adapters/spark/impl.py b/dbt/adapters/spark/impl.py index 16c3a3cb7..325139911 100644 --- a/dbt/adapters/spark/impl.py +++ b/dbt/adapters/spark/impl.py @@ -4,6 +4,9 @@ from typing import Any, Dict, Iterable, List, Optional, Union, Type, Tuple, Callable, Set from dbt.adapters.base.relation import InformationSchema +from dbt.adapters.contracts.connection import AdapterResponse +from dbt.adapters.events.logging import AdapterLogger +from dbt.common.utils import AttrDict, executor from dbt.contracts.graph.manifest import Manifest from typing_extensions import TypeAlias @@ -13,6 +16,7 @@ import dbt import dbt.exceptions + from dbt.adapters.base import AdapterConfig, PythonJobHelper from dbt.adapters.base.impl import catch_as_completed, ConstraintSupport from dbt.adapters.sql import SQLAdapter @@ -24,12 +28,9 @@ AllPurposeClusterPythonJobHelper, ) from dbt.adapters.base import BaseRelation -from dbt.clients.agate_helper import DEFAULT_TYPE_TESTER -from dbt.contracts.connection import AdapterResponse -from dbt.contracts.graph.nodes import ConstraintType -from dbt.contracts.relation import RelationType -from dbt.events import AdapterLogger -from dbt.utils import executor, AttrDict +from dbt.adapters.contracts.relation import RelationType +from dbt.common.clients.agate_helper import DEFAULT_TYPE_TESTER +from dbt.common.contracts.constraints import ConstraintType logger = AdapterLogger("Spark") diff --git a/dbt/adapters/spark/relation.py b/dbt/adapters/spark/relation.py index e80f2623f..1fa1272f4 100644 --- a/dbt/adapters/spark/relation.py +++ b/dbt/adapters/spark/relation.py @@ -2,9 +2,9 @@ from dataclasses import dataclass, field from dbt.adapters.base.relation import BaseRelation, Policy +from dbt.adapters.events.logging import AdapterLogger from dbt.exceptions import DbtRuntimeError -from dbt.events import AdapterLogger logger = AdapterLogger("Spark") diff --git a/dbt/adapters/spark/session.py b/dbt/adapters/spark/session.py index b5b2bebdb..1def33be1 100644 --- a/dbt/adapters/spark/session.py +++ b/dbt/adapters/spark/session.py @@ -7,7 +7,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union, Sequence from dbt.adapters.spark.connections import SparkConnectionWrapper -from dbt.events import AdapterLogger +from dbt.adapters.events.logging import AdapterLogger from dbt.utils import DECIMALS from dbt.exceptions import DbtRuntimeError from pyspark.sql import DataFrame, Row, SparkSession diff --git a/dev-requirements.txt b/dev-requirements.txt index 765482e25..e56b221c7 100644 --- 
a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,7 +1,7 @@ # install latest changes in dbt-core # TODO: how to automate switching from develop to version branches? -git+https://github.com/dbt-labs/dbt-core.git@c2bc2f009bbeeb46b3c69d082ab4d485597898af#egg=dbt-core&subdirectory=core -git+https://github.com/dbt-labs/dbt-core.git@c2bc2f009bbeeb46b3c69d082ab4d485597898af#egg=dbt-tests-adapter&subdirectory=tests/adapter +git+https://github.com/dbt-labs/dbt-core.git#egg=dbt-core&subdirectory=core +git+https://github.com/dbt-labs/dbt-core.git#egg=dbt-tests-adapter&subdirectory=tests/adapter # if version 1.x or greater -> pin to major version # if version 0.x -> pin to minor From 1b1fcec674317324527c8f2900549157adf6d8b2 Mon Sep 17 00:00:00 2001 From: Colin Date: Wed, 10 Jan 2024 13:42:16 -0800 Subject: [PATCH 085/102] unpin --- dev-requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dev-requirements.txt b/dev-requirements.txt index 765482e25..e56b221c7 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,7 +1,7 @@ # install latest changes in dbt-core # TODO: how to automate switching from develop to version branches? -git+https://github.com/dbt-labs/dbt-core.git@c2bc2f009bbeeb46b3c69d082ab4d485597898af#egg=dbt-core&subdirectory=core -git+https://github.com/dbt-labs/dbt-core.git@c2bc2f009bbeeb46b3c69d082ab4d485597898af#egg=dbt-tests-adapter&subdirectory=tests/adapter +git+https://github.com/dbt-labs/dbt-core.git#egg=dbt-core&subdirectory=core +git+https://github.com/dbt-labs/dbt-core.git#egg=dbt-tests-adapter&subdirectory=tests/adapter # if version 1.x or greater -> pin to major version # if version 0.x -> pin to minor From 0a2b73db07bd2128519b90952985a837a38d9a01 Mon Sep 17 00:00:00 2001 From: Colin Date: Wed, 10 Jan 2024 17:19:03 -0800 Subject: [PATCH 086/102] implement core / adapters decoupling --- dagger/run_dbt_spark_tests.py | 3 ++- dbt/adapters/spark/connections.py | 12 ++++++------ dbt/adapters/spark/impl.py | 11 +++++------ dbt/adapters/spark/relation.py | 2 +- dbt/adapters/spark/session.py | 4 ++-- tests/unit/test_adapter.py | 27 ++++++++++++++------------- tests/unit/utils.py | 2 +- 7 files changed, 31 insertions(+), 30 deletions(-) diff --git a/dagger/run_dbt_spark_tests.py b/dagger/run_dbt_spark_tests.py index 718519909..2fde4a25d 100644 --- a/dagger/run_dbt_spark_tests.py +++ b/dagger/run_dbt_spark_tests.py @@ -2,6 +2,7 @@ import argparse import sys +from typing import Dict import anyio as anyio import dagger as dagger @@ -19,7 +20,7 @@ TESTING_ENV_VARS.update({"ODBC_DRIVER": "Simba"}) -def env_variables(envs: dict[str, str]): +def env_variables(envs: Dict[str, str]): def env_variables_inner(ctr: dagger.Container): for key, value in envs.items(): ctr = ctr.with_env_variable(key, value) diff --git a/dbt/adapters/spark/connections.py b/dbt/adapters/spark/connections.py index 76390a2bc..fa6f48f52 100644 --- a/dbt/adapters/spark/connections.py +++ b/dbt/adapters/spark/connections.py @@ -2,13 +2,13 @@ import dbt.exceptions from dbt.adapters.base import Credentials -from dbt.adapters.contracts.connection import AdapterResponse, ConnectionState +from dbt.adapters.contracts.connection import AdapterResponse, ConnectionState, Connection from dbt.adapters.events.logging import AdapterLogger from dbt.adapters.sql import SQLConnectionManager +from dbt.common.exceptions import DbtConfigError -from dbt.utils import DECIMALS +from dbt.common.utils.encoding import DECIMALS from 
dbt.adapters.spark import __version__ -from dbt.adapters.spark.session import Connection try: from TCLIService.ttypes import TOperationState as ThriftState @@ -391,7 +391,7 @@ def validate_creds(cls, creds: Any, required: Iterable[str]) -> None: for key in required: if not hasattr(creds, key): - raise dbt.exceptions.DbtProfileError( + raise DbtConfigError( "The config '{}' is required when using the {} method" " to connect to Spark".format(key, method) ) @@ -482,7 +482,7 @@ def open(cls, connection: Connection) -> Connection: endpoint=creds.endpoint ) else: - raise dbt.exceptions.DbtProfileError( + raise DbtConfigError( "Either `cluster` or `endpoint` must set when" " using the odbc method to connect to Spark" ) @@ -526,7 +526,7 @@ def open(cls, connection: Connection) -> Connection: Connection(server_side_parameters=creds.server_side_parameters) ) else: - raise dbt.exceptions.DbtProfileError( + raise DbtConfigError( f"invalid credential method: {creds.method}" ) break diff --git a/dbt/adapters/spark/impl.py b/dbt/adapters/spark/impl.py index 325139911..8cc7d848b 100644 --- a/dbt/adapters/spark/impl.py +++ b/dbt/adapters/spark/impl.py @@ -7,7 +7,6 @@ from dbt.adapters.contracts.connection import AdapterResponse from dbt.adapters.events.logging import AdapterLogger from dbt.common.utils import AttrDict, executor -from dbt.contracts.graph.manifest import Manifest from typing_extensions import TypeAlias @@ -28,7 +27,7 @@ AllPurposeClusterPythonJobHelper, ) from dbt.adapters.base import BaseRelation -from dbt.adapters.contracts.relation import RelationType +from dbt.adapters.contracts.relation import RelationType, RelationConfig from dbt.common.clients.agate_helper import DEFAULT_TYPE_TESTER from dbt.common.contracts.constraints import ConstraintType @@ -353,9 +352,9 @@ def _get_columns_for_catalog(self, relation: BaseRelation) -> Iterable[Dict[str, yield as_dict def get_catalog( - self, manifest: Manifest, selected_nodes: Optional[Set] = None + self, relation_configs: Iterable[RelationConfig], selected_nodes: Optional[Set] = None ) -> Tuple[agate.Table, List[Exception]]: - schema_map = self._get_catalog_schemas(manifest) + schema_map = self._get_catalog_schemas(relation_configs) if len(schema_map) > 1: raise dbt.exceptions.CompilationError( f"Expected only one database in get_catalog, found " f"{list(schema_map)}" @@ -372,7 +371,7 @@ def get_catalog( self._get_one_catalog, info, [schema], - manifest, + relation_configs, ) ) catalogs, exceptions = catch_as_completed(futures) @@ -382,7 +381,7 @@ def _get_one_catalog( self, information_schema: InformationSchema, schemas: Set[str], - manifest: Manifest, + relation_configs: Iterable[RelationConfig], ) -> agate.Table: if len(schemas) != 1: raise dbt.exceptions.CompilationError( diff --git a/dbt/adapters/spark/relation.py b/dbt/adapters/spark/relation.py index 1fa1272f4..a6d679d56 100644 --- a/dbt/adapters/spark/relation.py +++ b/dbt/adapters/spark/relation.py @@ -4,7 +4,7 @@ from dbt.adapters.base.relation import BaseRelation, Policy from dbt.adapters.events.logging import AdapterLogger -from dbt.exceptions import DbtRuntimeError +from dbt.common.exceptions import DbtRuntimeError logger = AdapterLogger("Spark") diff --git a/dbt/adapters/spark/session.py b/dbt/adapters/spark/session.py index 1def33be1..d5d3ff050 100644 --- a/dbt/adapters/spark/session.py +++ b/dbt/adapters/spark/session.py @@ -8,8 +8,8 @@ from dbt.adapters.spark.connections import SparkConnectionWrapper from dbt.adapters.events.logging import AdapterLogger -from dbt.utils import 
DECIMALS -from dbt.exceptions import DbtRuntimeError +from dbt.common.utils.encoding import DECIMALS +from dbt.common.exceptions import DbtRuntimeError from pyspark.sql import DataFrame, Row, SparkSession from pyspark.sql.utils import AnalysisException diff --git a/tests/unit/test_adapter.py b/tests/unit/test_adapter.py index a7da63301..b46f7eef6 100644 --- a/tests/unit/test_adapter.py +++ b/tests/unit/test_adapter.py @@ -1,4 +1,5 @@ import unittest +from multiprocessing import get_context from unittest import mock import dbt.flags as flags @@ -146,7 +147,7 @@ def _get_target_odbc_sql_endpoint(self, project): def test_http_connection(self): config = self._get_target_http(self.project_cfg) - adapter = SparkAdapter(config) + adapter = SparkAdapter(config, get_context("spawn")) def hive_http_connect(thrift_transport, configuration): self.assertEqual(thrift_transport.scheme, "https") @@ -171,7 +172,7 @@ def hive_http_connect(thrift_transport, configuration): def test_thrift_connection(self): config = self._get_target_thrift(self.project_cfg) - adapter = SparkAdapter(config) + adapter = SparkAdapter(config, get_context("spawn")) def hive_thrift_connect( host, port, username, auth, kerberos_service_name, password, configuration @@ -195,7 +196,7 @@ def hive_thrift_connect( def test_thrift_ssl_connection(self): config = self._get_target_use_ssl_thrift(self.project_cfg) - adapter = SparkAdapter(config) + adapter = SparkAdapter(config, get_context("spawn")) def hive_thrift_connect(thrift_transport, configuration): self.assertIsNotNone(thrift_transport) @@ -215,7 +216,7 @@ def hive_thrift_connect(thrift_transport, configuration): def test_thrift_connection_kerberos(self): config = self._get_target_thrift_kerberos(self.project_cfg) - adapter = SparkAdapter(config) + adapter = SparkAdapter(config, get_context("spawn")) def hive_thrift_connect( host, port, username, auth, kerberos_service_name, password, configuration @@ -239,7 +240,7 @@ def hive_thrift_connect( def test_odbc_cluster_connection(self): config = self._get_target_odbc_cluster(self.project_cfg) - adapter = SparkAdapter(config) + adapter = SparkAdapter(config, get_context("spawn")) def pyodbc_connect(connection_str, autocommit): self.assertTrue(autocommit) @@ -266,7 +267,7 @@ def pyodbc_connect(connection_str, autocommit): def test_odbc_endpoint_connection(self): config = self._get_target_odbc_sql_endpoint(self.project_cfg) - adapter = SparkAdapter(config) + adapter = SparkAdapter(config, get_context("spawn")) def pyodbc_connect(connection_str, autocommit): self.assertTrue(autocommit) @@ -329,7 +330,7 @@ def test_parse_relation(self): input_cols = [Row(keys=["col_name", "data_type"], values=r) for r in plain_rows] config = self._get_target_http(self.project_cfg) - rows = SparkAdapter(config).parse_describe_extended(relation, input_cols) + rows = SparkAdapter(config, get_context("spawn")).parse_describe_extended(relation, input_cols) self.assertEqual(len(rows), 4) self.assertEqual( rows[0].to_column_dict(omit_none=False), @@ -418,7 +419,7 @@ def test_parse_relation_with_integer_owner(self): input_cols = [Row(keys=["col_name", "data_type"], values=r) for r in plain_rows] config = self._get_target_http(self.project_cfg) - rows = SparkAdapter(config).parse_describe_extended(relation, input_cols) + rows = SparkAdapter(config, get_context("spawn")).parse_describe_extended(relation, input_cols) self.assertEqual(rows[0].to_column_dict().get("table_owner"), "1234") @@ -454,7 +455,7 @@ def test_parse_relation_with_statistics(self): input_cols = 
[Row(keys=["col_name", "data_type"], values=r) for r in plain_rows] config = self._get_target_http(self.project_cfg) - rows = SparkAdapter(config).parse_describe_extended(relation, input_cols) + rows = SparkAdapter(config, get_context("spawn")).parse_describe_extended(relation, input_cols) self.assertEqual(len(rows), 1) self.assertEqual( rows[0].to_column_dict(omit_none=False), @@ -483,7 +484,7 @@ def test_parse_relation_with_statistics(self): def test_relation_with_database(self): config = self._get_target_http(self.project_cfg) - adapter = SparkAdapter(config) + adapter = SparkAdapter(config, get_context("spawn")) # fine adapter.Relation.create(schema="different", identifier="table") with self.assertRaises(DbtRuntimeError): @@ -564,7 +565,7 @@ def test_parse_columns_from_information_with_table_type_and_delta_provider(self) ) config = self._get_target_http(self.project_cfg) - columns = SparkAdapter(config).parse_columns_from_information(relation) + columns = SparkAdapter(config, get_context("spawn")).parse_columns_from_information(relation) self.assertEqual(len(columns), 4) self.assertEqual( columns[0].to_column_dict(omit_none=False), @@ -649,7 +650,7 @@ def test_parse_columns_from_information_with_view_type(self): ) config = self._get_target_http(self.project_cfg) - columns = SparkAdapter(config).parse_columns_from_information(relation) + columns = SparkAdapter(config, get_context("spawn")).parse_columns_from_information(relation) self.assertEqual(len(columns), 4) self.assertEqual( columns[1].to_column_dict(omit_none=False), @@ -715,7 +716,7 @@ def test_parse_columns_from_information_with_table_type_and_parquet_provider(sel ) config = self._get_target_http(self.project_cfg) - columns = SparkAdapter(config).parse_columns_from_information(relation) + columns = SparkAdapter(config, get_context("spawn")).parse_columns_from_information(relation) self.assertEqual(len(columns), 4) self.assertEqual( diff --git a/tests/unit/utils.py b/tests/unit/utils.py index ac8c62244..a32d6608d 100644 --- a/tests/unit/utils.py +++ b/tests/unit/utils.py @@ -9,7 +9,7 @@ import agate import pytest -from dbt.dataclass_schema import ValidationError +from dbt.common.dataclass_schema import ValidationError from dbt.config.project import PartialProject From bd86ee1a3b7f9eebccf6410b78e51681244b05e0 Mon Sep 17 00:00:00 2001 From: Colin Date: Thu, 11 Jan 2024 11:28:01 -0800 Subject: [PATCH 087/102] fix list_relations --- dbt/adapters/spark/impl.py | 6 ++++++ dbt/include/spark/macros/adapters.sql | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/dbt/adapters/spark/impl.py b/dbt/adapters/spark/impl.py index 8cc7d848b..abc6a6ff6 100644 --- a/dbt/adapters/spark/impl.py +++ b/dbt/adapters/spark/impl.py @@ -1,3 +1,4 @@ +import os import re from concurrent.futures import Future from dataclasses import dataclass @@ -32,6 +33,11 @@ from dbt.common.contracts.constraints import ConstraintType logger = AdapterLogger("Spark") +packages = ["pyhive.hive", "thrift.transport", "thrift.protocol"] +log_level = os.getenv("DBT_SPARK_LOG_LEVEL", "ERROR") +for package in packages: + logger.debug(f"Setting {package} logging to {log_level}") + logger.set_adapter_dependency_log_level(package, log_level) GET_COLUMNS_IN_RELATION_RAW_MACRO_NAME = "get_columns_in_relation_raw" LIST_SCHEMAS_MACRO_NAME = "list_schemas" diff --git a/dbt/include/spark/macros/adapters.sql b/dbt/include/spark/macros/adapters.sql index bfc1f198d..bf9f63cf9 100644 --- a/dbt/include/spark/macros/adapters.sql +++ b/dbt/include/spark/macros/adapters.sql @@ 
-294,7 +294,7 @@ {% macro spark__list_relations_without_caching(relation) %} {% call statement('list_relations_without_caching', fetch_result=True) -%} - show table extended in {{ relation }} like '*' + show table extended in {{ relation.schema }} like '*' {% endcall %} {% do return(load_result('list_relations_without_caching').table) %} @@ -305,7 +305,7 @@ {#-- V2 iceberg tables #} {#-- https://issues.apache.org/jira/browse/SPARK-33393 #} {% call statement('list_relations_without_caching_show_tables', fetch_result=True) -%} - show tables in {{ schema_relation }} like '*' + show tables in {{ schema_relation.schema }} like '*' {% endcall %} {% do return(load_result('list_relations_without_caching_show_tables').table) %} From cb5e05c783c4c1bad1d1d400bc97131e22f866e2 Mon Sep 17 00:00:00 2001 From: Colin Date: Thu, 11 Jan 2024 11:38:06 -0800 Subject: [PATCH 088/102] fix typing and exception imports --- dbt/adapters/spark/connections.py | 13 ++++++------- dbt/adapters/spark/impl.py | 20 +++++++++++++++++--- tests/unit/test_adapter.py | 24 ++++++++++++++++++------ 3 files changed, 41 insertions(+), 16 deletions(-) diff --git a/dbt/adapters/spark/connections.py b/dbt/adapters/spark/connections.py index fa6f48f52..c9c69294f 100644 --- a/dbt/adapters/spark/connections.py +++ b/dbt/adapters/spark/connections.py @@ -4,6 +4,7 @@ from dbt.adapters.base import Credentials from dbt.adapters.contracts.connection import AdapterResponse, ConnectionState, Connection from dbt.adapters.events.logging import AdapterLogger +from dbt.adapters.exceptions import FailedToConnectError from dbt.adapters.sql import SQLConnectionManager from dbt.common.exceptions import DbtConfigError @@ -292,11 +293,11 @@ def execute(self, sql: str, bindings: Optional[List[Any]] = None) -> None: if poll_state.errorMessage: logger.debug("Poll response: {}".format(poll_state)) logger.debug("Poll status: {}".format(state)) - raise dbt.exceptions.DbtDatabaseError(poll_state.errorMessage) + raise dbt.common.exceptions.DbtDatabaseError(poll_state.errorMessage) elif state not in STATE_SUCCESS: status_type = ThriftState._VALUES_TO_NAMES.get(state, "Unknown<{!r}>".format(state)) - raise dbt.exceptions.DbtDatabaseError( + raise dbt.common.exceptions.DbtDatabaseError( "Query failed with status: {}".format(status_type) ) @@ -526,9 +527,7 @@ def open(cls, connection: Connection) -> Connection: Connection(server_side_parameters=creds.server_side_parameters) ) else: - raise DbtConfigError( - f"invalid credential method: {creds.method}" - ) + raise DbtConfigError(f"invalid credential method: {creds.method}") break except Exception as e: exc = e @@ -538,7 +537,7 @@ def open(cls, connection: Connection) -> Connection: msg = "Failed to connect" if creds.token is not None: msg += ", is your token valid?" 
- raise dbt.exceptions.FailedToConnectError(msg) from e + raise FailedToConnectError(msg) from e retryable_message = _is_retryable_error(e) if retryable_message and creds.connect_retries > 0: msg = ( @@ -559,7 +558,7 @@ def open(cls, connection: Connection) -> Connection: logger.warning(msg) time.sleep(creds.connect_timeout) else: - raise dbt.exceptions.FailedToConnectError("failed to connect") from e + raise FailedToConnectError("failed to connect") from e else: raise exc # type: ignore diff --git a/dbt/adapters/spark/impl.py b/dbt/adapters/spark/impl.py index abc6a6ff6..e206dac92 100644 --- a/dbt/adapters/spark/impl.py +++ b/dbt/adapters/spark/impl.py @@ -2,7 +2,19 @@ import re from concurrent.futures import Future from dataclasses import dataclass -from typing import Any, Dict, Iterable, List, Optional, Union, Type, Tuple, Callable, Set +from typing import ( + Any, + Dict, + Iterable, + List, + Optional, + Union, + Type, + Tuple, + Callable, + Set, + FrozenSet, +) from dbt.adapters.base.relation import InformationSchema from dbt.adapters.contracts.connection import AdapterResponse @@ -358,7 +370,9 @@ def _get_columns_for_catalog(self, relation: BaseRelation) -> Iterable[Dict[str, yield as_dict def get_catalog( - self, relation_configs: Iterable[RelationConfig], selected_nodes: Optional[Set] = None + self, + relation_configs: Iterable[RelationConfig], + used_schemas: FrozenSet[Tuple[str, str]], ) -> Tuple[agate.Table, List[Exception]]: schema_map = self._get_catalog_schemas(relation_configs) if len(schema_map) > 1: @@ -387,7 +401,7 @@ def _get_one_catalog( self, information_schema: InformationSchema, schemas: Set[str], - relation_configs: Iterable[RelationConfig], + used_schemas: FrozenSet[Tuple[str, str]], ) -> agate.Table: if len(schemas) != 1: raise dbt.exceptions.CompilationError( diff --git a/tests/unit/test_adapter.py b/tests/unit/test_adapter.py index b46f7eef6..54e9f0158 100644 --- a/tests/unit/test_adapter.py +++ b/tests/unit/test_adapter.py @@ -330,7 +330,9 @@ def test_parse_relation(self): input_cols = [Row(keys=["col_name", "data_type"], values=r) for r in plain_rows] config = self._get_target_http(self.project_cfg) - rows = SparkAdapter(config, get_context("spawn")).parse_describe_extended(relation, input_cols) + rows = SparkAdapter(config, get_context("spawn")).parse_describe_extended( + relation, input_cols + ) self.assertEqual(len(rows), 4) self.assertEqual( rows[0].to_column_dict(omit_none=False), @@ -419,7 +421,9 @@ def test_parse_relation_with_integer_owner(self): input_cols = [Row(keys=["col_name", "data_type"], values=r) for r in plain_rows] config = self._get_target_http(self.project_cfg) - rows = SparkAdapter(config, get_context("spawn")).parse_describe_extended(relation, input_cols) + rows = SparkAdapter(config, get_context("spawn")).parse_describe_extended( + relation, input_cols + ) self.assertEqual(rows[0].to_column_dict().get("table_owner"), "1234") @@ -455,7 +459,9 @@ def test_parse_relation_with_statistics(self): input_cols = [Row(keys=["col_name", "data_type"], values=r) for r in plain_rows] config = self._get_target_http(self.project_cfg) - rows = SparkAdapter(config, get_context("spawn")).parse_describe_extended(relation, input_cols) + rows = SparkAdapter(config, get_context("spawn")).parse_describe_extended( + relation, input_cols + ) self.assertEqual(len(rows), 1) self.assertEqual( rows[0].to_column_dict(omit_none=False), @@ -565,7 +571,9 @@ def test_parse_columns_from_information_with_table_type_and_delta_provider(self) ) config = 
self._get_target_http(self.project_cfg) - columns = SparkAdapter(config, get_context("spawn")).parse_columns_from_information(relation) + columns = SparkAdapter(config, get_context("spawn")).parse_columns_from_information( + relation + ) self.assertEqual(len(columns), 4) self.assertEqual( columns[0].to_column_dict(omit_none=False), @@ -650,7 +658,9 @@ def test_parse_columns_from_information_with_view_type(self): ) config = self._get_target_http(self.project_cfg) - columns = SparkAdapter(config, get_context("spawn")).parse_columns_from_information(relation) + columns = SparkAdapter(config, get_context("spawn")).parse_columns_from_information( + relation + ) self.assertEqual(len(columns), 4) self.assertEqual( columns[1].to_column_dict(omit_none=False), @@ -716,7 +726,9 @@ def test_parse_columns_from_information_with_table_type_and_parquet_provider(sel ) config = self._get_target_http(self.project_cfg) - columns = SparkAdapter(config, get_context("spawn")).parse_columns_from_information(relation) + columns = SparkAdapter(config, get_context("spawn")).parse_columns_from_information( + relation + ) self.assertEqual(len(columns), 4) self.assertEqual( From fd7a22fef6c598e06d703af52c3fff7b3b2f60ea Mon Sep 17 00:00:00 2001 From: Colin Date: Thu, 11 Jan 2024 11:42:19 -0800 Subject: [PATCH 089/102] fix typing and exception imports --- dbt/adapters/spark/connections.py | 29 +++++++++---------- dbt/adapters/spark/impl.py | 21 ++++++-------- dbt/adapters/spark/python_submissions.py | 37 +++++++++--------------- 3 files changed, 35 insertions(+), 52 deletions(-) diff --git a/dbt/adapters/spark/connections.py b/dbt/adapters/spark/connections.py index c9c69294f..1f2bc944a 100644 --- a/dbt/adapters/spark/connections.py +++ b/dbt/adapters/spark/connections.py @@ -1,12 +1,11 @@ from contextlib import contextmanager -import dbt.exceptions from dbt.adapters.base import Credentials from dbt.adapters.contracts.connection import AdapterResponse, ConnectionState, Connection from dbt.adapters.events.logging import AdapterLogger from dbt.adapters.exceptions import FailedToConnectError from dbt.adapters.sql import SQLConnectionManager -from dbt.common.exceptions import DbtConfigError +from dbt.common.exceptions import DbtConfigError, DbtRuntimeError, DbtDatabaseError from dbt.common.utils.encoding import DECIMALS from dbt.adapters.spark import __version__ @@ -94,15 +93,15 @@ def cluster_id(self) -> Optional[str]: def __post_init__(self) -> None: if self.method is None: - raise dbt.exceptions.DbtRuntimeError("Must specify `method` in profile") + raise DbtRuntimeError("Must specify `method` in profile") if self.host is None: - raise dbt.exceptions.DbtRuntimeError("Must specify `host` in profile") + raise DbtRuntimeError("Must specify `host` in profile") if self.schema is None: - raise dbt.exceptions.DbtRuntimeError("Must specify `schema` in profile") + raise DbtRuntimeError("Must specify `schema` in profile") # spark classifies database and schema as the same thing if self.database is not None and self.database != self.schema: - raise dbt.exceptions.DbtRuntimeError( + raise DbtRuntimeError( f" schema: {self.schema} \n" f" database: {self.database} \n" f"On Spark, database must be omitted or have the same value as" @@ -114,7 +113,7 @@ def __post_init__(self) -> None: try: import pyodbc # noqa: F401 except ImportError as e: - raise dbt.exceptions.DbtRuntimeError( + raise DbtRuntimeError( f"{self.method} connection method requires " "additional dependencies. 
\n" "Install the additional required dependencies with " @@ -123,7 +122,7 @@ def __post_init__(self) -> None: ) from e if self.method == SparkConnectionMethod.ODBC and self.cluster and self.endpoint: - raise dbt.exceptions.DbtRuntimeError( + raise DbtRuntimeError( "`cluster` and `endpoint` cannot both be set when" f" using {self.method} method to connect to Spark" ) @@ -132,7 +131,7 @@ def __post_init__(self) -> None: self.method == SparkConnectionMethod.HTTP or self.method == SparkConnectionMethod.THRIFT ) and not (ThriftState and THttpClient and hive): - raise dbt.exceptions.DbtRuntimeError( + raise DbtRuntimeError( f"{self.method} connection method requires " "additional dependencies. \n" "Install the additional required dependencies with " @@ -143,7 +142,7 @@ def __post_init__(self) -> None: try: import pyspark # noqa: F401 except ImportError as e: - raise dbt.exceptions.DbtRuntimeError( + raise DbtRuntimeError( f"{self.method} connection method requires " "additional dependencies. \n" "Install the additional required dependencies with " @@ -293,13 +292,11 @@ def execute(self, sql: str, bindings: Optional[List[Any]] = None) -> None: if poll_state.errorMessage: logger.debug("Poll response: {}".format(poll_state)) logger.debug("Poll status: {}".format(state)) - raise dbt.common.exceptions.DbtDatabaseError(poll_state.errorMessage) + raise DbtDatabaseError(poll_state.errorMessage) elif state not in STATE_SUCCESS: status_type = ThriftState._VALUES_TO_NAMES.get(state, "Unknown<{!r}>".format(state)) - raise dbt.common.exceptions.DbtDatabaseError( - "Query failed with status: {}".format(status_type) - ) + raise DbtDatabaseError("Query failed with status: {}".format(status_type)) logger.debug("Poll status: {}, query complete".format(state)) @@ -360,9 +357,9 @@ def exception_handler(self, sql: str) -> Generator[None, None, None]: thrift_resp = exc.args[0] if hasattr(thrift_resp, "status"): msg = thrift_resp.status.errorMessage - raise dbt.exceptions.DbtRuntimeError(msg) + raise DbtRuntimeError(msg) else: - raise dbt.exceptions.DbtRuntimeError(str(exc)) + raise DbtRuntimeError(str(exc)) def cancel(self, connection: Connection) -> None: connection.handle.cancel() diff --git a/dbt/adapters/spark/impl.py b/dbt/adapters/spark/impl.py index e206dac92..7e6c70e04 100644 --- a/dbt/adapters/spark/impl.py +++ b/dbt/adapters/spark/impl.py @@ -19,16 +19,13 @@ from dbt.adapters.base.relation import InformationSchema from dbt.adapters.contracts.connection import AdapterResponse from dbt.adapters.events.logging import AdapterLogger +from dbt.common.exceptions import DbtRuntimeError, CompilationError from dbt.common.utils import AttrDict, executor from typing_extensions import TypeAlias import agate -import dbt -import dbt.exceptions - - from dbt.adapters.base import AdapterConfig, PythonJobHelper from dbt.adapters.base.impl import catch_as_completed, ConstraintSupport from dbt.adapters.sql import SQLAdapter @@ -162,7 +159,7 @@ def _get_relation_information(self, row: agate.Row) -> RelationInfo: try: _schema, name, _, information = row except ValueError: - raise dbt.exceptions.DbtRuntimeError( + raise DbtRuntimeError( f'Invalid value from "show tables extended ...", got {len(row)} values, expected 4' ) @@ -173,7 +170,7 @@ def _get_relation_information_using_describe(self, row: agate.Row) -> RelationIn try: _schema, name, _ = row except ValueError: - raise dbt.exceptions.DbtRuntimeError( + raise DbtRuntimeError( f'Invalid value from "show tables ...", got {len(row)} values, expected 3' ) @@ -182,7 +179,7 @@ def 
_get_relation_information_using_describe(self, row: agate.Row) -> RelationIn table_results = self.execute_macro( DESCRIBE_TABLE_EXTENDED_MACRO_NAME, kwargs={"table_name": table_name} ) - except dbt.exceptions.DbtRuntimeError as e: + except DbtRuntimeError as e: logger.debug(f"Error while retrieving information about {table_name}: {e.msg}") table_results = AttrDict() @@ -237,7 +234,7 @@ def list_relations_without_caching(self, schema_relation: BaseRelation) -> List[ row_list=show_table_extended_rows, relation_info_func=self._get_relation_information, ) - except dbt.exceptions.DbtRuntimeError as e: + except DbtRuntimeError as e: errmsg = getattr(e, "msg", "") if f"Database '{schema_relation}' not found" in errmsg: return [] @@ -254,7 +251,7 @@ def list_relations_without_caching(self, schema_relation: BaseRelation) -> List[ row_list=show_table_rows, relation_info_func=self._get_relation_information_using_describe, ) - except dbt.exceptions.DbtRuntimeError as e: + except DbtRuntimeError as e: description = "Error while retrieving information about" logger.debug(f"{description} {schema_relation}: {e.msg}") return [] @@ -316,7 +313,7 @@ def get_columns_in_relation(self, relation: BaseRelation) -> List[SparkColumn]: GET_COLUMNS_IN_RELATION_RAW_MACRO_NAME, kwargs={"relation": relation} ) columns = self.parse_describe_extended(relation, rows) - except dbt.exceptions.DbtRuntimeError as e: + except DbtRuntimeError as e: # spark would throw error when table doesn't exist, where other # CDW would just return and empty list, normalizing the behavior here errmsg = getattr(e, "msg", "") @@ -376,7 +373,7 @@ def get_catalog( ) -> Tuple[agate.Table, List[Exception]]: schema_map = self._get_catalog_schemas(relation_configs) if len(schema_map) > 1: - raise dbt.exceptions.CompilationError( + raise CompilationError( f"Expected only one database in get_catalog, found " f"{list(schema_map)}" ) @@ -404,7 +401,7 @@ def _get_one_catalog( used_schemas: FrozenSet[Tuple[str, str]], ) -> agate.Table: if len(schemas) != 1: - raise dbt.exceptions.CompilationError( + raise CompilationError( f"Expected only one schema in spark _get_one_catalog, found " f"{schemas}" ) diff --git a/dbt/adapters/spark/python_submissions.py b/dbt/adapters/spark/python_submissions.py index 89831ca7f..0443b1d00 100644 --- a/dbt/adapters/spark/python_submissions.py +++ b/dbt/adapters/spark/python_submissions.py @@ -4,8 +4,9 @@ from typing import Any, Dict, Callable, Iterable import uuid -import dbt.exceptions from dbt.adapters.base import PythonJobHelper +from dbt.common.exceptions import DbtRuntimeError + from dbt.adapters.spark import SparkCredentials from dbt.adapters.spark import __version__ @@ -53,7 +54,7 @@ def _create_work_dir(self, path: str) -> None: }, ) if response.status_code != 200: - raise dbt.exceptions.DbtRuntimeError( + raise DbtRuntimeError( f"Error creating work_dir for python notebooks\n {response.content!r}" ) @@ -71,9 +72,7 @@ def _upload_notebook(self, path: str, compiled_code: str) -> None: }, ) if response.status_code != 200: - raise dbt.exceptions.DbtRuntimeError( - f"Error creating python notebook.\n {response.content!r}" - ) + raise DbtRuntimeError(f"Error creating python notebook.\n {response.content!r}") def _submit_job(self, path: str, cluster_spec: dict) -> str: job_spec = { @@ -99,9 +98,7 @@ def _submit_job(self, path: str, cluster_spec: dict) -> str: json=job_spec, ) if submit_response.status_code != 200: - raise dbt.exceptions.DbtRuntimeError( - f"Error creating python run.\n {submit_response.content!r}" - ) + 
raise DbtRuntimeError(f"Error creating python run.\n {submit_response.content!r}") return submit_response.json()["run_id"] def _submit_through_notebook(self, compiled_code: str, cluster_spec: dict) -> None: @@ -135,7 +132,7 @@ def _submit_through_notebook(self, compiled_code: str, cluster_spec: dict) -> No json_run_output = run_output.json() result_state = json_run_output["metadata"]["state"]["result_state"] if result_state != "SUCCESS": - raise dbt.exceptions.DbtRuntimeError( + raise DbtRuntimeError( "Python model failed with traceback as:\n" "(Note that the line number here does not " "match the line number in your code due to dbt templating)\n" @@ -169,9 +166,9 @@ def polling( response = status_func(**status_func_kwargs) state = get_state_func(response) if exceeded_timeout: - raise dbt.exceptions.DbtRuntimeError("python model run timed out") + raise DbtRuntimeError("python model run timed out") if state != expected_end_state: - raise dbt.exceptions.DbtRuntimeError( + raise DbtRuntimeError( "python model run ended in state" f"{state} with state_message\n{get_state_msg_func(response)}" ) @@ -205,9 +202,7 @@ def create(self) -> str: }, ) if response.status_code != 200: - raise dbt.exceptions.DbtRuntimeError( - f"Error creating an execution context.\n {response.content!r}" - ) + raise DbtRuntimeError(f"Error creating an execution context.\n {response.content!r}") return response.json()["id"] def destroy(self, context_id: str) -> str: @@ -221,9 +216,7 @@ def destroy(self, context_id: str) -> str: }, ) if response.status_code != 200: - raise dbt.exceptions.DbtRuntimeError( - f"Error deleting an execution context.\n {response.content!r}" - ) + raise DbtRuntimeError(f"Error deleting an execution context.\n {response.content!r}") return response.json()["id"] @@ -246,9 +239,7 @@ def execute(self, context_id: str, command: str) -> str: }, ) if response.status_code != 200: - raise dbt.exceptions.DbtRuntimeError( - f"Error creating a command.\n {response.content!r}" - ) + raise DbtRuntimeError(f"Error creating a command.\n {response.content!r}") return response.json()["id"] def status(self, context_id: str, command_id: str) -> Dict[str, Any]: @@ -263,9 +254,7 @@ def status(self, context_id: str, command_id: str) -> Dict[str, Any]: }, ) if response.status_code != 200: - raise dbt.exceptions.DbtRuntimeError( - f"Error getting status of command.\n {response.content!r}" - ) + raise DbtRuntimeError(f"Error getting status of command.\n {response.content!r}") return response.json() @@ -298,7 +287,7 @@ def submit(self, compiled_code: str) -> None: get_state_msg_func=lambda response: response.json()["results"]["data"], ) if response["results"]["resultType"] == "error": - raise dbt.exceptions.DbtRuntimeError( + raise DbtRuntimeError( f"Python model failed with traceback as:\n" f"{response['results']['cause']}" ) From 77df8b743a8b0078de6bd8ec8f43b6ad8283e309 Mon Sep 17 00:00:00 2001 From: Colin Date: Thu, 11 Jan 2024 11:48:16 -0800 Subject: [PATCH 090/102] add changie --- .changes/unreleased/Under the Hood-20240111-114806.yaml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 .changes/unreleased/Under the Hood-20240111-114806.yaml diff --git a/.changes/unreleased/Under the Hood-20240111-114806.yaml b/.changes/unreleased/Under the Hood-20240111-114806.yaml new file mode 100644 index 000000000..31705f468 --- /dev/null +++ b/.changes/unreleased/Under the Hood-20240111-114806.yaml @@ -0,0 +1,6 @@ +kind: Under the Hood +body: Update import paths and list_relations to support decoupling adapters/core 
+time: 2024-01-11T11:48:06.120111-08:00 +custom: + Author: colin-rogers-dbt + Issue: "972" From dfd58858d58ad0e14c4c912a10b748b11554e712 Mon Sep 17 00:00:00 2001 From: Colin Date: Fri, 12 Jan 2024 14:42:55 -0800 Subject: [PATCH 091/102] replace dbt.common with dbt_common --- dbt/adapters/spark/column.py | 2 +- dbt/adapters/spark/connections.py | 6 +++--- dbt/adapters/spark/impl.py | 8 ++++---- dbt/adapters/spark/python_submissions.py | 2 +- dbt/adapters/spark/relation.py | 2 +- dbt/adapters/spark/session.py | 4 ++-- dev-requirements.txt | 4 ++-- tests/unit/utils.py | 2 +- 8 files changed, 15 insertions(+), 15 deletions(-) diff --git a/dbt/adapters/spark/column.py b/dbt/adapters/spark/column.py index dbc872051..39f6f529e 100644 --- a/dbt/adapters/spark/column.py +++ b/dbt/adapters/spark/column.py @@ -2,7 +2,7 @@ from typing import Any, Dict, Optional, TypeVar, Union from dbt.adapters.base.column import Column -from dbt.common.dataclass_schema import dbtClassMixin +from dbt_common.dataclass_schema import dbtClassMixin Self = TypeVar("Self", bound="SparkColumn") diff --git a/dbt/adapters/spark/connections.py b/dbt/adapters/spark/connections.py index 1f2bc944a..6e9e631b7 100644 --- a/dbt/adapters/spark/connections.py +++ b/dbt/adapters/spark/connections.py @@ -5,9 +5,9 @@ from dbt.adapters.events.logging import AdapterLogger from dbt.adapters.exceptions import FailedToConnectError from dbt.adapters.sql import SQLConnectionManager -from dbt.common.exceptions import DbtConfigError, DbtRuntimeError, DbtDatabaseError +from dbt_common.exceptions import DbtConfigError, DbtRuntimeError, DbtDatabaseError -from dbt.common.utils.encoding import DECIMALS +from dbt_common.utils.encoding import DECIMALS from dbt.adapters.spark import __version__ try: @@ -24,7 +24,7 @@ pyodbc = None from datetime import datetime import sqlparams -from dbt.common.dataclass_schema import StrEnum +from dbt_common.dataclass_schema import StrEnum from dataclasses import dataclass, field from typing import Any, Dict, Optional, Union, Tuple, List, Generator, Iterable, Sequence diff --git a/dbt/adapters/spark/impl.py b/dbt/adapters/spark/impl.py index 7e6c70e04..9a1a7ec06 100644 --- a/dbt/adapters/spark/impl.py +++ b/dbt/adapters/spark/impl.py @@ -19,8 +19,8 @@ from dbt.adapters.base.relation import InformationSchema from dbt.adapters.contracts.connection import AdapterResponse from dbt.adapters.events.logging import AdapterLogger -from dbt.common.exceptions import DbtRuntimeError, CompilationError -from dbt.common.utils import AttrDict, executor +from dbt_common.exceptions import DbtRuntimeError, CompilationError +from dbt_common.utils import AttrDict, executor from typing_extensions import TypeAlias @@ -38,8 +38,8 @@ ) from dbt.adapters.base import BaseRelation from dbt.adapters.contracts.relation import RelationType, RelationConfig -from dbt.common.clients.agate_helper import DEFAULT_TYPE_TESTER -from dbt.common.contracts.constraints import ConstraintType +from dbt_common.clients.agate_helper import DEFAULT_TYPE_TESTER +from dbt_common.contracts.constraints import ConstraintType logger = AdapterLogger("Spark") packages = ["pyhive.hive", "thrift.transport", "thrift.protocol"] diff --git a/dbt/adapters/spark/python_submissions.py b/dbt/adapters/spark/python_submissions.py index 0443b1d00..e3e7cb370 100644 --- a/dbt/adapters/spark/python_submissions.py +++ b/dbt/adapters/spark/python_submissions.py @@ -5,7 +5,7 @@ import uuid from dbt.adapters.base import PythonJobHelper -from dbt.common.exceptions import DbtRuntimeError +from 
dbt_common.exceptions import DbtRuntimeError from dbt.adapters.spark import SparkCredentials from dbt.adapters.spark import __version__ diff --git a/dbt/adapters/spark/relation.py b/dbt/adapters/spark/relation.py index a6d679d56..860935cbd 100644 --- a/dbt/adapters/spark/relation.py +++ b/dbt/adapters/spark/relation.py @@ -4,7 +4,7 @@ from dbt.adapters.base.relation import BaseRelation, Policy from dbt.adapters.events.logging import AdapterLogger -from dbt.common.exceptions import DbtRuntimeError +from dbt_common.exceptions import DbtRuntimeError logger = AdapterLogger("Spark") diff --git a/dbt/adapters/spark/session.py b/dbt/adapters/spark/session.py index d5d3ff050..7a6982e50 100644 --- a/dbt/adapters/spark/session.py +++ b/dbt/adapters/spark/session.py @@ -8,8 +8,8 @@ from dbt.adapters.spark.connections import SparkConnectionWrapper from dbt.adapters.events.logging import AdapterLogger -from dbt.common.utils.encoding import DECIMALS -from dbt.common.exceptions import DbtRuntimeError +from dbt_common.utils.encoding import DECIMALS +from dbt_common.exceptions import DbtRuntimeError from pyspark.sql import DataFrame, Row, SparkSession from pyspark.sql.utils import AnalysisException diff --git a/dev-requirements.txt b/dev-requirements.txt index e56b221c7..3dd8eb727 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,7 +1,7 @@ # install latest changes in dbt-core # TODO: how to automate switching from develop to version branches? -git+https://github.com/dbt-labs/dbt-core.git#egg=dbt-core&subdirectory=core -git+https://github.com/dbt-labs/dbt-core.git#egg=dbt-tests-adapter&subdirectory=tests/adapter +git+https://github.com/dbt-labs/dbt-core.git@er/remove-common-req-keep-legacy-logger#egg=dbt-core&subdirectory=core +git+https://github.com/dbt-labs/dbt-core.git@er/remove-common-req-keep-legacy-logger#egg=dbt-tests-adapter&subdirectory=tests/adapter # if version 1.x or greater -> pin to major version # if version 0.x -> pin to minor diff --git a/tests/unit/utils.py b/tests/unit/utils.py index a32d6608d..17cd3ee78 100644 --- a/tests/unit/utils.py +++ b/tests/unit/utils.py @@ -9,7 +9,7 @@ import agate import pytest -from dbt.common.dataclass_schema import ValidationError +from dbt_common.dataclass_schema import ValidationError from dbt.config.project import PartialProject From 3fc6d07fa731b68bbcd904839b7ac11e5941ea9b Mon Sep 17 00:00:00 2001 From: Colin Date: Fri, 12 Jan 2024 14:51:57 -0800 Subject: [PATCH 092/102] update setup.py --- setup.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 301b4a41f..8e839c842 100644 --- a/setup.py +++ b/setup.py @@ -20,7 +20,6 @@ print('Please upgrade setuptools with "pip install --upgrade setuptools" ' "and try again") sys.exit(1) - # pull long description from README this_directory = os.path.abspath(os.path.dirname(__file__)) with open(os.path.join(this_directory, "README.md"), "r", encoding="utf8") as f: @@ -73,8 +72,8 @@ def _get_dbt_core_version(): packages=find_namespace_packages(include=["dbt", "dbt.*"]), include_package_data=True, install_requires=[ - "dbt-core~={}".format(dbt_core_version), "sqlparams>=3.0.0", + "dbt-common @ git+https://github.com/dbt-labs/dbt-common.git#egg=dbt", ], extras_require={ "ODBC": odbc_extras, From 17607c11482bdbee0ead33791eb59e43b5c99efe Mon Sep 17 00:00:00 2001 From: Colin Date: Tue, 16 Jan 2024 15:46:01 -0800 Subject: [PATCH 093/102] add dbt-adapters --- dagger/run_dbt_spark_tests.py | 3 ++- dev-requirements.txt | 4 ++-- 
setup.py | 1 + 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/dagger/run_dbt_spark_tests.py b/dagger/run_dbt_spark_tests.py index 7adb352a2..512bb755b 100644 --- a/dagger/run_dbt_spark_tests.py +++ b/dagger/run_dbt_spark_tests.py @@ -86,7 +86,7 @@ async def get_spark_container(client: dagger.Client) -> (dagger.Container, str): async def test_spark(test_args): async with dagger.Connection(dagger.Config(log_output=sys.stderr)) as client: test_profile = test_args.profile - req_files = client.host().directory("./", include=["*.txt", "*.env", "*.ini"]) + req_files = client.host().directory("./", include=["*.txt", "*.env", "*.ini", "*.md", "setup.py"]) dbt_spark_dir = client.host().directory("./dbt") test_dir = client.host().directory("./tests") scripts = client.host().directory("./dagger/scripts") @@ -99,6 +99,7 @@ async def test_spark(test_args): .with_directory("/tests", test_dir) .with_directory("/scripts", scripts) .with_exec("./scripts/install_os_reqs.sh") + .with_exec(["pip", "install", "-e", "."]) .with_exec(["pip", "install", "-r", "requirements.txt"]) .with_exec(["pip", "install", "-r", "dev-requirements.txt"]) ) diff --git a/dev-requirements.txt b/dev-requirements.txt index 3dd8eb727..e56b221c7 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,7 +1,7 @@ # install latest changes in dbt-core # TODO: how to automate switching from develop to version branches? -git+https://github.com/dbt-labs/dbt-core.git@er/remove-common-req-keep-legacy-logger#egg=dbt-core&subdirectory=core -git+https://github.com/dbt-labs/dbt-core.git@er/remove-common-req-keep-legacy-logger#egg=dbt-tests-adapter&subdirectory=tests/adapter +git+https://github.com/dbt-labs/dbt-core.git#egg=dbt-core&subdirectory=core +git+https://github.com/dbt-labs/dbt-core.git#egg=dbt-tests-adapter&subdirectory=tests/adapter # if version 1.x or greater -> pin to major version # if version 0.x -> pin to minor diff --git a/setup.py b/setup.py index 8e839c842..6d2a14686 100644 --- a/setup.py +++ b/setup.py @@ -74,6 +74,7 @@ def _get_dbt_core_version(): install_requires=[ "sqlparams>=3.0.0", "dbt-common @ git+https://github.com/dbt-labs/dbt-common.git#egg=dbt", + "dbt-adapters @ git+https://github.com/dbt-labs/dbt-adapters.git#egg=dbt", ], extras_require={ "ODBC": odbc_extras, From 79d74aa422a9178f8432346c8c982ad9af3f3844 Mon Sep 17 00:00:00 2001 From: Colin Date: Mon, 22 Jan 2024 11:43:32 -0800 Subject: [PATCH 094/102] update setup.py --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 6d2a14686..d45b787f2 100644 --- a/setup.py +++ b/setup.py @@ -73,8 +73,8 @@ def _get_dbt_core_version(): include_package_data=True, install_requires=[ "sqlparams>=3.0.0", - "dbt-common @ git+https://github.com/dbt-labs/dbt-common.git#egg=dbt", - "dbt-adapters @ git+https://github.com/dbt-labs/dbt-adapters.git#egg=dbt", + "dbt-common<1.0", + "dbt-adapters~=0.1.0a1", ], extras_require={ "ODBC": odbc_extras, From 011c9b59b5089dbadd516e7feea43dcc5bc970c2 Mon Sep 17 00:00:00 2001 From: Colin Date: Mon, 22 Jan 2024 11:57:37 -0800 Subject: [PATCH 095/102] fix credentials import --- dbt/adapters/spark/connections.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/dbt/adapters/spark/connections.py b/dbt/adapters/spark/connections.py index 6e9e631b7..83048f921 100644 --- a/dbt/adapters/spark/connections.py +++ b/dbt/adapters/spark/connections.py @@ -1,7 +1,11 @@ from contextlib import 
contextmanager -from dbt.adapters.base import Credentials -from dbt.adapters.contracts.connection import AdapterResponse, ConnectionState, Connection +from dbt.adapters.contracts.connection import ( + AdapterResponse, + ConnectionState, + Connection, + Credentials, +) from dbt.adapters.events.logging import AdapterLogger from dbt.adapters.exceptions import FailedToConnectError from dbt.adapters.sql import SQLConnectionManager From a40b07c241010d1a899a8f17e8b99f3421e565da Mon Sep 17 00:00:00 2001 From: Colin Date: Mon, 22 Jan 2024 13:55:37 -0800 Subject: [PATCH 096/102] fix dev-requirements.txt --- dagger/run_dbt_spark_tests.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dagger/run_dbt_spark_tests.py b/dagger/run_dbt_spark_tests.py index 512bb755b..80e4e5fa9 100644 --- a/dagger/run_dbt_spark_tests.py +++ b/dagger/run_dbt_spark_tests.py @@ -86,7 +86,9 @@ async def get_spark_container(client: dagger.Client) -> (dagger.Container, str): async def test_spark(test_args): async with dagger.Connection(dagger.Config(log_output=sys.stderr)) as client: test_profile = test_args.profile - req_files = client.host().directory("./", include=["*.txt", "*.env", "*.ini", "*.md", "setup.py"]) + req_files = client.host().directory( + "./", include=["*.txt", "*.env", "*.ini", "*.md", "setup.py"] + ) dbt_spark_dir = client.host().directory("./dbt") test_dir = client.host().directory("./tests") scripts = client.host().directory("./dagger/scripts") From 8aac398821d7aa9d20970bfe96a79c2c3d7029d2 Mon Sep 17 00:00:00 2001 From: Colin Date: Wed, 24 Jan 2024 15:16:10 -0800 Subject: [PATCH 097/102] dagger improvements to caching and installing package under test --- dagger/run_dbt_spark_tests.py | 40 ++++++++++++++++++++++++----------- setup.py | 1 + tests/conftest.py | 8 +++---- 3 files changed, 33 insertions(+), 16 deletions(-) diff --git a/dagger/run_dbt_spark_tests.py b/dagger/run_dbt_spark_tests.py index 80e4e5fa9..61eaed30c 100644 --- a/dagger/run_dbt_spark_tests.py +++ b/dagger/run_dbt_spark_tests.py @@ -29,18 +29,19 @@ def env_variables_inner(ctr: dagger.Container): return env_variables_inner -async def get_postgres_container(client: dagger.Client) -> (dagger.Container, str): - ctr = await ( +def get_postgres_container(client: dagger.Client) -> (dagger.Container, str): + ctr = ( client.container() .from_("postgres:13") .with_env_variable("POSTGRES_PASSWORD", "postgres") .with_exposed_port(PG_PORT) + .as_service() ) return ctr, "postgres_db" -async def get_spark_container(client: dagger.Client) -> (dagger.Container, str): +def get_spark_container(client: dagger.Client) -> (dagger.Service, str): spark_dir = client.host().directory("./dagger/spark-container") spark_ctr_base = ( client.container() @@ -64,7 +65,7 @@ async def get_spark_container(client: dagger.Client) -> (dagger.Container, str): ) # postgres is the metastore here - pg_ctr, pg_host = await get_postgres_container(client) + pg_ctr, pg_host = get_postgres_container(client) spark_ctr = ( spark_ctr_base.with_service_binding(alias=pg_host, service=pg_ctr) @@ -78,6 +79,7 @@ async def get_spark_container(client: dagger.Client) -> (dagger.Container, str): ] ) .with_exposed_port(10000) + .as_service() ) return spark_ctr, "spark_db" @@ -86,32 +88,46 @@ async def get_spark_container(client: dagger.Client) -> (dagger.Container, str): async def test_spark(test_args): async with dagger.Connection(dagger.Config(log_output=sys.stderr)) as client: test_profile = test_args.profile + + # create cache volumes, these are persisted between runs saving 
time when developing locally + os_reqs_cache = client.cache_volume("os_reqs") + pip_cache = client.cache_volume("pip") + + # setup directories as we don't want to copy the whole repo into the container req_files = client.host().directory( "./", include=["*.txt", "*.env", "*.ini", "*.md", "setup.py"] ) dbt_spark_dir = client.host().directory("./dbt") test_dir = client.host().directory("./tests") scripts = client.host().directory("./dagger/scripts") + platform = dagger.Platform("linux/amd64") tst_container = ( client.container(platform=platform) .from_("python:3.8-slim") - .with_directory("/.", req_files) - .with_directory("/dbt", dbt_spark_dir) - .with_directory("/tests", test_dir) - .with_directory("/scripts", scripts) - .with_exec("./scripts/install_os_reqs.sh") - .with_exec(["pip", "install", "-e", "."]) + .with_directory("/src", req_files) + .with_directory("/src/dbt", dbt_spark_dir) + .with_directory("/src/tests", test_dir) + .with_directory("/src/scripts", scripts) + .with_workdir("/src") + .with_mounted_cache("/var/cache/apt/archives", os_reqs_cache) + .with_exec(["./scripts/install_os_reqs.sh"]) + ) + + tst_container = ( + tst_container.with_mounted_cache("/root/.cache/pip", pip_cache) + .with_exec(["pip", "install", "-U", "pip"]) .with_exec(["pip", "install", "-r", "requirements.txt"]) .with_exec(["pip", "install", "-r", "dev-requirements.txt"]) + .with_exec(["pip", "install", "-e", "."]) ) if test_profile == "apache_spark": - spark_ctr, spark_host = await get_spark_container(client) + spark_ctr, spark_host = get_spark_container(client) tst_container = tst_container.with_service_binding(alias=spark_host, service=spark_ctr) elif test_profile in ["databricks_cluster", "databricks_sql_endpoint"]: - tst_container = tst_container.with_exec("./scripts/configure_odbc.sh") + tst_container = tst_container.with_exec(["./scripts/configure_odbc.sh"]) elif test_profile == "spark_session": tst_container = tst_container.with_exec(["pip", "install", "pyspark"]) diff --git a/setup.py b/setup.py index d45b787f2..c1b439190 100644 --- a/setup.py +++ b/setup.py @@ -73,6 +73,7 @@ def _get_dbt_core_version(): include_package_data=True, install_requires=[ "sqlparams>=3.0.0", + "dbt-core~={}".format(dbt_core_version), "dbt-common<1.0", "dbt-adapters~=0.1.0a1", ], diff --git a/tests/conftest.py b/tests/conftest.py index 700ade4d3..efd309c6a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -42,9 +42,9 @@ def apache_spark_target(): "user": "dbt", "method": "thrift", "port": 10000, - "connect_retries": 3, - "connect_timeout": 5, - "retry_all": True, + "connect_retries": 2, + "connect_timeout": 3, + "retry_all": False, } @@ -59,7 +59,7 @@ def databricks_cluster_target(): "port": 443, "connect_retries": 3, "connect_timeout": 5, - "retry_all": True, + "retry_all": False, "user": os.getenv("DBT_DATABRICKS_USER"), } From 6edcdcfdbf097a8f8315d5f8eec2a49e99b5e6d7 Mon Sep 17 00:00:00 2001 From: Colin Date: Wed, 24 Jan 2024 15:16:31 -0800 Subject: [PATCH 098/102] update requirements --- dagger/requirements.txt | 2 +- dev-requirements.txt | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/dagger/requirements.txt b/dagger/requirements.txt index df36543c2..b50c448d3 100644 --- a/dagger/requirements.txt +++ b/dagger/requirements.txt @@ -1,2 +1,2 @@ -dagger-io~=0.8.0 +dagger-io~=0.9.7 python-dotenv diff --git a/dev-requirements.txt b/dev-requirements.txt index e56b221c7..28a626fc3 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,7 +1,6 @@ # install latest changes in dbt-core # 
TODO: how to automate switching from develop to version branches? -git+https://github.com/dbt-labs/dbt-core.git#egg=dbt-core&subdirectory=core -git+https://github.com/dbt-labs/dbt-core.git#egg=dbt-tests-adapter&subdirectory=tests/adapter +git+https://github.com/dbt-labs/dbt-adapters.git#subdirectory=dbt-tests-adapter # if version 1.x or greater -> pin to major version # if version 0.x -> pin to minor From eeba17f132b0ff24502436a72284d7a2d3cc036f Mon Sep 17 00:00:00 2001 From: Colin Date: Wed, 24 Jan 2024 15:17:13 -0800 Subject: [PATCH 099/102] add cluster start fixture --- tests/functional/conftest.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 tests/functional/conftest.py diff --git a/tests/functional/conftest.py b/tests/functional/conftest.py new file mode 100644 index 000000000..07419aa40 --- /dev/null +++ b/tests/functional/conftest.py @@ -0,0 +1,18 @@ +from multiprocessing import Lock + +import pytest + +_db_start_lock = Lock() +_DB_CLUSTER_STARTED = False + + +@pytest.fixture(scope="class", autouse=True) +def start_databricks_cluster(project, request): + global _DB_CLUSTER_STARTED + profile_type = request.config.getoption("--profile") + with _db_start_lock: + if "databricks" in profile_type and not _DB_CLUSTER_STARTED: + print("Starting Databricks cluster") + project.run_sql("SELECT 1") + + _DB_CLUSTER_STARTED = True From f3a4c2d5e9ea921656306933a64018db6fbfbe3e Mon Sep 17 00:00:00 2001 From: Colin Date: Wed, 24 Jan 2024 16:09:00 -0800 Subject: [PATCH 100/102] update conftest.py --- tests/conftest.py | 8 +++----- tests/functional/conftest.py | 1 + 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index efd309c6a..efba41a5f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -87,11 +87,9 @@ def databricks_http_cluster_target(): "token": os.getenv("DBT_DATABRICKS_TOKEN"), "method": "http", "port": 443, - # more retries + longer timout to handle unavailability while cluster is restarting - # return failures quickly in dev, retry all failures in CI (up to 5 min) - "connect_retries": 5, - "connect_timeout": 60, - "retry_all": bool(os.getenv("DBT_DATABRICKS_RETRY_ALL", False)), + "connect_retries": 3, + "connect_timeout": 5, + "retry_all": False, "user": os.getenv("DBT_DATABRICKS_USER"), } diff --git a/tests/functional/conftest.py b/tests/functional/conftest.py index 07419aa40..c1a0397bd 100644 --- a/tests/functional/conftest.py +++ b/tests/functional/conftest.py @@ -6,6 +6,7 @@ _DB_CLUSTER_STARTED = False +# Running this should prevent tests from needing to be retried because the Databricks cluster isn't available @pytest.fixture(scope="class", autouse=True) def start_databricks_cluster(project, request): global _DB_CLUSTER_STARTED From 32c05bbd08e90f1245d9c77d00e063498484cbd7 Mon Sep 17 00:00:00 2001 From: Colin Date: Wed, 24 Jan 2024 16:43:17 -0800 Subject: [PATCH 101/102] re-order dagger setup to reduce cache invalidation --- dagger/run_dbt_spark_tests.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/dagger/run_dbt_spark_tests.py b/dagger/run_dbt_spark_tests.py index 61eaed30c..436cb1e92 100644 --- a/dagger/run_dbt_spark_tests.py +++ b/dagger/run_dbt_spark_tests.py @@ -105,17 +105,16 @@ async def test_spark(test_args): tst_container = ( client.container(platform=platform) .from_("python:3.8-slim") - .with_directory("/src", req_files) - .with_directory("/src/dbt", dbt_spark_dir) - .with_directory("/src/tests", test_dir) - 
.with_directory("/src/scripts", scripts) - .with_workdir("/src") .with_mounted_cache("/var/cache/apt/archives", os_reqs_cache) + .with_mounted_cache("/root/.cache/pip", pip_cache) + # install OS deps first so any local changes don't invalidate the cache + .with_directory("/scripts", scripts) .with_exec(["./scripts/install_os_reqs.sh"]) - ) - - tst_container = ( - tst_container.with_mounted_cache("/root/.cache/pip", pip_cache) + # install dbt-spark + python deps + .with_directory("/src", req_files) + .with_directory("src/dbt", dbt_spark_dir) + .with_directory("src/tests", test_dir) + .with_workdir("/src") .with_exec(["pip", "install", "-U", "pip"]) .with_exec(["pip", "install", "-r", "requirements.txt"]) .with_exec(["pip", "install", "-r", "dev-requirements.txt"]) @@ -127,7 +126,11 @@ async def test_spark(test_args): tst_container = tst_container.with_service_binding(alias=spark_host, service=spark_ctr) elif test_profile in ["databricks_cluster", "databricks_sql_endpoint"]: - tst_container = tst_container.with_exec(["./scripts/configure_odbc.sh"]) + tst_container = ( + tst_container.with_workdir("/") + .with_exec(["./scripts/configure_odbc.sh"]) + .with_workdir("/src") + ) elif test_profile == "spark_session": tst_container = tst_container.with_exec(["pip", "install", "pyspark"]) From e8e4543a0e2c74a43179bcb916c719591f372968 Mon Sep 17 00:00:00 2001 From: Colin Date: Wed, 24 Jan 2024 16:43:50 -0800 Subject: [PATCH 102/102] renove dbt-core version dependency version check --- setup.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/setup.py b/setup.py index c1b439190..2d6e00e53 100644 --- a/setup.py +++ b/setup.py @@ -39,17 +39,8 @@ def _get_plugin_version_dict(): return match.groupdict() -# require a compatible minor version (~=), prerelease if this is a prerelease -def _get_dbt_core_version(): - parts = _get_plugin_version_dict() - minor = "{major}.{minor}.0".format(**parts) - pre = parts["prekind"] + "1" if parts["prekind"] else "" - return f"{minor}{pre}" - - package_name = "dbt-spark" package_version = "1.8.0a1" -dbt_core_version = _get_dbt_core_version() description = """The Apache Spark adapter plugin for dbt""" odbc_extras = ["pyodbc~=4.0.39"] @@ -73,7 +64,6 @@ def _get_dbt_core_version(): include_package_data=True, install_requires=[ "sqlparams>=3.0.0", - "dbt-core~={}".format(dbt_core_version), "dbt-common<1.0", "dbt-adapters~=0.1.0a1", ],