From 1cf3c96ac54f4f6c368c32a8c4c8037287d984bf Mon Sep 17 00:00:00 2001
From: Mat Moore
Date: Tue, 28 May 2024 12:26:37 +0100
Subject: [PATCH] WIP - workflow to run a CaDeT ingestion

---
 .github/workflows/ingest-cadet.yml | 46 ++++++++++++++++++++++++++++++
 ingestion/cadet.yaml               | 38 ++++++++++++++++++++++++
 2 files changed, 84 insertions(+)
 create mode 100644 .github/workflows/ingest-cadet.yml
 create mode 100644 ingestion/cadet.yaml

diff --git a/.github/workflows/ingest-cadet.yml b/.github/workflows/ingest-cadet.yml
new file mode 100644
index 00000000..bb923c1d
--- /dev/null
+++ b/.github/workflows/ingest-cadet.yml
@@ -0,0 +1,46 @@
+# Reusable workflow: ingests dbt metadata from Create a Derived Table (CaDeT)
+# into DataHub by running the DataHub CLI against ingestion/cadet.yaml.
+name: "Ingest DBT metadata from Create a Derived Table"
+
+# id-token: write allows requesting the OIDC token used to assume the AWS role;
+# contents: read allows actions/checkout to fetch the repository.
+permissions:
+  id-token: write
+  contents: read
+
+on:
+  workflow_call:
+    inputs:
+      # NOTE(review): declared as required but not referenced by any step yet.
+      env:
+        description: "which environment to deploy to"
+        required: true
+        type: string
+      ecr_region:
+        description: "ecr region to connect to"
+        required: false
+        type: string
+        default: eu-west-1
+
+jobs:
+  main:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.11.1"
+      - uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: ${{ secrets.CADET_METADATA_ROLE_TO_ASSUME }}
+          aws-region: ${{ inputs.ecr_region }}
+      - name: install reqs
+        # The dbt source is an optional plugin, so install it via the extra.
+        run: pip install 'acryl-datahub[dbt]'
+      - name: push metadata to datahub
+        # The DataHub CLI reads the server URL and token from these variables,
+        # so the interactive `datahub init` prompt is not needed in CI.
+        env:
+          DATAHUB_GMS_TOKEN: ${{ secrets.CATALOGUE_TOKEN }}
+          DATAHUB_GMS_URL: ${{ vars.CATALOGUE_URL }}
+        run: datahub ingest -c ingestion/cadet.yaml
diff --git a/ingestion/cadet.yaml b/ingestion/cadet.yaml
new file mode 100644
index 00000000..a2e75921
--- /dev/null
+++ b/ingestion/cadet.yaml
@@ -0,0 +1,38 @@
+# DataHub ingestion recipe: loads dbt artefacts for Create a Derived Table
+# (CaDeT) from S3 and assigns matching datasets to a catalogue domain.
+source:
+  type: dbt
+  config:
+    manifest_path: "s3://mojap-derived-tables/prod/run_artefacts/latest/target/manifest.json"
+    catalog_path: "s3://mojap-derived-tables/prod/run_artefacts/latest/target/catalog.json"
+    test_results_path: "s3://mojap-derived-tables/prod/run_artefacts/latest/target/run_results.json"
+    # The dbt source requires aws_connection when any artefact path is an
+    # s3:// URI. NOTE(review): region assumed to match the workflow's default
+    # (eu-west-1) - confirm against the bucket.
+    aws_connection:
+      aws_region: eu-west-1
+    platform_instance: cadet
+    target_platform: athena
+    target_platform_instance: athena_cadet
+    infer_dbt_schemas: true
+    entities_enabled:
+      test_results: true
+      seeds: false
+      snapshots: true
+      models: true
+      sources: true
+      test_definitions: true
+    # NOTE(review): stale-metadata removal presumably only takes effect once
+    # stateful ingestion is enabled and a pipeline_name is set - confirm.
+    stateful_ingestion:
+      remove_stale_metadata: true
+transformers:
+  # Put every dataset whose URN matches the pattern into the courts domain,
+  # replacing any domain already set (semantics: OVERWRITE).
+  - type: pattern_add_dataset_domain
+    config:
+      semantics: OVERWRITE
+      domain_pattern:
+        rules:
+          'urn:li:dataset:\(urn:li:dataPlatform:dbt,cadet\.awsdatacatalog\.courts.*':
+            [courts]