diff --git a/.gitignore b/.gitignore index 277adc9a..921cb8b1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,10 +1,24 @@ +# Python virtualenv +.venv/ +venv/ +# Environment variables +.env + +# DBT artifacts target/ logs/ dbt_modules/ dbt_packages/ -.vscode integration_tests/state/ site/ env/ +profiles.yml +package-lock.yml + +# IDE +.vscode +.idea + +# MacOS .DS_Store diff --git a/README.md b/README.md index 2f247e5b..d59486aa 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,7 @@ Currently, the following adapters are supported: - Snowflake - DuckDB - Trino (tested with Iceberg connector) +- AWS Athena (tested manually) ## Using This Package diff --git a/dbt_project.yml b/dbt_project.yml index e706fa5d..6a086102 100644 --- a/dbt_project.yml +++ b/dbt_project.yml @@ -81,8 +81,8 @@ vars: other_prefixes: ['rpt_'] # -- Performance variables -- - chained_views_threshold: "{{ 5 if target.type != 'trino' else 4 }}" + chained_views_threshold: "{{ 5 if target.type not in ['athena', 'trino'] else 4 }}" # -- Execution variables -- - insert_batch_size: "{{ 500 if target.type == 'bigquery' else 10000 }}" - max_depth_dag: "{{ 9 if target.type in ['bigquery', 'spark', 'databricks'] else 4 if target.type == 'trino' else -1 }}" + insert_batch_size: "{{ 500 if target.type in ['athena', 'bigquery'] else 10000 }}" + max_depth_dag: "{{ 9 if target.type in ['bigquery', 'spark', 'databricks'] else 4 if target.type in ['athena', 'trino'] else -1 }}" diff --git a/integration_tests/README.md b/integration_tests/README.md index 769f2a0c..f529088b 100644 --- a/integration_tests/README.md +++ b/integration_tests/README.md @@ -1,4 +1,4 @@ -## Test dbt Project +# Test dbt Project The models within this folder (barring those in models/audit_schema_tests) represent a dbt project with poor DAG modeling. Error detection tools within this package are tested on this dbt project. 
@@ -6,3 +6,37 @@ The models within this folder (barring those in models/audit_schema_tests) repre ## Adding an Integration Test Create a seed which matches the intended output of your model and add equality tests comparing the output to your seed to the output of your model. + +## Local tests + +### AWS Athena + +To run the tests locally, follow these instructions: + +* Set up the environment variables: + +```bash +ATHENA_S3_STAGING_DIR= +ATHENA_S3_DATA_DIR= +ATHENA_REGION= +ATHENA_SCHEMA= +ATHENA_WORKGROUP= +``` + +* Add a `profiles.yml` file based on the [sample](ci/sample.profiles.yml): + +```yaml + athena: # for local tests only + type: athena + s3_staging_dir: "{{ env_var('ATHENA_S3_STAGING_DIR') }}" + s3_data_dir: "{{ env_var('ATHENA_S3_DATA_DIR') }}" + s3_data_naming: schema_table_unique + region_name: "{{ env_var('ATHENA_REGION') }}" + schema: "{{ env_var('ATHENA_SCHEMA') }}" + database: awsdatacatalog + work_group: "{{ env_var('ATHENA_WORKGROUP') }}" + num_retries: 2 + threads: 4 +``` + +* Now you can run the integration tests with the `--target athena` flag for dbt commands; see [run_test.sh](../run_test.sh) for details. 
diff --git a/integration_tests/seeds/docs/docs_seeds.yml b/integration_tests/seeds/docs/docs_seeds.yml index ef885352..c86ca95f 100644 --- a/integration_tests/seeds/docs/docs_seeds.yml +++ b/integration_tests/seeds/docs/docs_seeds.yml @@ -13,10 +13,10 @@ seeds: - name: test_fct_documentation_coverage config: column_types: - staging_documentation_coverage_pct: "{{ 'float' if target.type not in ['spark','databricks','duckdb','trino'] else 'decimal(10,2)' }}" - intermediate_documentation_coverage_pct: "{{ 'float' if target.type not in ['spark','databricks','duckdb','trino'] else 'decimal(10,2)' }}" - marts_documentation_coverage_pct: "{{ 'float' if target.type not in ['spark','databricks','duckdb','trino'] else 'decimal(10,2)' }}" - other_documentation_coverage_pct: "{{ 'float' if target.type not in ['spark','databricks','duckdb','trino'] else 'decimal(10,2)' }}" + staging_documentation_coverage_pct: &float "{{ 'float' if target.type not in ['athena', 'databricks', 'duckdb','trino', 'spark'] else 'decimal(10,2)' }}" + intermediate_documentation_coverage_pct: *float + marts_documentation_coverage_pct: *float + other_documentation_coverage_pct: *float tags: - docs tests: @@ -48,4 +48,4 @@ seeds: tests: - dbt_utils.equality: name: equality_fct_undocumented_sources - compare_model: ref('fct_undocumented_sources') \ No newline at end of file + compare_model: ref('fct_undocumented_sources') diff --git a/integration_tests/seeds/tests/tests_seeds.yml b/integration_tests/seeds/tests/tests_seeds.yml index 1f86ab85..25c4b238 100644 --- a/integration_tests/seeds/tests/tests_seeds.yml +++ b/integration_tests/seeds/tests/tests_seeds.yml @@ -10,11 +10,11 @@ seeds: - name: test_fct_test_coverage config: column_types: - test_coverage_pct: "{{ 'float' if target.type != 'trino' else 'double' }}" - staging_test_coverage_pct: "{{ 'float' if target.type != 'trino' else 'double' }}" - intermediate_test_coverage_pct: "{{ 'float' if target.type != 'trino' else 'double' }}" - 
marts_test_coverage_pct: "{{ 'float' if target.type != 'trino' else 'double' }}" - other_test_coverage_pct: "{{ 'float' if target.type != 'trino' else 'double' }}" + test_coverage_pct: &float "{{ 'float' if target.type not in ['athena', 'trino'] else 'double' }}" + staging_test_coverage_pct: *float + intermediate_test_coverage_pct: *float + marts_test_coverage_pct: *float + other_test_coverage_pct: *float tests: - dbt_utils.equality: name: equality_fct_test_coverage @@ -28,4 +28,4 @@ seeds: - staging_test_coverage_pct - intermediate_test_coverage_pct - marts_test_coverage_pct - - other_test_coverage_pct \ No newline at end of file + - other_test_coverage_pct diff --git a/macros/recursive_dag.sql b/macros/recursive_dag.sql index f0911da9..38d1614d 100644 --- a/macros/recursive_dag.sql +++ b/macros/recursive_dag.sql @@ -5,7 +5,7 @@ {% macro default__recursive_dag() %} with recursive direct_relationships as ( - select + select * from {{ ref('int_direct_relationships') }} where resource_type <> 'test' @@ -44,7 +44,7 @@ all_relationships ( path, is_dependent_on_chain_of_views ) as ( - -- anchor + -- anchor select distinct resource_id as parent_id, resource_name as parent, @@ -76,11 +76,11 @@ all_relationships ( from direct_relationships -- where direct_parent_id is null {# optional lever to change filtering of anchor clause to only include root resources #} - + union all -- recursive clause - select + select all_relationships.parent_id as parent_id, all_relationships.parent as parent, all_relationships.parent_resource_type as parent_resource_type, @@ -105,12 +105,12 @@ all_relationships ( direct_relationships.directory_path as child_directory_path, direct_relationships.file_name as child_file_name, direct_relationships.is_excluded as child_is_excluded, - all_relationships.distance+1 as distance, + all_relationships.distance+1 as distance, {{ dbt.array_append('all_relationships.path', 'direct_relationships.resource_name') }} as path, - case - when - 
all_relationships.child_materialized in ('view', 'ephemeral') - and coalesce(all_relationships.is_dependent_on_chain_of_views, true) + case + when + all_relationships.child_materialized in ('view', 'ephemeral') + and coalesce(all_relationships.is_dependent_on_chain_of_views, true) then true else false end as is_dependent_on_chain_of_views @@ -145,7 +145,7 @@ all_relationships ( {% endif %} with direct_relationships as ( - select + select * from {{ ref('int_direct_relationships') }} where resource_type <> 'test' @@ -161,12 +161,12 @@ with direct_relationships as ( is_public as child_is_public, access as child_access, is_excluded as child_is_excluded - + from direct_relationships ) , cte_0 as ( - select + select parent_id, child_id, child_materialized, @@ -182,19 +182,19 @@ with direct_relationships as ( {% for i in range(1,max_depth) %} {% set prev_cte_path %}cte_{{ i - 1 }}.path{% endset %} , cte_{{i}} as ( - select + select cte_{{i - 1}}.parent_id as parent_id, direct_relationships.resource_id as child_id, direct_relationships.materialized as child_materialized, direct_relationships.is_public as child_is_public, direct_relationships.access as child_access, direct_relationships.is_excluded as child_is_excluded, - cte_{{i - 1}}.distance+1 as distance, + cte_{{i - 1}}.distance+1 as distance, {{ dbt.array_append(prev_cte_path, 'direct_relationships.resource_name') }} as path, - case - when - cte_{{i - 1}}.child_materialized in ('view', 'ephemeral') - and coalesce(cte_{{i - 1}}.is_dependent_on_chain_of_views, true) + case + when + cte_{{i - 1}}.child_materialized in ('view', 'ephemeral') + and coalesce(cte_{{i - 1}}.is_dependent_on_chain_of_views, true) then true else false end as is_dependent_on_chain_of_views @@ -265,6 +265,10 @@ with direct_relationships as ( {% macro trino__recursive_dag() %} {#-- Although Trino supports a recursive WITH-queries, --- it is less performant than creating CTEs with loops and unioning them --#} +-- it is less performant than creating 
CTEs with loops and unioning them --#} + {{ return(bigquery__recursive_dag()) }} +{% endmacro %} + +{% macro athena__recursive_dag() %} {{ return(bigquery__recursive_dag()) }} {% endmacro %} diff --git a/models/marts/documentation/fct_documentation_coverage.sql b/models/marts/documentation/fct_documentation_coverage.sql index d5bfbfd3..26a4fd2f 100644 --- a/models/marts/documentation/fct_documentation_coverage.sql +++ b/models/marts/documentation/fct_documentation_coverage.sql @@ -20,7 +20,7 @@ conversion as ( final as ( select - {{ 'current_timestamp' if target.type != 'trino' else 'current_timestamp(6)' }} as measured_at, + {{ dbt.current_timestamp() if target.type != 'trino' else 'current_timestamp(6)' }} as measured_at, count(*) as total_models, sum(is_described_model) as documented_models, round(sum(is_described_model) * 100.00 / count(*), 2) as documentation_coverage_pct, diff --git a/models/marts/tests/fct_test_coverage.sql b/models/marts/tests/fct_test_coverage.sql index 70a2e2f0..be54ad58 100644 --- a/models/marts/tests/fct_test_coverage.sql +++ b/models/marts/tests/fct_test_coverage.sql @@ -19,7 +19,7 @@ conversion as ( final as ( select - {{ 'current_timestamp' if target.type != 'trino' else 'current_timestamp(6)' }} as measured_at, + {{ dbt.current_timestamp() if target.type != 'trino' else 'current_timestamp(6)' }} as measured_at, count(*) as total_models, sum(number_of_tests_on_model) as total_tests, sum(is_tested_model) as tested_models, @@ -39,4 +39,4 @@ final as ( on test_counts.resource_name = conversion.resource_name ) -select * from final \ No newline at end of file +select * from final