From 30685326d71e50835f21d9c15837f9489b10aa3f Mon Sep 17 00:00:00 2001 From: treff7es Date: Mon, 31 Jan 2022 23:23:34 +0100 Subject: [PATCH 1/2] Addressing pr review comments --- metadata-ingestion/source_docs/bigquery.md | 31 ++++---- metadata-ingestion/source_docs/redshift.md | 33 ++++---- metadata-ingestion/source_docs/snowflake.md | 40 +++++----- metadata-ingestion/source_docs/trino.md | 25 +++--- .../ingestion/source/usage/bigquery_usage.py | 6 +- .../ingestion/source/usage/redshift_usage.py | 6 +- .../ingestion/source/usage/snowflake_usage.py | 6 +- .../source/usage/starburst_trino_usage.py | 4 +- .../ingestion/source/usage/usage_common.py | 8 ++ .../tests/unit/test_usage_common.py | 79 +++++++++++++++---- 10 files changed, 160 insertions(+), 78 deletions(-) diff --git a/metadata-ingestion/source_docs/bigquery.md b/metadata-ingestion/source_docs/bigquery.md index 98fb78305647a..97abdad6268f0 100644 --- a/metadata-ingestion/source_docs/bigquery.md +++ b/metadata-ingestion/source_docs/bigquery.md @@ -215,20 +215,23 @@ Note that a `.` is used to denote nested fields in the YAML recipe. By default, we extract usage stats for the last day, with the recommendation that this source is executed every day. -| Field | Required | Default | Description | -|-----------------------------|----------|----------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `projects` | | | | -| `extra_client_options` | | | | -| `env` | | `"PROD"` | Environment to use in namespace when constructing URNs. 
| -| `start_time` | | Last full day in UTC (or hour, depending on `bucket_duration`) | Earliest date of usage logs to consider. | -| `end_time` | | Last full day in UTC (or hour, depending on `bucket_duration`) | Latest date of usage logs to consider. | -| `top_n_queries` | | `10` | Number of top queries to save to each table. | -| `include_operational_stats` | | `true` | Whether to display operational stats. | -| `extra_client_options` | | | Additional options to pass to `google.cloud.logging_v2.client.Client`. | -| `query_log_delay` | | | To account for the possibility that the query event arrives after the read event in the audit logs, we wait for at least `query_log_delay` additional events to be processed before attempting to resolve BigQuery job information from the logs. If `query_log_delay` is `None`, it gets treated as an unlimited delay, which prioritizes correctness at the expense of memory usage. | -| `max_query_duration` | | `15` | Correction to pad `start_time` and `end_time` with. For handling the case where the read happens within our time range but the query completion event is delayed and happens after the configured end time. | -| `table_pattern.allow` | | | List of regex patterns for tables to include in ingestion. | -| `table_pattern.deny` | | | List of regex patterns for tables to exclude in ingestion. 
| +| Field | Required | Default | Description | +|---------------------------------|----------|----------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `projects` | | | | +| `extra_client_options` | | | | +| `env` | | `"PROD"` | Environment to use in namespace when constructing URNs. | +| `start_time` | | Last full day in UTC (or hour, depending on `bucket_duration`) | Earliest date of usage logs to consider. | +| `end_time` | | Last full day in UTC (or hour, depending on `bucket_duration`) | Latest date of usage logs to consider. | +| `top_n_queries` | | `10` | Number of top queries to save to each table. | +| `include_operational_stats` | | `true` | Whether to display operational stats. | +| `extra_client_options` | | | Additional options to pass to `google.cloud.logging_v2.client.Client`. | +| `query_log_delay` | | | To account for the possibility that the query event arrives after the read event in the audit logs, we wait for at least `query_log_delay` additional events to be processed before attempting to resolve BigQuery job information from the logs. If `query_log_delay` is `None`, it gets treated as an unlimited delay, which prioritizes correctness at the expense of memory usage. | +| `max_query_duration` | | `15` | Correction to pad `start_time` and `end_time` with. For handling the case where the read happens within our time range but the query completion event is delayed and happens after the configured end time. | +| `table_pattern.allow` | | | List of regex patterns for tables to include in ingestion. 
| +| `table_pattern.deny` | | | List of regex patterns for tables to exclude in ingestion. | +| `user_email_pattern.allow` | | `.*` | List of regex patterns for user emails to include in usage. | +| `user_email_pattern.deny` | | | List of regex patterns for user emails to exclude from usage. | +| `user_email_pattern.ignoreCase` | | `True` | Whether to ignore case during pattern matching. | ## Compatibility diff --git a/metadata-ingestion/source_docs/redshift.md b/metadata-ingestion/source_docs/redshift.md index 8cf4631d04a8c..8446800b0c9b9 100644 --- a/metadata-ingestion/source_docs/redshift.md +++ b/metadata-ingestion/source_docs/redshift.md @@ -234,21 +234,24 @@ Note that a `.` is used to denote nested fields in the YAML recipe. By default, we extract usage stats for the last day, with the recommendation that this source is executed every day. -| Field | Required | Default | Description | -|-----------------------------|----------|----------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `username` | | | Redshift username. | -| `password` | | | Redshift password. | -| `host_port` | ✅ | | Redshift host URL. | -| `database` | | | Redshift database. | -| `env` | | `"PROD"` | Environment to use in namespace when constructing URNs. | -| `platform_instance` | | None | The Platform instance to use while constructing URNs. | -| `options.