From 62371ed5f77f5a3f5467a2c6101c5dc10fad78a8 Mon Sep 17 00:00:00 2001 From: Carlos Timoteo Date: Thu, 5 Oct 2023 17:29:31 -0400 Subject: [PATCH 1/3] fix on the table schema location, tables lifecycle and pipeline bucket lifecycle (#49) Co-authored-by: Carlos Timoteo --- config/config.yaml.tftpl | 4 +- .../modules/feature-store/bigquery-tables.tf | 78 +- .../terraform/modules/pipelines/pipelines.tf | 7 +- python/pipelines/pipeline_ops.py | 4 +- .../transformations-customer-ltv.json | 504 +++++++++++++ .../transformations-purchase-propensity.json | 702 ++++++++++++++++++ ...r_lifetime_value_training_preparation.sqlx | 221 ++++++ ...chase_propensity_training_preparation.sqlx | 307 ++++++++ ...oke_backfill_user_lifetime_dimensions.sqlx | 5 - 9 files changed, 1806 insertions(+), 26 deletions(-) create mode 100644 python/pipelines/transformations-customer-ltv.json create mode 100644 python/pipelines/transformations-purchase-propensity.json diff --git a/config/config.yaml.tftpl b/config/config.yaml.tftpl index 68130cc5..aa66ed68 100644 --- a/config/config.yaml.tftpl +++ b/config/config.yaml.tftpl @@ -186,7 +186,7 @@ vertex_ai: # data_source_bigquery_table_path: "bq://${project_id}.purchase_propensity.v_purchase_propensity_training_15_7" # data_source_bigquery_table_path: "bq://${project_id}.purchase_propensity.v_purchase_propensity_training_15_15" data_source_bigquery_table_path: "bq://${project_id}.purchase_propensity.v_purchase_propensity_training_30_15" - data_source_bigquery_table_schema: "sql/schema/table/purchase_propensity_training_preparation.json" + data_source_bigquery_table_schema: "../sql/schema/table/purchase_propensity_training_preparation.json" dataflow_service_account: "df-worker@${project_id}.iam.gserviceaccount.com" timestamp_split_key: null stratified_split_key: null @@ -330,7 +330,7 @@ vertex_ai: data_source_csv_filenames: null optimization_objective: minimize-mae # minimize-mae | minimize-rmse | minimize-rmsle data_source_bigquery_table_path: 
"bq://${project_id}.customer_lifetime_value.v_customer_lifetime_value_training_180_30" - data_source_bigquery_table_schema: "sql/schema/table/customer_lifetime_value_training_preparation.json" + data_source_bigquery_table_schema: "../sql/schema/table/customer_lifetime_value_training_preparation.json" dataflow_service_account: "df-worker@${project_id}.iam.gserviceaccount.com" timestamp_split_key: null stratified_split_key: null diff --git a/infrastructure/terraform/modules/feature-store/bigquery-tables.tf b/infrastructure/terraform/modules/feature-store/bigquery-tables.tf index 20114820..259c542e 100644 --- a/infrastructure/terraform/modules/feature-store/bigquery-tables.tf +++ b/infrastructure/terraform/modules/feature-store/bigquery-tables.tf @@ -17,7 +17,7 @@ resource "google_bigquery_table" "audience_segmentation_inference_preparation" { dataset_id = google_bigquery_dataset.audience_segmentation.dataset_id table_id = local.config_bigquery.table.audience_segmentation_inference_preparation.table_name description = local.config_bigquery.table.audience_segmentation_inference_preparation.table_description - deletion_protection = true + deletion_protection = false labels = { version = "pilot" } @@ -30,7 +30,7 @@ resource "google_bigquery_table" "customer_lifetime_value_inference_preparation" dataset_id = google_bigquery_dataset.customer_lifetime_value.dataset_id table_id = local.config_bigquery.table.customer_lifetime_value_inference_preparation.table_name description = local.config_bigquery.table.customer_lifetime_value_inference_preparation.table_description - deletion_protection = true + deletion_protection = false labels = { version = "pilot" } @@ -42,11 +42,15 @@ resource "google_bigquery_table" "customer_lifetime_value_label" { dataset_id = google_bigquery_dataset.feature_store.dataset_id table_id = local.config_bigquery.table.customer_lifetime_value_label.table_name description = local.config_bigquery.table.customer_lifetime_value_label.table_description - 
deletion_protection = true + deletion_protection = false labels = { version = "pilot" } schema = file("${local.sql_dir}/schema/table/customer_lifetime_value_label.json") + lifecycle { + ignore_changes = all + prevent_destroy = true + } } resource "google_bigquery_table" "purchase_propensity_inference_preparation" { @@ -54,7 +58,7 @@ resource "google_bigquery_table" "purchase_propensity_inference_preparation" { dataset_id = google_bigquery_dataset.purchase_propensity.dataset_id table_id = local.config_bigquery.table.purchase_propensity_inference_preparation.table_name description = local.config_bigquery.table.purchase_propensity_inference_preparation.table_description - deletion_protection = true + deletion_protection = false labels = { version = "pilot" } @@ -66,11 +70,15 @@ resource "google_bigquery_table" "purchase_propensity_label" { dataset_id = google_bigquery_dataset.feature_store.dataset_id table_id = local.config_bigquery.table.purchase_propensity_label.table_name description = local.config_bigquery.table.purchase_propensity_label.table_description - deletion_protection = true + deletion_protection = false labels = { version = "pilot" } schema = file("${local.sql_dir}/schema/table/purchase_propensity_label.json") + lifecycle { + ignore_changes = all + prevent_destroy = true + } } resource "google_bigquery_table" "user_dimensions" { @@ -78,11 +86,15 @@ resource "google_bigquery_table" "user_dimensions" { dataset_id = google_bigquery_dataset.feature_store.dataset_id table_id = local.config_bigquery.table.user_dimensions.table_name description = local.config_bigquery.table.user_dimensions.table_description - deletion_protection = true + deletion_protection = false labels = { version = "pilot" } schema = file("${local.sql_dir}/schema/table/user_dimensions.json") + lifecycle { + ignore_changes = all + prevent_destroy = true + } } resource "google_bigquery_table" "user_lifetime_dimensions" { @@ -90,11 +102,15 @@ resource "google_bigquery_table" 
"user_lifetime_dimensions" { dataset_id = google_bigquery_dataset.feature_store.dataset_id table_id = local.config_bigquery.table.user_lifetime_dimensions.table_name description = local.config_bigquery.table.user_lifetime_dimensions.table_description - deletion_protection = true + deletion_protection = false labels = { version = "pilot" } schema = file("${local.sql_dir}/schema/table/user_lifetime_dimensions.json") + lifecycle { + ignore_changes = all + prevent_destroy = true + } } @@ -103,11 +119,15 @@ resource "google_bigquery_table" "user_lookback_metrics" { dataset_id = google_bigquery_dataset.feature_store.dataset_id table_id = local.config_bigquery.table.user_lookback_metrics.table_name description = local.config_bigquery.table.user_lookback_metrics.table_description - deletion_protection = true + deletion_protection = false labels = { version = "pilot" } schema = file("${local.sql_dir}/schema/table/user_lookback_metrics.json") + lifecycle { + ignore_changes = all + prevent_destroy = true + } } @@ -116,11 +136,15 @@ resource "google_bigquery_table" "user_rolling_window_lifetime_metrics" { dataset_id = google_bigquery_dataset.feature_store.dataset_id table_id = local.config_bigquery.table.user_rolling_window_lifetime_metrics.table_name description = local.config_bigquery.table.user_rolling_window_lifetime_metrics.table_description - deletion_protection = true + deletion_protection = false labels = { version = "pilot" } schema = file("${local.sql_dir}/schema/table/user_rolling_window_lifetime_metrics.json") + lifecycle { + ignore_changes = all + prevent_destroy = true + } } resource "google_bigquery_table" "user_rolling_window_metrics" { @@ -128,11 +152,15 @@ resource "google_bigquery_table" "user_rolling_window_metrics" { dataset_id = google_bigquery_dataset.feature_store.dataset_id table_id = local.config_bigquery.table.user_rolling_window_metrics.table_name description = local.config_bigquery.table.user_rolling_window_metrics.table_description - 
deletion_protection = true + deletion_protection = false labels = { version = "pilot" } schema = file("${local.sql_dir}/schema/table/user_rolling_window_metrics.json") + lifecycle { + ignore_changes = all + prevent_destroy = true + } } resource "google_bigquery_table" "user_scoped_lifetime_metrics" { @@ -140,11 +168,15 @@ resource "google_bigquery_table" "user_scoped_lifetime_metrics" { dataset_id = google_bigquery_dataset.feature_store.dataset_id table_id = local.config_bigquery.table.user_scoped_lifetime_metrics.table_name description = local.config_bigquery.table.user_scoped_lifetime_metrics.table_description - deletion_protection = true + deletion_protection = false labels = { version = "pilot" } schema = file("${local.sql_dir}/schema/table/user_scoped_lifetime_metrics.json") + lifecycle { + ignore_changes = all + prevent_destroy = true + } } resource "google_bigquery_table" "user_scoped_metrics" { @@ -152,11 +184,15 @@ resource "google_bigquery_table" "user_scoped_metrics" { dataset_id = google_bigquery_dataset.feature_store.dataset_id table_id = local.config_bigquery.table.user_scoped_metrics.table_name description = local.config_bigquery.table.user_scoped_metrics.table_description - deletion_protection = true + deletion_protection = false labels = { version = "pilot" } schema = file("${local.sql_dir}/schema/table/user_scoped_metrics.json") + lifecycle { + ignore_changes = all + prevent_destroy = true + } } resource "google_bigquery_table" "user_scoped_segmentation_metrics" { @@ -164,11 +200,15 @@ resource "google_bigquery_table" "user_scoped_segmentation_metrics" { dataset_id = google_bigquery_dataset.feature_store.dataset_id table_id = local.config_bigquery.table.user_scoped_segmentation_metrics.table_name description = local.config_bigquery.table.user_scoped_segmentation_metrics.table_description - deletion_protection = true + deletion_protection = false labels = { version = "pilot" } schema = 
file("${local.sql_dir}/schema/table/user_scoped_segmentation_metrics.json") + lifecycle { + ignore_changes = all + prevent_destroy = true + } } resource "google_bigquery_table" "user_segmentation_dimensions" { @@ -176,11 +216,15 @@ resource "google_bigquery_table" "user_segmentation_dimensions" { dataset_id = google_bigquery_dataset.feature_store.dataset_id table_id = local.config_bigquery.table.user_segmentation_dimensions.table_name description = local.config_bigquery.table.user_segmentation_dimensions.table_description - deletion_protection = true + deletion_protection = false labels = { version = "pilot" } schema = file("${local.sql_dir}/schema/table/user_segmentation_dimensions.json") + lifecycle { + ignore_changes = all + prevent_destroy = true + } } resource "google_bigquery_table" "user_session_event_aggregated_metrics" { @@ -188,11 +232,15 @@ resource "google_bigquery_table" "user_session_event_aggregated_metrics" { dataset_id = google_bigquery_dataset.feature_store.dataset_id table_id = local.config_bigquery.table.user_session_event_aggregated_metrics.table_name description = local.config_bigquery.table.user_session_event_aggregated_metrics.table_description - deletion_protection = true + deletion_protection = false labels = { version = "pilot" } schema = file("${local.sql_dir}/schema/table/user_session_event_aggregated_metrics.json") + lifecycle { + ignore_changes = all + prevent_destroy = true + } } diff --git a/infrastructure/terraform/modules/pipelines/pipelines.tf b/infrastructure/terraform/modules/pipelines/pipelines.tf index 2ae7ba8b..921a31cd 100644 --- a/infrastructure/terraform/modules/pipelines/pipelines.tf +++ b/infrastructure/terraform/modules/pipelines/pipelines.tf @@ -74,8 +74,11 @@ resource "google_storage_bucket" "pipelines_bucket" { storage_class = "REGIONAL" location = local.pipeline_vars.region uniform_bucket_level_access = true - force_destroy = true - + force_destroy = false + lifecycle { + ignore_changes = all + prevent_destroy = 
false ##true + } } diff --git a/python/pipelines/pipeline_ops.py b/python/pipelines/pipeline_ops.py index 71945d07..cfb0bbe7 100644 --- a/python/pipelines/pipeline_ops.py +++ b/python/pipelines/pipeline_ops.py @@ -159,9 +159,9 @@ def _extract_schema_from_bigquery( except exceptions.NotFound as e: logging.warn(f'Pipeline compiled without columns transformation. \ Make sure the `data_source_bigquery_table_path` table or view exists! \ - Loading default values from schema file {schema_name}.') + Loading default values from schema file {table_schema}.') import json - with open(schema_name) as f: + with open(table_schema) as f: d = json.load(f) schema = [feature['name'] for feature in d] return schema diff --git a/python/pipelines/transformations-customer-ltv.json b/python/pipelines/transformations-customer-ltv.json new file mode 100644 index 00000000..a7bd1372 --- /dev/null +++ b/python/pipelines/transformations-customer-ltv.json @@ -0,0 +1,504 @@ +[ + { + "categorical": { + "columnName": "month_of_the_year" + } + }, + { + "categorical": { + "columnName": "week_of_the_year" + } + }, + { + "categorical": { + "columnName": "day_of_the_month" + } + }, + { + "categorical": { + "columnName": "day_of_week" + } + }, + { + "categorical": { + "columnName": "device_category" + } + }, + { + "categorical": { + "columnName": "device_mobile_brand_name" + } + }, + { + "categorical": { + "columnName": "device_mobile_model_name" + } + }, + { + "categorical": { + "columnName": "device_os" + } + }, + { + "categorical": { + "columnName": "device_os_version" + } + }, + { + "categorical": { + "columnName": "device_language" + } + }, + { + "categorical": { + "columnName": "device_web_browser" + } + }, + { + "categorical": { + "columnName": "device_web_browser_version" + } + }, + { + "categorical": { + "columnName": "geo_sub_continent" + } + }, + { + "categorical": { + "columnName": "geo_country" + } + }, + { + "categorical": { + "columnName": "geo_region" + } + }, + { + "categorical": { + 
"columnName": "geo_city" + } + }, + { + "categorical": { + "columnName": "geo_metro" + } + }, + { + "categorical": { + "columnName": "last_traffic_source_medium" + } + }, + { + "categorical": { + "columnName": "last_traffic_source_name" + } + }, + { + "categorical": { + "columnName": "last_traffic_source_source" + } + }, + { + "categorical": { + "columnName": "first_traffic_source_medium" + } + }, + { + "categorical": { + "columnName": "first_traffic_source_name" + } + }, + { + "categorical": { + "columnName": "first_traffic_source_source" + } + }, + { + "categorical": { + "columnName": "has_signed_in_with_user_id" + } + }, + { + "numeric": { + "columnName": "active_users_past_1_30_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "active_users_past_30_60_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "active_users_past_60_90_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "active_users_past_90_120_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "active_users_past_120_150_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "active_users_past_150_180_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "purchases_past_1_30_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "purchases_past_30_60_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "purchases_past_60_90_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "purchases_past_90_120_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "purchases_past_120_150_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "purchases_past_150_180_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "visits_past_1_30_day", + "invalidValuesAllowed": false + 
} + }, + { + "numeric": { + "columnName": "visits_past_30_60_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "visits_past_60_90_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "visits_past_90_120_day", + "invalidValuesAllowed": false + } + }, + { + "auto": { + "columnName": "visits_past_120_150_day" + } + }, + { + "auto": { + "columnName": "visits_past_150_180_day" + } + }, + { + "numeric": { + "columnName": "view_items_past_1_30_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "view_items_past_30_60_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "view_items_past_60_90_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "view_items_past_90_120_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "view_items_past_120_150_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "view_items_past_150_180_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "add_to_carts_past_1_30_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "add_to_carts_past_30_60_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "add_to_carts_past_60_90_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "add_to_carts_past_90_120_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "add_to_carts_past_120_150_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "add_to_carts_past_150_180_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "checkouts_past_1_30_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "checkouts_past_30_60_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "checkouts_past_60_90_day", + 
"invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "checkouts_past_90_120_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "checkouts_past_120_150_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "checkouts_past_150_180_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "ltv_revenue_past_1_30_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "ltv_revenue_past_30_90_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "ltv_revenue_past_90_180_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "lifetime_purchasers_users", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "lifetime_average_daily_purchasers", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "lifetime_active_users", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "lifetime_DAU", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "lifetime_MAU", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "lifetime_WAU", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "lifetime_dau_per_mau", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "lifetime_dau_per_wau", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "lifetime_wau_per_mau", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "lifetime_users_engagement_duration_seconds", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "lifetime_average_engagement_time", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "lifetime_average_engagement_time_per_session", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "lifetime_average_sessions_per_user", 
+ "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "lifetime_ARPPU", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "lifetime_ARPU", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "lifetime_average_daily_revenue", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "lifetime_max_daily_revenue", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "lifetime_min_daily_revenue", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "lifetime_new_users", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "lifetime_returning_users", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "lifetime_first_time_purchasers", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "lifetime_first_time_purchaser_conversion", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "lifetime_first_time_purchasers_per_new_user", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "lifetime_avg_user_conversion_rate", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "lifetime_avg_session_conversion_rate", + "invalidValuesAllowed": false + } + } +] \ No newline at end of file diff --git a/python/pipelines/transformations-purchase-propensity.json b/python/pipelines/transformations-purchase-propensity.json new file mode 100644 index 00000000..1c6c56e6 --- /dev/null +++ b/python/pipelines/transformations-purchase-propensity.json @@ -0,0 +1,702 @@ +[ + { + "categorical": { + "columnName": "month_of_the_year" + } + }, + { + "numeric": { + "columnName": "week_of_the_year" + } + }, + { + "categorical": { + "columnName": "day_of_the_month" + } + }, + { + "categorical": { + "columnName": "day_of_week" + } + }, + { + "numeric": { + "columnName": "user_ltv_revenue" + } + }, + { + "categorical": { + 
"columnName": "device_category" + } + }, + { + "categorical": { + "columnName": "device_mobile_brand_name" + } + }, + { + "categorical": { + "columnName": "device_mobile_model_name" + } + }, + { + "categorical": { + "columnName": "device_os" + } + }, + { + "categorical": { + "columnName": "device_os_version" + } + }, + { + "categorical": { + "columnName": "device_language" + } + }, + { + "categorical": { + "columnName": "device_web_browser" + } + }, + { + "categorical": { + "columnName": "device_web_browser_version" + } + }, + { + "categorical": { + "columnName": "geo_sub_continent" + } + }, + { + "categorical": { + "columnName": "geo_country" + } + }, + { + "categorical": { + "columnName": "geo_region" + } + }, + { + "categorical": { + "columnName": "geo_city" + } + }, + { + "categorical": { + "columnName": "geo_metro" + } + }, + { + "numeric": { + "columnName": "last_traffic_source_medium", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "last_traffic_source_name", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "last_traffic_source_source", + "invalidValuesAllowed": false + } + }, + { + "categorical": { + "columnName": "first_traffic_source_medium" + } + }, + { + "categorical": { + "columnName": "first_traffic_source_name" + } + }, + { + "categorical": { + "columnName": "first_traffic_source_source" + } + }, + { + "categorical": { + "columnName": "has_signed_in_with_user_id" + } + }, + { + "numeric": { + "columnName": "engagement_rate" + } + }, + { + "numeric": { + "columnName": "engaged_sessions_per_user" + } + }, + { + "numeric": { + "columnName": "session_conversion_rate" + } + }, + { + "numeric": { + "columnName": "bounces" + } + }, + { + "numeric": { + "columnName": "bounce_rate_per_user" + } + }, + { + "numeric": { + "columnName": "sessions_per_user" + } + }, + { + "numeric": { + "columnName": "avg_views_per_session" + } + }, + { + "numeric": { + "columnName": "sum_engagement_time_seconds" + } + }, + 
{ + "numeric": { + "columnName": "avg_engagement_time_seconds" + } + }, + { + "numeric": { + "columnName": "new_visits", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "returning_visits" + } + }, + { + "numeric": { + "columnName": "add_to_carts" + } + }, + { + "numeric": { + "columnName": "cart_to_view_rate" + } + }, + { + "numeric": { + "columnName": "checkouts" + } + }, + { + "numeric": { + "columnName": "ecommerce_purchases" + } + }, + { + "numeric": { + "columnName": "ecommerce_quantity" + } + }, + { + "numeric": { + "columnName": "ecommerce_revenue" + } + }, + { + "numeric": { + "columnName": "item_revenue" + } + }, + { + "numeric": { + "columnName": "item_quantity" + } + }, + { + "numeric": { + "columnName": "item_view_events" + } + }, + { + "numeric": { + "columnName": "items_clicked_in_promotion" + } + }, + { + "numeric": { + "columnName": "items_clicked_in_list", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "items_checked_out", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "items_added_to_cart" + } + }, + { + "numeric": { + "columnName": "item_list_view_events" + } + }, + { + "numeric": { + "columnName": "purchase_revenue" + } + }, + { + "numeric": { + "columnName": "purchase_to_view_rate" + } + }, + { + "numeric": { + "columnName": "transactions_per_purchaser" + } + }, + { + "numeric": { + "columnName": "user_conversion_rate" + } + }, + { + "numeric": { + "columnName": "how_many_purchased_before" + } + }, + { + "categorical": { + "columnName": "has_abandoned_cart" + } + }, + { + "numeric": { + "columnName": "active_users_past_1_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "active_users_past_2_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "active_users_past_3_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "active_users_past_4_day", + "invalidValuesAllowed": 
false + } + }, + { + "numeric": { + "columnName": "active_users_past_5_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "active_users_past_6_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "active_users_past_7_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "active_users_past_8_14_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "active_users_past_15_30_day" + } + }, + { + "numeric": { + "columnName": "purchases_past_1_day" + } + }, + { + "numeric": { + "columnName": "purchases_past_2_day" + } + }, + { + "numeric": { + "columnName": "purchases_past_3_day" + } + }, + { + "numeric": { + "columnName": "purchases_past_4_day" + } + }, + { + "numeric": { + "columnName": "purchases_past_5_day" + } + }, + { + "numeric": { + "columnName": "purchases_past_6_day" + } + }, + { + "numeric": { + "columnName": "purchases_past_7_day" + } + }, + { + "numeric": { + "columnName": "purchases_past_8_14_day" + } + }, + { + "numeric": { + "columnName": "purchases_past_15_30_day" + } + }, + { + "numeric": { + "columnName": "visits_past_1_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "visits_past_2_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "visits_past_3_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "visits_past_4_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "visits_past_5_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "visits_past_6_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "visits_past_7_day", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "visits_past_8_14_day" + } + }, + { + "numeric": { + "columnName": "visits_past_15_30_day" + } + }, + { + "numeric": { + "columnName": "view_items_past_1_day" + } + }, + { 
+ "numeric": { + "columnName": "view_items_past_2_day" + } + }, + { + "numeric": { + "columnName": "view_items_past_3_day" + } + }, + { + "numeric": { + "columnName": "view_items_past_4_day" + } + }, + { + "numeric": { + "columnName": "view_items_past_5_day" + } + }, + { + "numeric": { + "columnName": "view_items_past_6_day" + } + }, + { + "numeric": { + "columnName": "view_items_past_7_day" + } + }, + { + "numeric": { + "columnName": "view_items_past_8_14_day" + } + }, + { + "numeric": { + "columnName": "view_items_past_15_30_day" + } + }, + { + "numeric": { + "columnName": "add_to_carts_past_1_day" + } + }, + { + "numeric": { + "columnName": "add_to_carts_past_2_day" + } + }, + { + "numeric": { + "columnName": "add_to_carts_past_3_day" + } + }, + { + "numeric": { + "columnName": "add_to_carts_past_4_day" + } + }, + { + "numeric": { + "columnName": "add_to_carts_past_5_day" + } + }, + { + "numeric": { + "columnName": "add_to_carts_past_6_day" + } + }, + { + "numeric": { + "columnName": "add_to_carts_past_7_day" + } + }, + { + "numeric": { + "columnName": "add_to_carts_past_8_14_day" + } + }, + { + "numeric": { + "columnName": "add_to_carts_past_15_30_day" + } + }, + { + "numeric": { + "columnName": "checkouts_past_1_day" + } + }, + { + "numeric": { + "columnName": "checkouts_past_2_day" + } + }, + { + "numeric": { + "columnName": "checkouts_past_3_day" + } + }, + { + "numeric": { + "columnName": "checkouts_past_4_day" + } + }, + { + "numeric": { + "columnName": "checkouts_past_5_day" + } + }, + { + "numeric": { + "columnName": "checkouts_past_6_day" + } + }, + { + "numeric": { + "columnName": "checkouts_past_7_day" + } + }, + { + "numeric": { + "columnName": "checkouts_past_8_14_day" + } + }, + { + "numeric": { + "columnName": "checkouts_past_15_30_day" + } + }, + { + "numeric": { + "columnName": "purchasers_users" + } + }, + { + "numeric": { + "columnName": "average_daily_purchasers" + } + }, + { + "numeric": { + "columnName": "active_users" + } + }, + { + 
"numeric": { + "columnName": "DAU" + } + }, + { + "numeric": { + "columnName": "MAU" + } + }, + { + "numeric": { + "columnName": "WAU" + } + }, + { + "numeric": { + "columnName": "dau_per_mau", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "dau_per_wau", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "wau_per_mau", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "users_engagement_duration_seconds" + } + }, + { + "numeric": { + "columnName": "average_engagement_time" + } + }, + { + "numeric": { + "columnName": "average_engagement_time_per_session" + } + }, + { + "numeric": { + "columnName": "average_sessions_per_user" + } + }, + { + "numeric": { + "columnName": "ARPPU" + } + }, + { + "numeric": { + "columnName": "ARPU" + } + }, + { + "numeric": { + "columnName": "average_daily_revenue" + } + }, + { + "numeric": { + "columnName": "max_daily_revenue" + } + }, + { + "numeric": { + "columnName": "min_daily_revenue" + } + }, + { + "numeric": { + "columnName": "new_users" + } + }, + { + "numeric": { + "columnName": "returning_users" + } + }, + { + "numeric": { + "columnName": "first_time_purchasers", + "invalidValuesAllowed": false + } + }, + { + "numeric": { + "columnName": "first_time_purchaser_conversion" + } + }, + { + "numeric": { + "columnName": "first_time_purchasers_per_new_user" + } + }, + { + "numeric": { + "columnName": "avg_user_conversion_rate" + } + }, + { + "numeric": { + "columnName": "avg_session_conversion_rate" + } + } +] \ No newline at end of file diff --git a/sql/procedure/customer_lifetime_value_training_preparation.sqlx b/sql/procedure/customer_lifetime_value_training_preparation.sqlx index c4cbf3d8..f7c03359 100644 --- a/sql/procedure/customer_lifetime_value_training_preparation.sqlx +++ b/sql/procedure/customer_lifetime_value_training_preparation.sqlx @@ -1145,5 +1145,226 @@ SELECT DISTINCT FROM 
`{{project_id}}.{{dataset}}.customer_lifetime_value_training_180_180`; + + +CREATE OR REPLACE VIEW `{{project_id}}.{{dataset}}.v_customer_lifetime_value_training_180_30_balanced` +(processed_timestamp, + data_split, + feature_date, + user_pseudo_id, + user_id, + month_of_the_year, + week_of_the_year, + day_of_the_month, + day_of_week, + device_category, + device_mobile_brand_name, + device_mobile_model_name, + device_os, + device_os_version, + device_language, + device_web_browser, + device_web_browser_version, + geo_sub_continent, + geo_country, + geo_region, + geo_city, + geo_metro, + last_traffic_source_medium, + last_traffic_source_name, + last_traffic_source_source, + first_traffic_source_medium, + first_traffic_source_name, + first_traffic_source_source, + has_signed_in_with_user_id, + active_users_past_1_30_day, + active_users_past_30_60_day, + active_users_past_60_90_day, + active_users_past_90_120_day, + active_users_past_120_150_day, + active_users_past_150_180_day, + purchases_past_1_30_day, + purchases_past_30_60_day, + purchases_past_60_90_day, + purchases_past_90_120_day, + purchases_past_120_150_day, + purchases_past_150_180_day, + visits_past_1_30_day, + visits_past_30_60_day, + visits_past_60_90_day, + visits_past_90_120_day, + visits_past_120_150_day, + visits_past_150_180_day, + view_items_past_1_30_day, + view_items_past_30_60_day, + view_items_past_60_90_day, + view_items_past_90_120_day, + view_items_past_120_150_day, + view_items_past_150_180_day, + add_to_carts_past_1_30_day, + add_to_carts_past_30_60_day, + add_to_carts_past_60_90_day, + add_to_carts_past_90_120_day, + add_to_carts_past_120_150_day, + add_to_carts_past_150_180_day, + checkouts_past_1_30_day, + checkouts_past_30_60_day, + checkouts_past_60_90_day, + checkouts_past_90_120_day, + checkouts_past_120_150_day, + checkouts_past_150_180_day, + ltv_revenue_past_1_30_day, + ltv_revenue_past_30_90_day, + ltv_revenue_past_90_180_day, + lifetime_purchasers_users, + 
lifetime_average_daily_purchasers, + lifetime_active_users, + lifetime_DAU, + lifetime_MAU, + lifetime_WAU, + lifetime_dau_per_mau, + lifetime_dau_per_wau, + lifetime_wau_per_mau, + lifetime_users_engagement_duration_seconds, + lifetime_average_engagement_time, + lifetime_average_engagement_time_per_session, + lifetime_average_sessions_per_user, + lifetime_ARPPU, + lifetime_ARPU, + lifetime_average_daily_revenue, + lifetime_max_daily_revenue, + lifetime_min_daily_revenue, + lifetime_new_users, + lifetime_returning_users, + lifetime_first_time_purchasers, + lifetime_first_time_purchaser_conversion, + lifetime_first_time_purchasers_per_new_user, + lifetime_avg_user_conversion_rate, + lifetime_avg_session_conversion_rate, + pltv_revenue_30_days) +OPTIONS( + --expiration_timestamp=TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL {{expiration_duration_hours}} HOUR), + friendly_name="v_customer_lifetime_value_training_180_30_balanced", + description="View Customer Lifetime Value Training dataset using 180 days back to predict 30 days ahead. 
View expires after 48h and should run daily.", + labels=[("org_unit", "development")] +) AS +SELECT + processed_timestamp, + data_split, + feature_date, + user_pseudo_id, + user_id, + month_of_the_year, + week_of_the_year, + day_of_the_month, + day_of_week, + device_category, + device_mobile_brand_name, + device_mobile_model_name, + device_os, + device_os_version, + device_language, + device_web_browser, + device_web_browser_version, + geo_sub_continent, + geo_country, + geo_region, + geo_city, + geo_metro, + last_traffic_source_medium, + last_traffic_source_name, + last_traffic_source_source, + first_traffic_source_medium, + first_traffic_source_name, + first_traffic_source_source, + has_signed_in_with_user_id, + active_users_past_1_30_day, + active_users_past_30_60_day, + active_users_past_60_90_day, + active_users_past_90_120_day, + active_users_past_120_150_day, + active_users_past_150_180_day, + purchases_past_1_30_day, + purchases_past_30_60_day, + purchases_past_60_90_day, + purchases_past_90_120_day, + purchases_past_120_150_day, + purchases_past_150_180_day, + visits_past_1_30_day, + visits_past_30_60_day, + visits_past_60_90_day, + visits_past_90_120_day, + visits_past_120_150_day, + visits_past_150_180_day, + view_items_past_1_30_day, + view_items_past_30_60_day, + view_items_past_60_90_day, + view_items_past_90_120_day, + view_items_past_120_150_day, + view_items_past_150_180_day, + add_to_carts_past_1_30_day, + add_to_carts_past_30_60_day, + add_to_carts_past_60_90_day, + add_to_carts_past_90_120_day, + add_to_carts_past_120_150_day, + add_to_carts_past_150_180_day, + checkouts_past_1_30_day, + checkouts_past_30_60_day, + checkouts_past_60_90_day, + checkouts_past_90_120_day, + checkouts_past_120_150_day, + checkouts_past_150_180_day, + ltv_revenue_past_1_30_day, + ltv_revenue_past_30_90_day, + ltv_revenue_past_90_180_day, + lifetime_purchasers_users, + lifetime_average_daily_purchasers, + lifetime_active_users, + lifetime_DAU, + lifetime_MAU, + 
lifetime_WAU, + lifetime_dau_per_mau, + lifetime_dau_per_wau, + lifetime_wau_per_mau, + lifetime_users_engagement_duration_seconds, + lifetime_average_engagement_time, + lifetime_average_engagement_time_per_session, + lifetime_average_sessions_per_user, + lifetime_ARPPU, + lifetime_ARPU, + lifetime_average_daily_revenue, + lifetime_max_daily_revenue, + lifetime_min_daily_revenue, + lifetime_new_users, + lifetime_returning_users, + lifetime_first_time_purchasers, + lifetime_first_time_purchaser_conversion, + lifetime_first_time_purchasers_per_new_user, + lifetime_avg_user_conversion_rate, + lifetime_avg_session_conversion_rate, + pltv_revenue_30_days +FROM +( +SELECT +* EXCEPT(rn) FROM ( +SELECT + *, + ROW_NUMBER() OVER (PARTITION BY bucket ORDER BY RAND()) AS rn +FROM ( + SELECT + *, + CASE + WHEN pltv_revenue_30_days < 50 THEN "bucket1" + WHEN pltv_revenue_30_days BETWEEN 50 AND 350 THEN "bucket2" + WHEN pltv_revenue_30_days > 350 THEN "bucket3" END as bucket + FROM + `{{project_id}}.{{dataset}}.v_customer_lifetime_value_training_180_30`) +) +WHERE + rn <= 10000) +; + + + DROP TABLE training_preparation; DROP TABLE DataForTargetTable; diff --git a/sql/procedure/purchase_propensity_training_preparation.sqlx b/sql/procedure/purchase_propensity_training_preparation.sqlx index 8c5d9e47..fe924215 100644 --- a/sql/procedure/purchase_propensity_training_preparation.sqlx +++ b/sql/procedure/purchase_propensity_training_preparation.sqlx @@ -1724,5 +1724,312 @@ SELECT DISTINCT MAX(will_purchase) OVER(PARTITION BY user_pseudo_id, feature_date) as will_purchase FROM `{{project_id}}.{{dataset}}.purchase_propensity_training_30_15`; + CREATE OR REPLACE VIEW `{{project_id}}.{{dataset}}.v_purchase_propensity_training_30_15_balanced` +(processed_timestamp, + data_split, + feature_date, + user_pseudo_id, + user_id, + month_of_the_year, + week_of_the_year, + day_of_the_month, + day_of_week, + user_ltv_revenue, + device_category, + device_mobile_brand_name, + device_mobile_model_name, 
+ device_os, + device_os_version, + device_language, + device_web_browser, + device_web_browser_version, + geo_sub_continent, + geo_country, + geo_region, + geo_city, + geo_metro, + last_traffic_source_medium, + last_traffic_source_name, + last_traffic_source_source, + first_traffic_source_medium, + first_traffic_source_name, + first_traffic_source_source, + has_signed_in_with_user_id, + engagement_rate, + engaged_sessions_per_user, + session_conversion_rate, + bounces, + bounce_rate_per_user, + sessions_per_user, + avg_views_per_session, + sum_engagement_time_seconds, + avg_engagement_time_seconds, + new_visits, + returning_visits, + add_to_carts, + cart_to_view_rate, + checkouts, + ecommerce_purchases, + ecommerce_quantity, + ecommerce_revenue, + item_revenue, + item_quantity, + item_view_events, + items_clicked_in_promotion, + items_clicked_in_list, + items_checked_out, + items_added_to_cart, + item_list_view_events, + purchase_revenue, + purchase_to_view_rate, + transactions_per_purchaser, + user_conversion_rate, + how_many_purchased_before, + has_abandoned_cart, + active_users_past_1_day, + active_users_past_2_day, + active_users_past_3_day, + active_users_past_4_day, + active_users_past_5_day, + active_users_past_6_day, + active_users_past_7_day, + active_users_past_8_14_day, + active_users_past_15_30_day, + purchases_past_1_day, + purchases_past_2_day, + purchases_past_3_day, + purchases_past_4_day, + purchases_past_5_day, + purchases_past_6_day, + purchases_past_7_day, + purchases_past_8_14_day, + purchases_past_15_30_day, + visits_past_1_day, + visits_past_2_day, + visits_past_3_day, + visits_past_4_day, + visits_past_5_day, + visits_past_6_day, + visits_past_7_day, + visits_past_8_14_day, + visits_past_15_30_day, + view_items_past_1_day, + view_items_past_2_day, + view_items_past_3_day, + view_items_past_4_day, + view_items_past_5_day, + view_items_past_6_day, + view_items_past_7_day, + view_items_past_8_14_day, + view_items_past_15_30_day, + 
add_to_carts_past_1_day, + add_to_carts_past_2_day, + add_to_carts_past_3_day, + add_to_carts_past_4_day, + add_to_carts_past_5_day, + add_to_carts_past_6_day, + add_to_carts_past_7_day, + add_to_carts_past_8_14_day, + add_to_carts_past_15_30_day, + checkouts_past_1_day, + checkouts_past_2_day, + checkouts_past_3_day, + checkouts_past_4_day, + checkouts_past_5_day, + checkouts_past_6_day, + checkouts_past_7_day, + checkouts_past_8_14_day, + checkouts_past_15_30_day, + purchasers_users, + average_daily_purchasers, + active_users, + DAU, + MAU, + WAU, + dau_per_mau, + dau_per_wau, + wau_per_mau, + users_engagement_duration_seconds, + average_engagement_time, + average_engagement_time_per_session, + average_sessions_per_user, + ARPPU, + ARPU, + average_daily_revenue, + max_daily_revenue, + min_daily_revenue, + new_users, + returning_users, + first_time_purchasers, + first_time_purchaser_conversion, + first_time_purchasers_per_new_user, + avg_user_conversion_rate, + avg_session_conversion_rate, + will_purchase) +OPTIONS( + --expiration_timestamp=TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL 48 HOUR), + friendly_name="v_purchase_propensity_training_30_15_balanced", + description="View Purchase Propensity Training dataset using 30 days back to predict 15 days ahead. 
View expires after 48h and should run daily.", + labels=[("org_unit", "development")] +) AS + SELECT + processed_timestamp, + data_split, + feature_date, + user_pseudo_id, + user_id, + month_of_the_year, + week_of_the_year, + day_of_the_month, + day_of_week, + user_ltv_revenue, + device_category, + device_mobile_brand_name, + device_mobile_model_name, + device_os, + device_os_version, + device_language, + device_web_browser, + device_web_browser_version, + geo_sub_continent, + geo_country, + geo_region, + geo_city, + geo_metro, + last_traffic_source_medium, + last_traffic_source_name, + last_traffic_source_source, + first_traffic_source_medium, + first_traffic_source_name, + first_traffic_source_source, + has_signed_in_with_user_id, + engagement_rate, + engaged_sessions_per_user, + session_conversion_rate, + bounces, + bounce_rate_per_user, + sessions_per_user, + avg_views_per_session, + sum_engagement_time_seconds, + avg_engagement_time_seconds, + new_visits, + returning_visits, + add_to_carts, + cart_to_view_rate, + checkouts, + ecommerce_purchases, + ecommerce_quantity, + ecommerce_revenue, + item_revenue, + item_quantity, + item_view_events, + items_clicked_in_promotion, + items_clicked_in_list, + items_checked_out, + items_added_to_cart, + item_list_view_events, + purchase_revenue, + purchase_to_view_rate, + transactions_per_purchaser, + user_conversion_rate, + how_many_purchased_before, + has_abandoned_cart, + active_users_past_1_day, + active_users_past_2_day, + active_users_past_3_day, + active_users_past_4_day, + active_users_past_5_day, + active_users_past_6_day, + active_users_past_7_day, + active_users_past_8_14_day, + active_users_past_15_30_day, + purchases_past_1_day, + purchases_past_2_day, + purchases_past_3_day, + purchases_past_4_day, + purchases_past_5_day, + purchases_past_6_day, + purchases_past_7_day, + purchases_past_8_14_day, + purchases_past_15_30_day, + visits_past_1_day, + visits_past_2_day, + visits_past_3_day, + visits_past_4_day, + 
visits_past_5_day, + visits_past_6_day, + visits_past_7_day, + visits_past_8_14_day, + visits_past_15_30_day, + view_items_past_1_day, + view_items_past_2_day, + view_items_past_3_day, + view_items_past_4_day, + view_items_past_5_day, + view_items_past_6_day, + view_items_past_7_day, + view_items_past_8_14_day, + view_items_past_15_30_day, + add_to_carts_past_1_day, + add_to_carts_past_2_day, + add_to_carts_past_3_day, + add_to_carts_past_4_day, + add_to_carts_past_5_day, + add_to_carts_past_6_day, + add_to_carts_past_7_day, + add_to_carts_past_8_14_day, + add_to_carts_past_15_30_day, + checkouts_past_1_day, + checkouts_past_2_day, + checkouts_past_3_day, + checkouts_past_4_day, + checkouts_past_5_day, + checkouts_past_6_day, + checkouts_past_7_day, + checkouts_past_8_14_day, + checkouts_past_15_30_day, + purchasers_users, + average_daily_purchasers, + active_users, + DAU, + MAU, + WAU, + dau_per_mau, + dau_per_wau, + wau_per_mau, + users_engagement_duration_seconds, + average_engagement_time, + average_engagement_time_per_session, + average_sessions_per_user, + ARPPU, + ARPU, + average_daily_revenue, + max_daily_revenue, + min_daily_revenue, + new_users, + returning_users, + first_time_purchasers, + first_time_purchaser_conversion, + first_time_purchasers_per_new_user, + avg_user_conversion_rate, + avg_session_conversion_rate, + will_purchase + FROM ( + SELECT + DISTINCT *, + ROW_NUMBER() OVER (PARTITION BY will_purchase ORDER BY RAND()) AS rn + FROM + `{{project_id}}.{{dataset}}.v_purchase_propensity_training_30_15` ) + WHERE + rn <= ( + SELECT + COUNT(will_purchase) + FROM + `{{project_id}}.{{dataset}}.v_purchase_propensity_training_30_15` + WHERE + will_purchase = 1) +; + + DROP TABLE training_preparation; DROP TABLE DataForTargetTable; \ No newline at end of file diff --git a/sql/query/invoke_backfill_user_lifetime_dimensions.sqlx b/sql/query/invoke_backfill_user_lifetime_dimensions.sqlx index 987e2b85..e8508f3c 100644 --- 
a/sql/query/invoke_backfill_user_lifetime_dimensions.sqlx +++ b/sql/query/invoke_backfill_user_lifetime_dimensions.sqlx @@ -34,11 +34,6 @@ CREATE OR REPLACE TEMP TABLE user_dimensions_event_session_scoped as ( LAST_VALUE(format_date('%U',event_date)) OVER(PARTITION BY user_pseudo_id, DI.input_date ORDER BY event_timestamp DESC) AS week_of_the_year, LAST_VALUE(format_date('%d',event_date)) OVER(PARTITION BY user_pseudo_id, DI.input_date ORDER BY event_timestamp DESC) AS day_of_the_month, LAST_VALUE(format_date('%w',event_date)) OVER(PARTITION BY user_pseudo_id, DI.input_date ORDER BY event_timestamp DESC) AS day_of_week, - --LAST_VALUE(format("%02d",extract(hour from event_timestamp))) OVER(PARTITION BY user_pseudo_id, DI.input_date ORDER BY event_timestamp DESC) as hour_of_day, - --LAST_VALUE(DATE_DIFF(event_date, DI.input_date, DAY)) OVER(PARTITION BY user_pseudo_id, DI.input_date ORDER BY event_timestamp DESC) AS nth_day, - --LAST_VALUE(TIMESTAMP_DIFF(event_timestamp, TIMESTAMP(DI.input_date), HOUR)) OVER(PARTITION BY user_pseudo_id, DI.input_date ORDER BY event_timestamp DESC) AS nth_hour, - --LAST_VALUE(DATE_DIFF(event_date, DI.input_date, WEEK)) OVER(PARTITION BY user_pseudo_id, DI.input_date ORDER BY event_timestamp DESC) AS nth_week, - --LAST_VALUE(DATE_DIFF(event_date, DI.input_date, MONTH)) OVER(PARTITION BY user_pseudo_id, DI.input_date ORDER BY event_timestamp DESC) AS nth_month, LAST_VALUE(user_ltv_revenue) OVER(PARTITION BY user_pseudo_id, DI.input_date ORDER BY event_timestamp DESC) AS user_ltv_revenue, LAST_VALUE(T.traffic_source_medium) OVER(PARTITION BY user_pseudo_id, DI.input_date ORDER BY event_timestamp DESC) AS last_traffic_source_medium, LAST_VALUE(T.traffic_source_name) OVER(PARTITION BY user_pseudo_id, DI.input_date ORDER BY event_timestamp DESC) AS last_traffic_source_name, From 8ddcc636338bc7bda3d69cea1b77c3037783fe54 Mon Sep 17 00:00:00 2001 From: Charlie Wang <2144018+kingman@users.noreply.github.com> Date: Fri, 6 Oct 2023 18:26:25 
+0200 Subject: [PATCH 2/3] add triggers for re-generate sql files and re-install poetry (#48) * remove unnecessary dependency declarations * automatic terraform validation * add triggers on file content * ignore error when files does not exist * add trigger on pyproject.toml configuration for poetry install --- infrastructure/terraform/main.tf | 51 +++++++++++++++++++++++--------- 1 file changed, 37 insertions(+), 14 deletions(-) diff --git a/infrastructure/terraform/main.tf b/infrastructure/terraform/main.tf index 6f18b34f..ba156ce8 100644 --- a/infrastructure/terraform/main.tf +++ b/infrastructure/terraform/main.tf @@ -57,6 +57,17 @@ locals { config_file_name = "config" poetry_run_alias = "${var.poetry_cmd} run" mds_dataset_suffix = var.create_prod_environment ? "prod" : var.create_dev_environment ? "dev" : "staging" + + project_toml_file_path = "${local.source_root_dir}/pyproject.toml" + project_toml_content_hash = filesha512(local.project_toml_file_path) + + generated_sql_queries_directory_path = "${local.source_root_dir}/sql/query" + generated_sql_queries_fileset = [for f in fileset(local.generated_sql_queries_directory_path, "*.sql") : "${local.generated_sql_queries_directory_path}/${f}"] + generated_sql_queries_content_hash = sha512(join("", [for f in local.generated_sql_queries_fileset : fileexists(f) ? filesha512(f) : sha512("file-not-found")])) + + generated_sql_procedures_directory_path = "${local.source_root_dir}/sql/procedure" + generated_sql_procedures_fileset = [for f in fileset(local.generated_sql_procedures_directory_path, "*.sql") : "${local.generated_sql_procedures_directory_path}/${f}"] + generated_sql_procedures_content_hash = sha512(join("", [for f in local.generated_sql_procedures_fileset : fileexists(f) ? 
filesha512(f) : sha512("file-not-found")])) } resource "local_file" "feature_store_configuration" { @@ -75,8 +86,14 @@ resource "local_file" "feature_store_configuration" { } resource "null_resource" "poetry_install" { + triggers = { + create_command = "${var.poetry_cmd} install" + source_contents_hash = local.project_toml_content_hash + } + provisioner "local-exec" { - command = "${var.poetry_cmd} install" + when = create + command = self.triggers.create_command working_dir = local.source_root_dir } } @@ -84,30 +101,36 @@ resource "null_resource" "poetry_install" { resource "null_resource" "generate_sql_queries" { triggers = { + create_command = <<-EOT + ${local.poetry_run_alias} inv apply-env-variables-queries --env-name=${local.config_file_name} + ${local.poetry_run_alias} inv apply-env-variables-procedures --env-name=${local.config_file_name} + EOT + + destroy_command = <<-EOT + rm -f sql/query/*.sql + rm -f sql/procedure/*.sql + EOT + working_dir = local.source_root_dir + + poetry_installed = null_resource.poetry_install.id + + source_contents_hash = local_file.feature_store_configuration.content_sha512 + destination_queries_hash = local.generated_sql_queries_content_hash + destination_procedures_hash = local.generated_sql_procedures_content_hash } provisioner "local-exec" { - command = <<-EOT - ${local.poetry_run_alias} inv apply-env-variables-queries --env-name=${local.config_file_name} - ${local.poetry_run_alias} inv apply-env-variables-procedures --env-name=${local.config_file_name} - EOT + when = create + command = self.triggers.create_command working_dir = self.triggers.working_dir } provisioner "local-exec" { when = destroy - command = <<-EOT - rm sql/query/*.sql - rm sql/procedure/*.sql - EOT + command = self.triggers.destroy_command working_dir = self.triggers.working_dir } - - depends_on = [ - local_file.feature_store_configuration, - null_resource.poetry_install - ] } module "feature_store" { From a5060128051e79bf0d1964fb9e3a1e33d2fe63e6 Mon Sep 17 
00:00:00 2001 From: Charlie Wang <2144018+kingman@users.noreply.github.com> Date: Tue, 10 Oct 2023 19:10:40 +0200 Subject: [PATCH 3/3] Cf gen two (#50) * remove unnecessary dependency declarations * automatic terraform validation * making activation cloud function internal and refactor to Cloud Functions 2nd Gen * preventing unnecessary re-deployment of the function --- .../terraform/modules/activation/main.tf | 97 +++++++++++++------ 1 file changed, 65 insertions(+), 32 deletions(-) diff --git a/infrastructure/terraform/modules/activation/main.tf b/infrastructure/terraform/modules/activation/main.tf index 753299e0..d303f73b 100644 --- a/infrastructure/terraform/modules/activation/main.tf +++ b/infrastructure/terraform/modules/activation/main.tf @@ -38,6 +38,10 @@ locals { } +data "google_project" "activation_project" { + project_id = var.project_id +} + module "project_services" { source = "terraform-google-modules/project-factory/google//modules/project_services" version = "14.1.0" @@ -61,6 +65,8 @@ module "project_services" { "storage.googleapis.com", "datapipelines.googleapis.com", "analyticsadmin.googleapis.com", + "eventarc.googleapis.com", + "run.googleapis.com", ] } @@ -343,46 +349,73 @@ resource "google_storage_bucket_object" "activation_trigger_archive" { bucket = module.function_bucket.name } -resource "google_cloudfunctions_function" "activation_trigger_cf" { - name = "activation-trigger" - project = var.project_id - region = var.trigger_function_location - runtime = "python311" +resource "google_cloudfunctions2_function" "activation_trigger_cf" { + name = "activation-trigger" + project = var.project_id + location = var.trigger_function_location + + build_config { + runtime = "python311" + source { + storage_source { + bucket = module.function_bucket.name + object = google_storage_bucket_object.activation_trigger_archive.name + } + } + entry_point = "subscribe" + } - available_memory_mb = 256 - max_instances = 3 - source_archive_bucket = 
module.function_bucket.name - source_archive_object = google_storage_bucket_object.activation_trigger_archive.name event_trigger { - event_type = "google.pubsub.topic.publish" - resource = google_pubsub_topic.activation_trigger.name - } - timeout = 60 - entry_point = "subscribe" - service_account_email = module.trigger_function_account.email - - environment_variables = { - ACTIVATION_PROJECT = var.project_id - ACTIVATION_REGION = var.location - ACTIVATION_TYPE_CONFIGURATION = "gs://${module.pipeline_bucket.name}/${google_storage_bucket_object.activation_type_configuration_file.output_name}" - TEMPLATE_FILE_GCS_LOCATION = "gs://${module.pipeline_bucket.name}/dataflow/templates/${local.activation_container_image_id}.json" - PIPELINE_TEMP_LOCATION = "gs://${module.pipeline_bucket.name}/tmp/" - LOG_DATA_SET = module.bigquery.bigquery_dataset.dataset_id - PIPELINE_WORKER_EMAIL = module.pipeline_service_account.email + event_type = "google.cloud.pubsub.topic.v1.messagePublished" + pubsub_topic = google_pubsub_topic.activation_trigger.id + retry_policy = "RETRY_POLICY_DO_NOT_RETRY" + trigger_region = var.trigger_function_location } - secret_environment_variables { - key = "GA4_MEASUREMENT_ID" - secret = split("/", module.secret_manager.secret_names[0])[3] - version = split("/", module.secret_manager.secret_versions[0])[5] + + service_config { + available_memory = "256M" + max_instance_count = 3 + timeout_seconds = 60 + ingress_settings = "ALLOW_INTERNAL_ONLY" + service_account_email = module.trigger_function_account.email + environment_variables = { + ACTIVATION_PROJECT = var.project_id + ACTIVATION_REGION = var.location + ACTIVATION_TYPE_CONFIGURATION = "gs://${module.pipeline_bucket.name}/${google_storage_bucket_object.activation_type_configuration_file.output_name}" + TEMPLATE_FILE_GCS_LOCATION = "gs://${module.pipeline_bucket.name}/dataflow/templates/${local.activation_container_image_id}.json" + PIPELINE_TEMP_LOCATION = "gs://${module.pipeline_bucket.name}/tmp/" + 
LOG_DATA_SET = module.bigquery.bigquery_dataset.dataset_id + PIPELINE_WORKER_EMAIL = module.pipeline_service_account.email + } + secret_environment_variables { + project_id = var.project_id + key = "GA4_MEASUREMENT_ID" + secret = split("/", module.secret_manager.secret_names[0])[3] + version = split("/", module.secret_manager.secret_versions[0])[5] + } + secret_environment_variables { + project_id = var.project_id + key = "GA4_MEASUREMENT_SECRET" + secret = split("/", module.secret_manager.secret_names[1])[3] + version = split("/", module.secret_manager.secret_versions[1])[5] + } } - secret_environment_variables { - key = "GA4_MEASUREMENT_SECRET" - secret = split("/", module.secret_manager.secret_names[1])[3] - version = split("/", module.secret_manager.secret_versions[1])[5] + lifecycle { + ignore_changes = [build_config[0].source[0].storage_source[0].generation] } depends_on = [ module.project_services ] } + +module "add_invoker_binding" { + source = "terraform-google-modules/gcloud/google" + version = "3.1.2" + + platform = "linux" + + create_cmd_body = "functions add-invoker-policy-binding ${google_cloudfunctions2_function.activation_trigger_cf.name} --project=${google_cloudfunctions2_function.activation_trigger_cf.project} --region=${google_cloudfunctions2_function.activation_trigger_cf.location} --member=\"serviceAccount:${data.google_project.activation_project.number}-compute@developer.gserviceaccount.com\"" + destroy_cmd_body = "functions remove-invoker-policy-binding ${google_cloudfunctions2_function.activation_trigger_cf.name} --project=${google_cloudfunctions2_function.activation_trigger_cf.project} --region=${google_cloudfunctions2_function.activation_trigger_cf.location} --member=\"serviceAccount:${data.google_project.activation_project.number}-compute@developer.gserviceaccount.com\"" +}