From 0fd9a6e34c27f773c549942b03af01ab2af42299 Mon Sep 17 00:00:00 2001 From: divyagovindaiah Date: Tue, 5 Nov 2024 21:04:24 +0530 Subject: [PATCH 1/8] #OBS-I298: fix:hudi and api service container variable fix --- helmcharts/global-resource-values.yaml | 4 ++-- helmcharts/global-values.yaml | 9 ++++----- helmcharts/services/command-api/values.yaml | 5 +++++ helmcharts/services/config-api/values.yaml | 2 +- helmcharts/services/dataset-api/values.yaml | 2 +- 5 files changed, 13 insertions(+), 9 deletions(-) diff --git a/helmcharts/global-resource-values.yaml b/helmcharts/global-resource-values.yaml index 3d6436ad..60865826 100644 --- a/helmcharts/global-resource-values.yaml +++ b/helmcharts/global-resource-values.yaml @@ -578,10 +578,10 @@ dataset-api: resources: limits: cpu: 0.5 - memory: 1024Mi + memory: 2048Mi requests: cpu: 0.5 - memory: 512Mi + memory: 1024Mi config-api: resources: diff --git a/helmcharts/global-values.yaml b/helmcharts/global-values.yaml index cc31415b..8324963a 100644 --- a/helmcharts/global-values.yaml +++ b/helmcharts/global-values.yaml @@ -71,7 +71,7 @@ defaults: druid_indexer_host: &druid-indexer-host "druid-raw-indexers.druid-raw.svc.cluster.local" redirection_auth_path: &redirection_auth_path "" - domain: &domain "18.119.23.219.sslip.io" + domain: &domain "3.147.16.119.sslip.io" ssl_enabled: &ssl_enabled true @@ -288,9 +288,6 @@ kube-prometheus-stack: from_name: "Grafana" - - - kubernetes-reflector: namespace: *kubernetes-reflector-namespace @@ -455,8 +452,10 @@ volume-autoscaler: scale_above_percent: "80" # How much to scale disks up by, as a percentage of their current size scale_up_percent: "20" + # A minimum amount of bytes to scale up by (typically because providers like AWS only support 10GB increments in disk size) + scale_up_min_increment: "10737418240" # A maximum amount of bytes to scale up by (helps prevent large disks from growing too fast/exponentially). Set to 16TB by default, which basically means this is disabled - scale_up_max_increment: "16000000000000" + scale_up_max_increment: "" # The maximum size of disks to scale up to. By default on AWS using EBS volumes this is set to 16TB as that is the EBS Max disk size.
scale_up_max_size: "" diff --git a/helmcharts/services/command-api/values.yaml b/helmcharts/services/command-api/values.yaml index faa799aa..c4e1c9a7 100644 --- a/helmcharts/services/command-api/values.yaml +++ b/helmcharts/services/command-api/values.yaml @@ -151,6 +151,11 @@ serviceConfig: release_name: cache-indexer job_manager_url: "cache-indexer.{{.Values.global.flink.namespace}}.svc.cluster.local:8081" + - name: "Flink-Hudi-Connector" + release_name: lakehouse-connector + job_manager_url: "lakehouse-connector-jobmanager.{{.Values.global.flink.namespace}}.svc.cluster.local:8081" + + commands: PUBLISH_DATASET: workflow: diff --git a/helmcharts/services/config-api/values.yaml b/helmcharts/services/config-api/values.yaml index 07a27577..acf1ab97 100644 --- a/helmcharts/services/config-api/values.yaml +++ b/helmcharts/services/config-api/values.yaml @@ -160,7 +160,7 @@ env: dedup_redis_port: "{{ .Values.global.redis_dedup.port }}" cloud_storage_provider: "{{ .Values.global.cloud_storage_provider }}" cloud_storage_region: "{{ .Values.global.cloud_storage_region }}" - container: "{{ .Values.global.config_api_container }}" + container: "{{ .Values.global.config_api_cloud_bucket }}" container_prefix: "connector-registry" storage_url_expiry: "3600" exhaust_query_range: "31" # In Days diff --git a/helmcharts/services/dataset-api/values.yaml b/helmcharts/services/dataset-api/values.yaml index 94fdb802..fa96ad3b 100644 --- a/helmcharts/services/dataset-api/values.yaml +++ b/helmcharts/services/dataset-api/values.yaml @@ -160,7 +160,7 @@ env: dedup_redis_port: "{{ .Values.global.redis_dedup.port }}" cloud_storage_provider: "{{ .Values.global.cloud_storage_provider }}" cloud_storage_region: "{{ .Values.global.cloud_storage_region }}" - container: "{{ .Values.global.dataset_api_container }}" + container: "{{ .Values.global.dataset_api_cloud_bucket }}" container_prefix: "connector-registry" storage_url_expiry: "3600" exhaust_query_range: "31" # In Days From 2d23c82620629bc39d19eead7a7c6e1b4d94b56a Mon Sep 17 00:00:00 2001 From: divyagovindaiah Date: Wed, 6 Nov 2024 12:14:36 +0530 Subject: [PATCH 2/8] #OBS-I298: fix:grafana invalid url fix and grafana s3 backup dashboard fix --- helmcharts/obsrv/values.yaml | 4 +++ .../dashboards/s3-backup-metrics.json | 32 ++++++------------- helmcharts/services/web-console/values.yaml | 4 +-- 3 files changed, 16 insertions(+), 24 deletions(-) diff --git a/helmcharts/obsrv/values.yaml b/helmcharts/obsrv/values.yaml index 7a990e77..1e71b551 100644 --- a/helmcharts/obsrv/values.yaml +++ b/helmcharts/obsrv/values.yaml @@ -603,6 +603,10 @@ kube-prometheus-stack: - url: http://s3-exporter.s3-exporter.svc.cluster.local:9340/discovery - job_name: s3-backups metrics_path: /probe + scrape_interval: 10m + scrape_timeout: 3m + http_sd_configs: + - url: http://s3-exporter.s3-exporter.svc.cluster.local:9340/discovery static_configs: - targets: - bucket={{ .Values.global.postgresql_backup_cloud_bucket }};prefix=postgresql; diff --git a/helmcharts/services/grafana-configs/dashboards/s3-backup-metrics.json b/helmcharts/services/grafana-configs/dashboards/s3-backup-metrics.json index 02b7c604..41d17a8b 100644 --- a/helmcharts/services/grafana-configs/dashboards/s3-backup-metrics.json +++ b/helmcharts/services/grafana-configs/dashboards/s3-backup-metrics.json @@ -355,19 +355,6 @@ "title": "Backup Folder Size", "type": "stat" }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 10 - }, - "id": 15, - "panels": [], - "title": "Logs", - "type": "row" - }, { 
"datasource": { "type": "loki", @@ -377,18 +364,18 @@ "h": 6, "w": 24, "x": 0, - "y": 11 + "y": 15 }, "id": 22, "options": { - "dedupStrategy": "exact", - "enableLogDetails": false, - "prettifyLogMessage": true, - "showCommonLabels": false, - "showLabels": false, "showTime": true, - "sortOrder": "Descending", - "wrapLogMessage": true + "showLabels": false, + "showCommonLabels": false, + "wrapLogMessage": true, + "prettifyLogMessage": true, + "enableLogDetails": false, + "dedupStrategy": "exact", + "sortOrder": "Descending" }, "targets": [ { @@ -404,7 +391,8 @@ ], "title": "PostgreSQL Backup Logs", "type": "logs" - }, + } + , { "datasource": { "type": "loki", diff --git a/helmcharts/services/web-console/values.yaml b/helmcharts/services/web-console/values.yaml index 1ef570fa..ec84935c 100644 --- a/helmcharts/services/web-console/values.yaml +++ b/helmcharts/services/web-console/values.yaml @@ -172,9 +172,9 @@ env: SYSTEM_API_URL: "http://command-api.command-api:8000" ALERT_MANAGER_URL: "http://alertmanager-operated.monitoring:9093" GRAFANA_ADMIN_URL: "http://grafana.monitoring.svc.cluster.local" - GRAFANA_URL: "http://grafana.monitoring.svc.cluster.local" + GRAFANA_URL: "http://{{.Values.global.domain}}/grafana" SUPERSET_URL: "http://{{.Values.global.domain}}" - REACT_APP_GRAFANA_URL: "{{.Values.global.domain}}/grafana" + REACT_APP_GRAFANA_URL: "http://{{.Values.global.domain}}/grafana" REACT_APP_SUPERSET_URL: "{{.Values.global.domain}}" SESSION_SECRET: "backend-session" POSTGRES_CONNECTION_STRING: "postgres://{{.Values.global.postgresql.obsrv.user}}:{{.Values.global.postgresql.obsrv.password}}@{{.Values.global.postgresql.host}}:{{.Values.global.postgresql.port}}/{{.Values.global.postgresql.obsrv.name}}" From 52f183bff1b878e0553e2600c8fbc79b17798a12 Mon Sep 17 00:00:00 2001 From: divyagovindaiah Date: Thu, 7 Nov 2024 17:33:35 +0530 Subject: [PATCH 3/8] #OBS-I298: fix:db backup fix --- helmcharts/obsrv/values.yaml | 14 ++++++++++---- .../system-rules-ingestor/configs/rules.yaml | 8 ++++---- helmcharts/services/web-console/values.yaml | 2 +- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/helmcharts/obsrv/values.yaml b/helmcharts/obsrv/values.yaml index 1e71b551..4e333b03 100644 --- a/helmcharts/obsrv/values.yaml +++ b/helmcharts/obsrv/values.yaml @@ -75,7 +75,7 @@ web-console: AUTH_AD_BASE_DN: "dc=example,dc=com" AUTH_AD_USER_NAME: "admin" AUTH_AD_PASSWORD: "password1" - REACT_APP_AUTHENTICATION_ALLOWED_TYPES: "obsrv,ad,google,keycloak" + AUTHENTICATION_ALLOWED_TYPES: "obsrv,ad" AUTH_OIDC_ISSUER: "http://localhost:8080/auth/realms/TestOIDCrealm" AUTH_OIDC_AUTHRIZATION_URL: "http://localhost:8080/auth/realms/TestOIDCrealm/protocol/openid-connect/auth" AUTH_OIDC_TOKEN_URL: "http://localhost:8080/auth/realms/TestOIDCrealm/protocol/openid-connect/token" @@ -610,18 +610,24 @@ kube-prometheus-stack: static_configs: - targets: - bucket={{ .Values.global.postgresql_backup_cloud_bucket }};prefix=postgresql; - - bucket={{ .Values.global.redis_backup_cloud_bucket }};prefix=redis; + - bucket={{ .Values.global.redis_backup_cloud_bucket }};prefix=dedup-redis; + - bucket={{ .Values.global.redis_backup_cloud_bucket }};prefix=denorm-redis; relabel_configs: - source_labels: [__address__] regex: "^bucket=(.*);prefix=(.*);$" - replacement: "$${1}" + replacement: "$1" target_label: "__param_bucket" - source_labels: [__address__] regex: "^bucket=(.*);prefix=(.*);$" - replacement: "$${2}" + replacement: "$2" target_label: "__param_prefix" - target_label: __address__ replacement: 
s3-exporter.s3-exporter.svc.cluster.local:9340 + - target_label: bucket + replacement: "$1" + - target_label: prefix + replacement: "$2" + kube-state-metrics: namespaceOverride: "monitoring" diff --git a/helmcharts/services/system-rules-ingestor/configs/rules.yaml b/helmcharts/services/system-rules-ingestor/configs/rules.yaml index 6ae74256..29c93f4c 100644 --- a/helmcharts/services/system-rules-ingestor/configs/rules.yaml +++ b/helmcharts/services/system-rules-ingestor/configs/rules.yaml @@ -575,7 +575,7 @@ severity: critical - name: "Critical Alert: PostgreSQL Database Backup Failure" - query: (time() - s3_last_modified_object_date{job="s3-db-backups", prefix=~"postgresql"}) + query: (time() - s3_last_modified_object_date{job="s3-backups", prefix=~"postgresql"}) operator: gt threshold: [86400] category: DB Backup @@ -587,7 +587,7 @@ severity: critical - name: "Critical Alert: Dedup Redis Database Backup Failure" - query: (time() - s3_last_modified_object_date{job="s3-db-backups", prefix=~"dedup-redis"}) + query: (time() - s3_last_modified_object_date{job="s3-backups", prefix=~"dedup-redis"}) operator: gt threshold: [86400] category: DB Backup @@ -599,7 +599,7 @@ severity: critical - name: "Critical Alert: Denorm Redis Database Backup Failure" - query: (time() - s3_last_modified_object_date{job="s3-db-backups", prefix=~"denorm-redis"}) + query: (time() - s3_last_modified_object_date{job="s3-backups", prefix=~"denorm-redis"}) operator: gt threshold: [86400] category: DB Backup @@ -611,7 +611,7 @@ severity: critical - name: "Critical Alert: Velero (Kubernetes Cluster) Backup Failure" - query: (time() - s3_last_modified_object_date{job="s3-common-backups", bucket=~"velero.*"}) + query: (time() - s3_last_modified_object_date{job="s3-backups", bucket=~"velero.*"}) operator: gt threshold: [86400] category: DB Backup diff --git a/helmcharts/services/web-console/values.yaml b/helmcharts/services/web-console/values.yaml index ec84935c..54e1351a 100644 --- a/helmcharts/services/web-console/values.yaml +++ b/helmcharts/services/web-console/values.yaml @@ -194,7 +194,7 @@ env: AUTH_AD_BASE_DN: "dc=example,dc=com" AUTH_AD_USER_NAME: "admin" AUTH_AD_PASSWORD: "password1" - REACT_APP_AUTHENTICATION_ALLOWED_TYPES: "obsrv,ad,google,keycloak" + AUTHENTICATION_ALLOWED_TYPES: "obsrv,ad" AUTH_OIDC_ISSUER: "http://localhost:8080/auth/realms/TestOIDCrealm" AUTH_OIDC_AUTHRIZATION_URL: "http://localhost:8080/auth/realms/TestOIDCrealm/protocol/openid-connect/auth" AUTH_OIDC_TOKEN_URL: "http://localhost:8080/auth/realms/TestOIDCrealm/protocol/openid-connect/token" From c642393d67d926079ade75a41fc86a060ce39add Mon Sep 17 00:00:00 2001 From: divyagovindaiah Date: Fri, 8 Nov 2024 13:26:07 +0530 Subject: [PATCH 4/8] #OBS-I298: fix: updated the volume autoscaler config and disabled the masterdata-ingest-backup --- .../bootstrapper/templates/docker-secret.yaml | 2 +- helmcharts/bootstrapper/values.yaml | 4 +- helmcharts/global-values.yaml | 8 +- helmcharts/images.yaml | 3 +- helmcharts/kitchen/global-values.yaml | 504 ------------------ helmcharts/services/secor/values.yaml | 1 + 6 files changed, 11 insertions(+), 511 deletions(-) delete mode 100644 helmcharts/kitchen/global-values.yaml diff --git a/helmcharts/bootstrapper/templates/docker-secret.yaml b/helmcharts/bootstrapper/templates/docker-secret.yaml index 046ed577..679edd33 100644 --- a/helmcharts/bootstrapper/templates/docker-secret.yaml +++ b/helmcharts/bootstrapper/templates/docker-secret.yaml @@ -1,4 +1,4 @@ -{{- range .Values.private_registry_namespaces }}
+{{- range .Values.namespaces }} --- apiVersion: v1 data: diff --git a/helmcharts/bootstrapper/values.yaml b/helmcharts/bootstrapper/values.yaml index 22123e7d..9df21119 100644 --- a/helmcharts/bootstrapper/values.yaml +++ b/helmcharts/bootstrapper/values.yaml @@ -28,6 +28,6 @@ namespaces: global: image: - dockerRegistrySecretName: registry-secret-name + dockerRegistrySecretName: obsrv-docker-secret dockerConfigJson: |- - {"auths":{"http://registry.your-domain.io":{"username":"REGISTRY_USER","password":"REGISTRY_PASSWORD","email":"email@your-domain.io","auth":"UkVHSVNUUllfVVNFUjpSRUdJU1RSWV9QQVNTV09SRA=="}}} + {"auths":{"http://registry.your-domain.io":{"auth":"eW91cl9yZWdpc3RyeV91c2VybmFtZTp5b3VyX3JlZ2lzdHJ5X3Rva2Vu"}}} \ No newline at end of file diff --git a/helmcharts/global-values.yaml b/helmcharts/global-values.yaml index 8324963a..5d4d42aa 100644 --- a/helmcharts/global-values.yaml +++ b/helmcharts/global-values.yaml @@ -71,7 +71,9 @@ defaults: druid_indexer_host: &druid-indexer-host "druid-raw-indexers.druid-raw.svc.cluster.local" redirection_auth_path: &redirection_auth_path "" - domain: &domain "3.147.16.119.sslip.io" + domain: &domain ".sslip.io" # Update with the IP address of the Elastic IP + + ssl_enabled: &ssl_enabled true @@ -452,8 +454,8 @@ volume-autoscaler: scale_above_percent: "80" # How much to scale disks up by, as a percentage of their current size scale_up_percent: "20" - # A minimum amount of bytes to scale up by (typically because providers like AWS only support 10GB increments in disk size) - scale_up_min_increment: "10737418240" + # A minimum amount of bytes to scale up by (typically because providers like AWS only support 100GB increments in disk size) + scale_up_min_increment: "107374182400" # A maximum amount of bytes to scale up by (helps prevent large disks from growing too fast/exponentially). Set to 16TB by default, which basically means this is disabled scale_up_max_increment: "" # The maximum size of disks to scale up to. By default on AWS using EBS volumes this is set to 16TB as that is the EBS Max disk size.
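For a sense of how the four volume-autoscaler knobs above interact, the following Python sketch reproduces the resize arithmetic. The function name, the order of applying the percentage before clamping to the min/max increments, and the integer rounding are assumptions made for illustration; they are not taken from the volume-autoscaler's actual implementation.

GIB = 1024 ** 3

# Values mirroring the volume-autoscaler settings in this patch.
SCALE_ABOVE_PERCENT = 80            # resize once a disk is this percent full
SCALE_UP_PERCENT = 20               # grow by this percent of the current size
SCALE_UP_MIN_INCREMENT = 100 * GIB  # 107374182400 bytes, as set above
SCALE_UP_MAX_INCREMENT = None       # "" in the values file, i.e. effectively uncapped

def next_disk_size(current_bytes: int, used_bytes: int) -> int:
    # Below the usage threshold: leave the disk alone.
    if used_bytes * 100 < current_bytes * SCALE_ABOVE_PERCENT:
        return current_bytes
    # Grow by a percentage of the current size, clamped to the increments.
    increment = current_bytes * SCALE_UP_PERCENT // 100
    increment = max(increment, SCALE_UP_MIN_INCREMENT)
    if SCALE_UP_MAX_INCREMENT is not None:
        increment = min(increment, SCALE_UP_MAX_INCREMENT)
    return current_bytes + increment

# A 200GiB disk at 85% full grows by max(40GiB, 100GiB) = 100GiB -> 300GiB.
print(next_disk_size(200 * GIB, 170 * GIB) // GIB, "GiB")

One consequence of raising scale_up_min_increment to 100GiB is that any disk smaller than 500GiB now always grows by the 100GiB floor, since 20% of its size falls below the minimum increment.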
diff --git a/helmcharts/images.yaml b/helmcharts/images.yaml index de94ccf4..1390b969 100644 --- a/helmcharts/images.yaml +++ b/helmcharts/images.yaml @@ -3,7 +3,8 @@ global: registry: &global_registry "sanketikahub" dockerRegistrySecretName: &dockerSecretName "registry-secret-name" dockerConfigJson: |- - {"auths":{"http://registry.your-domain.io":{"username":"REGISTRY_USER","password":"REGISTRY_PASSWORD","email":"email@your-domain.io","auth":"UkVHSVNUUllfVVNFUjpSRUdJU1RSWV9QQVNTV09SRA=="}}} + {"auths":{"http://registry.your-domain.io":{"auth":"eW91cl9yZWdpc3RyeV91c2VybmFtZTp5b3VyX3JlZ2lzdHJ5X3Rva2Vu"}}} + imagePullSecrets: &imagePullSecrets diff --git a/helmcharts/kitchen/global-values.yaml b/helmcharts/kitchen/global-values.yaml deleted file mode 100644 index c2862507..00000000 --- a/helmcharts/kitchen/global-values.yaml +++ /dev/null @@ -1,504 +0,0 @@ -defaults: - env: &env "dev" - building-block: &building-block "obsrv" - encryption_key: &encryption-key "strong_encryption_key_to_encrypt" - namespaces: - kafka_namespace: &kafka-namespace "kafka" - postgresql_namespace: &postgresql-namespace "postgresql" - druid_namespace: &druid-namespace "druid-raw" - redis_namespace: &redis-namespace "redis" - flink_namespace: &flink_namespace "flink" - loki_namespace: &loki-namespace "loki" - monitoring_namespace: &monitoring-namespace "monitoring" - spark_namespace: &spark-namespace "spark" - superset_namespace: &superset-namespace "superset" - secor_namespace: &secor-namespace "secor" - command_api_namespace: &command-api-namespace "command-api" - dataset_api_namespace: &dataset-api-namespace "dataset-api" - config_api_namespace: &config-api-namespace "config-api" - kubernetes_reflector_namespace: &kubernetes-reflector-namespace "kubernetes-reflector" - submit_ingestion_namespace: &submit-ingestion-namespace "submit-ingestion" - kong_namespace: &kong-namespace "kong" - kong_ingress_namespace: &kong-ingress-namespace "kong-ingress" - system_rules_ingestor: &system-rules-ingestor-namespace "system-rules-ingestor" - web_console_namesapce: &web-console-namesapce "web-console" - cert_manager_namespace: &cert-manager-namespace "cert-manager" - velero_namespace: &velero-namespace "velero" - volume_autoscaler_namespace: &volume-autoscaler-namespace "volume-autoscaler" - hms_namespace: &hms_namespace "hms" - trino_namespace: &trino_namespace "trino" - nlq_api_namespace: &nlq_api_namespace "nlq" - - postgres: - pghost: &pghost "postgresql-hl.postgresql.svc.cluster.local" - obsrv_username: &psql-obsrv-user "obsrv" - obsrv_database: &psql-obsrv-db "obsrv" - obsrv_user_password: &psql-obsrv-pwd "obsrv123" - druid_database: &psql-druid-db "druid_raw" - druid_username: &psql-druid-user "druid_raw" - druid_user_password: &psql-druid-pwd "druidraw123" - superset_db_name: &psql-superset-db "superset" - superset_user_name: &psql-superset-user "superset" - superset_user_password: &psql-superset-pwd "superset123" - hms_db_name: &psql-hms-db "hms" - hms_user_name: &psql-hms-user "hms" - hms_user_password: &psql-hms-pwd "hms123" - - grafana: - grafana_admin_user: &grafana-admin-user "admin" - grafana_admin_password: &grafana-admin-password "adminpassword" - grafana_auth_token: &grafana-auth-token "YWRtaW46YWRtaW5wYXNzd29yZA==" - grafana_url: &grafana-url "http://grafana.monitoring.svc.cluster.local" - - kafka-topics: - ingestTopic: &ingestTopic "dev.ingest" - system_stats_topic: &system_stats_topic "dev.stats" - masterdata_system_stats_topic: &masterdata_system_stats_topic "dev.masterdata.stats" - masterdataIngestTopic: 
&masterdataIngestTopic "dev.masterdata.ingest" - telemetryEventsTopic: &telemetryEventsTopic "system.telemetry.events" - hudiConnectorTopic: &hudiConnectorTopic "dev.hudi.connector.in" - connectorsMetricTopic: &connectorsMetricTopic "obsrv-connectors-metrics" - connectorsFailedTopic: &connectorsFailedTopic "connectors.failed" - - # config_service_host: &config-service-host "http://command-api.command-api.svc.cluster.local" - # druid_host: &global-druid-host "http://druid-raw-routers.druid-raw.svc.cluster.local" - druid_url: &global-druid-url "http://druid-raw-routers.druid-raw.svc:8888" - dataset_service_url: &dataset-service-url "http://dataset-api.dataset-api.svc.cluster.local:3000" - config_service_url: &config-service-url "http://config-api.config-api.svc.cluster.local:4000" - - cron_schedule: &cron-schedule "0 0 * * *" - druid_router_host: &druid-router-host "druid-raw-routers.druid-raw.svc.cluster.local" - druid_indexer_host: &druid-indexer-host "druid-raw-indexers.druid-raw.svc.cluster.local" - - redirection_auth_path: &redirection_auth_path "" - domain: &domain "3.147.16.119.sslip.io" - - ssl_enabled: &ssl_enabled true - - domain_admin_email: &domain_admin_email "test@obsrv-ai.com" - -#coredb-charts/ -kafka: &kafka - namespace: *kafka-namespace - host: "kafka-headless.kafka.svc.cluster.local" - port: 9092 - bootstrap-server: &kafka-bootstrap-server "kafka-headless.kafka.svc.cluster.local:9092" - ingestTopic: *ingestTopic - masterdataIngestTopic: *masterdataIngestTopic - telemetryEventsTopic: *telemetryEventsTopic - ss-kafka-topic: *system_stats_topic - masterdata-ss-kafka-topic: *masterdata_system_stats_topic - hudiConnectorTopic: *hudiConnectorTopic - connectorsMetricTopic: *connectorsMetricTopic - connectorsFailedTopic: *connectorsFailedTopic - numPartitions: 4 - provisioning: - numPartitions: 4 - replicationFactor: 1 - topics: - - name: *ingestTopic - partitions: 4 - replicationFactor: 1 - - name: *masterdataIngestTopic - partitions: 4 - replicationFactor: 1 - - name: *telemetryEventsTopic - partitions: 4 - replicationFactor: 1 - - name: *system_stats_topic - partitions: 4 - replicationFactor: 1 - - name: *masterdata_system_stats_topic - partitions: 4 - replicationFactor: 1 - - name: *hudiConnectorTopic - partitions: 4 - replicationFactor: 1 - - name: *connectorsMetricTopic - partitions: 4 - replicationFactor: 1 - - name: *connectorsFailedTopic - partitions: 4 - replicationFactor: 1 - - -zookeeper: &zookeeper - namespace: *kafka-namespace - host: "kafka-zookeeper-headless.kafka.svc.cluster.local" - port: 2181 - -postgresql: &postgresql - namespace: *postgresql-namespace - # the below variables are not chart variables - # these are defined so that it can used used in other charts - # changing the below value has no effect on actual values - # refer to the chart for actual variables - # update below values manually if actual you change actual values - host: *pghost - port: 5432 - ################################################################# - # !!! If bitnami charts is used, **username should be postgres**. 
- ################################################################# - username: &pguser "postgres" - password: &pgpassword "postgres" - - obsrv: - name: *psql-obsrv-db - user: *psql-obsrv-user - password: *psql-obsrv-pwd - - superset: - name: *psql-superset-db - user: *psql-superset-user - password: *psql-superset-pwd - - druid: - name: *psql-druid-db - user: *psql-druid-user - password: *psql-druid-pwd - - hms: - name: *psql-hms-db - user: *psql-hms-user - password: *psql-hms-pwd - - # Internal. Don't touch - auth: - enablePostgresUser: true - postgresPassword: *pgpassword - - primary: - initdb: - user: *pguser - password: *pgpassword - scripts: - 00_create_superset_db.sql: | - CREATE DATABASE superset; - 01_create_druid_raw_db.sql: | - CREATE DATABASE druid_raw; - 02_create_obsrv_db.sql: | - CREATE DATABASE obsrv; - 03_create_pg_stat_statements_db.sql: | - CREATE EXTENSION IF NOT EXISTS pg_stat_statements; - 04_create_hms_db.sql: | - CREATE DATABASE hms; - extendedConfiguration: | - password_encryption = md5 - shared_preload_libraries = 'pg_stat_statements' # (change requires restart) - pg_stat_statements.max = 10000 - pg_stat_statements.track = all - log_statement = 'all' - logging_collector = 'on' - log_min_duration_statement = 0 - log_filename = 'postgresql.log' - log_directory='/opt/bitnami/postgresql/logs' - persistence: - size: 10Gi - enabled: true - mountPath: /bitnami/postgresql - -druid: &druid - namespace: *druid-namespace - host: "druid-raw-routers.druid-raw.svc.cluster.local" - port: 8888 - supervisorEndpoint: "indexer/v1/supervisor" - -redis-denorm: &redis_denorm - namespace: *redis-namespace - host: redis-denorm-headless.redis.svc.cluster.local - port: 6379 - -redis-dedup: &redis_dedup - namespaceOverride: *redis-namespace #TODO: check why namespace doesn't work here - host: &redis-url redis-dedup-headless.redis.svc.cluster.local - port: 6379 - -flink: &flink - namespace: *flink_namespace - host: "flink-headless.flink.svc.cluster.local" - port: 8081 - -#migrations/charts/ -postgresql-migration: - namespace: *postgresql-namespace - superset_oauth_clientid: &superset_oauth_clientid "451058501-dev.oauth.obsrv.ai" - superset_oauth_client_secret: &superset_oauth_client_secret "luXRJMh" - kong_ingress_domain: *domain - gf_auth_generic_oauth_client_id: "528806583-dev.oauth.obsrv.ai" - gf_auth_generic_oauth_client_secret: "el642dcXd1P3v6i+hODnGrUKx9ZSWAlmXWZaEoZQI7/R3NUGQlLTnNCV" - web_console_password: "$2a$10$bG9R7ioA4/pfw8m0GPcWTOZMhc2sNN4wEkKV.j50RvQW5iUki/4Za" - web_console_login: "admin@obsrv.in" - system_settings: - encryption_key: "strong_encryption_key_to_encrypt" - default_dataset_id: "ALL" - max_event_size: 1048576 - dedup_period: 604800 # In seconds (7 days) - -#coreinfra-charts/ -prometheus: &prometheus - url: http://monitoring-kube-prometheus-prometheus.monitoring.svc.cluster.local:9090 - -loki: - namespace: *loki-namespace - - -druid-raw-cluster: &druid-raw-cluster - namespace: *druid-namespace - druid_metadata_storage_connector_user: druid_raw - druid_metadata_storage_connector_password: *psql-druid-pwd - zookeeper: - namespace: *druid-namespace - -druid-operator: - namespace: *druid-namespace - # nameOverride: druid-operator - -kafka-message-exporter: - namespace: *kafka-namespace - exporterEnv: - KAFKA_BROKERS: *kafka-bootstrap-server - KAFKA_CONSUMER_GROUP_ID: obsrv-kafka-message-exporter - -kube-prometheus-stack: - namespaceOverride: *monitoring-namespace - prometheus-windows-exporter: - namespaceOverride: *monitoring-namespace - kube-state-metrics: - 
namespaceOverride: *monitoring-namespace - prometheus-node-exporter: - namespaceOverride: *monitoring-namespace - grafana: - fullnameOverride: "grafana" - namespaceOverride: *monitoring-namespace - env: - GF_SECURITY_ADMIN_PASSWORD: *grafana-admin-password - GF_SECURITY_ADMIN_USER: *grafana-admin-user - grafana.ini: - smtp: - enabled: true - host: "" - user: "" - password: "" - from_address: "" - cert_file: "" - key_file: "" - ehlo_identity: "" - startTLS_policy: "" - skip_verify: true - from_name: "Grafana" - - - - - -kubernetes-reflector: - namespace: *kubernetes-reflector-namespace - -postgresql-backup: - namespace: *postgresql-namespace - cronSchedule: *cron-schedule - PG_USER: *pguser - PG_HOST: *pghost - PGPASSWORD: *pgpassword - initContainers: {} - -postgresql-exporter: - config: - datasource: - host: *pghost - user: *pguser - password: *pgpassword - - -spark: &spark - namespace: *spark-namespace - -superset_oauth: &superset_oauth - enabled: true - client_id: *superset_oauth_clientid - client_secret: *superset_oauth_client_secret - auth_token: "NDUxMDU4NTAxLWRldi5vYXV0aC5vYnNydi5haTpsdVhSSk1o" - -superset: - namespace: *superset-namespace - postgres: - adminUser: *pguser - adminPassword: *pgpassword - db_host: *pghost #postgresql-hl.postgresql.svc.cluster.local (Actual value) Need to be checked - db_port: "5432" - superset: - db_name: *psql-superset-db - db_username: *psql-superset-user - db_password: *psql-superset-pwd - oauth: - <<: *superset_oauth - -command-api: &command-api - namespace: *command-api-namespace - system_env: *env - -dataset-api: &dataset-api - namespace: *dataset-api-namespace - host: *dataset-service-url - env: - GRAFANA_AUTH_TOKEN: *grafana-auth-token - GRAFANA_ADMIN_URL: *grafana-url - -config-api: &config-api - namespace: *config-api-namespace - host: *config-service-url - env: - GRAFANA_AUTH_TOKEN: *grafana-auth-token - GRAFANA_ADMIN_URL: *grafana-url - - -grafana-configs: - namespace: *monitoring-namespace - -submit-ingestion: - namespace: *submit-ingestion-namespace - -system-rules-ingestor: - namespace: *system-rules-ingestor-namespace - datasetServiceUrl: *dataset-service-url - grafana: - namespace: *monitoring-namespace - containerName: grafana - fileNames: "api.yaml,ingestion.yaml,processing.yaml,querying.yaml,node.yaml,storage.yaml,monitoring.yaml" - -web-console: - namespace: *web-console-namesapce - env: - GRAFANA_AUTH_TOKEN: *grafana-auth-token - GRAFANA_ADMIN_URL: *grafana-url - -#additional-charts/ -alert-rules: &alert-rules - namespace: *monitoring-namespace - -druid-exporter: - namespace: *druid-namespace - druidURL: *global-druid-url - serviceMonitor: - enabled: true - namespace: *druid-namespace - interval: 30s - scrapeTimeout: 10s - -secor: - namespace: *secor-namespace - # kafka: *kafka - # zookeeper: *zookeeper - -cert-manager: &cert-manager - namespace: *cert-manager-namespace - -kong: - namespace: *kong-ingress-namespace - extraLabels: - system.api: "true" - podLabels: - system.api: "true" - proxy: - annotations: {} - labels: - system.api: "true" - serviceMonitor: - enabled: true - interval: 30s - namespace: *kong-ingress-namespace - labels: - release: monitoring - system.api: "true" - -kong-ingress-routes: - namespace: *kong-ingress-namespace - domain: *domain - -letsencrypt-ssl: - enabled: *ssl_enabled - namespace: *web-console-namesapce - cert_issuer_name: "letsencrypt-prod" - # prod letsencrypt url, to be used on prod instances - letsencrypt_server_url: "https://acme-v02.api.letsencrypt.org/directory" - # letsencrypt_server_url: 
"https://acme-staging-v02.api.letsencrypt.org/directory" - domain: *domain - domain_admin_email: *domain_admin_email - -velero: - namespace: *velero-namespace - metrics: - service: - labels: - release: monitoring - enabled: true - additionalLabels: - release: monitoring - serviceMonitor: - enabled: true - namespace: *velero-namespace - additionalLabels: - release: monitoring - autodetect: true - schedules: - obsrv-daily-backup: - disabled: false - schedule: "0 0 * * *" - useOwnerReferencesInBackup: false - template: - ttl: "247h" - includedNamespaces: [] - -volume-autoscaler: - namespace: *volume-autoscaler-namespace - # How much full the disk must be before considering scaling - scale_above_percent: "80" - # How much to scale disks up by, as a percentage of their current size - scale_up_percent: "20" - # An maximum amount of bytes to scale up by (helps prevent large disks from growing too fast/exponentially). Set to 16TB by default, which basically means this is disabled - scale_up_max_increment: "16000000000000" - # The maximum size of disks to scale up to. By default on AWS using EBS volumes this is set to 16TB as that is the EBS Max disk size. - scale_up_max_size: "" - -hms: - namespace: *hms_namespace - envVars: - DATABASE_HOST: *pghost - DATABASE_DB: *psql-hms-db - DATABASE_USER: *psql-hms-user - DATABASE_PASSWORD: *psql-hms-pwd - THRIFT_PORT: "9083" - -trino: - namespace: *trino_namespace - -lakehouse-connector: - namespace: *flink_namespace - -nlq-api: - namespace: *nlq_api_namespace - openai_api_key: "" - -global: - env: *env - building_block: *building-block - encryption_key: *encryption-key - domain: *domain - ssl_enabled: *ssl_enabled - kafka: *kafka - zookeeper: *zookeeper - druid: *druid - postgresql: *postgresql - flink: *flink - redis_denorm: *redis_denorm - redis_dedup: *redis_dedup - druid-raw-cluster: *druid-raw-cluster - spark: *spark - cert-manager: *cert-manager - alert_rules: *alert-rules - command-api: *command-api - dataset_api: *dataset-api - storageClass: "" - # This redis is used by flink - redis: *redis_denorm - prometheus: *prometheus \ No newline at end of file diff --git a/helmcharts/services/secor/values.yaml b/helmcharts/services/secor/values.yaml index 3db994b9..169c6c93 100644 --- a/helmcharts/services/secor/values.yaml +++ b/helmcharts/services/secor/values.yaml @@ -128,6 +128,7 @@ secor_jobs: timestamp_key: "ets" masterdata-ingest-backup: <<: *base_config + enabled: false topic: "{{.Values.global.env}}.masterdata.ingest" service_name: "masterdata-ingest-backup" consumer_group: "{{.Values.global.env}}.masterdata-ingest" From 6a955f5736d8f6f9f98cfe1a9b7605da45e9c481 Mon Sep 17 00:00:00 2001 From: divyagovindaiah Date: Fri, 8 Nov 2024 14:48:29 +0530 Subject: [PATCH 5/8] #OBS-I298: fix: updated the private image dockerconfigjson file --- helmcharts/bootstrapper/values.yaml | 4 +++- helmcharts/images.yaml | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/helmcharts/bootstrapper/values.yaml b/helmcharts/bootstrapper/values.yaml index 9df21119..284abfe2 100644 --- a/helmcharts/bootstrapper/values.yaml +++ b/helmcharts/bootstrapper/values.yaml @@ -29,5 +29,7 @@ namespaces: global: image: dockerRegistrySecretName: obsrv-docker-secret + #: Replace the dockerConfigJson below for the private images dockerConfigJson: |- - {"auths":{"http://registry.your-domain.io":{"auth":"eW91cl9yZWdpc3RyeV91c2VybmFtZTp5b3VyX3JlZ2lzdHJ5X3Rva2Vu"}}} \ No newline at end of file + {"auths": {"http://": {"auth": ""}}} + diff --git a/helmcharts/images.yaml 
b/helmcharts/images.yaml index 1390b969..931e5048 100644 --- a/helmcharts/images.yaml +++ b/helmcharts/images.yaml @@ -2,8 +2,9 @@ global: image: registry: &global_registry "sanketikahub" dockerRegistrySecretName: &dockerSecretName "registry-secret-name" + #: Replace the dockerConfigJson below for the private images dockerConfigJson: |- - {"auths":{"http://registry.your-domain.io":{"auth":"eW91cl9yZWdpc3RyeV91c2VybmFtZTp5b3VyX3JlZ2lzdHJ5X3Rva2Vu"}}} + {"auths": {"http://": {"auth": ""}}} From ecb56c87eb44e375767f5a723b4b79f981be25c2 Mon Sep 17 00:00:00 2001 From: divyagovindaiah Date: Mon, 11 Nov 2024 14:49:46 +0530 Subject: [PATCH 6/8] #OBS-I293: fix:updated the web console and dataset api env & grafana alert fix --- helmcharts/global-resource-values.yaml | 8 +- helmcharts/obsrv/values.yaml | 44 +++---- helmcharts/services/config-api/values.yaml | 2 +- helmcharts/services/dataset-api/values.yaml | 6 +- .../dashboards/s3-backup-metrics.json | 1 - .../system-rules-ingestor/configs/rules.yaml | 120 +++++++++--------- helmcharts/services/web-console/values.yaml | 11 +- 7 files changed, 101 insertions(+), 91 deletions(-) diff --git a/helmcharts/global-resource-values.yaml b/helmcharts/global-resource-values.yaml index 60865826..b894a2fe 100644 --- a/helmcharts/global-resource-values.yaml +++ b/helmcharts/global-resource-values.yaml @@ -451,10 +451,10 @@ kube-prometheus-stack: resources: limits: cpu: 0.2 - memory: 128Mi + memory: 1024Mi requests: cpu: 0.1 - memory: 128Mi + memory: 500Mi persistence: type: pvc # storageClassName: default @@ -658,7 +658,7 @@ trino: memory: 1024Mi requests: cpu: 1 - memory: 1024Mi + memory: 1536Mi worker: resources: limits: @@ -666,7 +666,7 @@ trino: memory: 1024Mi requests: cpu: 1 - memory: 1024Mi + memory: 1536Mi s3-exporter: resources: diff --git a/helmcharts/obsrv/values.yaml b/helmcharts/obsrv/values.yaml index 4e333b03..fc97c46c 100644 --- a/helmcharts/obsrv/values.yaml +++ b/helmcharts/obsrv/values.yaml @@ -60,11 +60,9 @@ dataset-api: grafana_token: &grafana_token "YWRtaW46cHJvbS1vcGVyYXRvcg==" web-console: - GRAFANA_URL: "" - SUPERSET_URL: "" - REACT_APP_GRAFANA_URL: "" - REACT_APP_SUPERSET_URL: "" - OAUTH_WEB_CONSOLE_URL: "" + GRAFANA_URL: "http://{{.Values.global.domain}}/grafana" + SUPERSET_URL: "http://{{.Values.global.domain}}" + OAUTH_WEB_CONSOLE_URL: "{{.Values.global.domain}}/console" AUTH_KEYCLOAK_SSL_REQUIRED: "external" AUTH_KEYCLOAK_CLIENT_ID: "myOauthClient" AUTH_KEYCLOAK_CLIENT_SECRET: "SCWHeF9HgtJ5BjmJFruk2IW15a5auueq" @@ -595,38 +593,40 @@ kube-prometheus-stack: requests: storage: 10Gi additionalScrapeConfigs: - - job_name: s3-exporter + - job_name: s3-common-backups + metrics_path: /probe scrape_interval: 5m scrape_timeout: 30s + static_configs: + - targets: + - bucket={{ .Values.global.velero_backup_cloud_bucket}}; + relabel_configs: + - source_labels: [__address__] + regex: "^bucket=(.*);$" + replacement: "${1}" + target_label: "__param_bucket" + - target_label: __address__ + replacement: s3-exporter.s3-exporter.svc.cluster.local:9340 + - job_name: "s3-backups" metrics_path: /probe - http_sd_configs: - - url: http://s3-exporter.s3-exporter.svc.cluster.local:9340/discovery - - job_name: s3-backups - metrics_path: /probe - scrape_interval: 10m - scrape_timeout: 3m - http_sd_configs: - - url: http://s3-exporter.s3-exporter.svc.cluster.local:9340/discovery + scrape_interval: 5m + scrape_timeout: 30s static_configs: - targets: - bucket={{ .Values.global.postgresql_backup_cloud_bucket }};prefix=postgresql; - bucket={{ 
.Values.global.redis_backup_cloud_bucket }};prefix=dedup-redis; - - bucket={{ .Values.global.redis_backup_cloud_bucket }};prefix=denorm-redis; + - bucket={{ .Values.global.redis_backup_cloud_bucket }};prefix=denorm-redis; relabel_configs: - source_labels: [__address__] regex: "^bucket=(.*);prefix=(.*);$" - replacement: "$1" + replacement: "${1}" target_label: "__param_bucket" - source_labels: [__address__] regex: "^bucket=(.*);prefix=(.*);$" - replacement: "$2" + replacement: "${2}" target_label: "__param_prefix" - target_label: __address__ - replacement: s3-exporter.s3-exporter.svc.cluster.local:9340 - - target_label: bucket - replacement: "$1" - - target_label: prefix - replacement: "$2" + replacement: s3-exporter.s3-exporter.svc.cluster.local:9340 # S3 exporter. kube-state-metrics: diff --git a/helmcharts/services/config-api/values.yaml b/helmcharts/services/config-api/values.yaml index acf1ab97..8968103f 100644 --- a/helmcharts/services/config-api/values.yaml +++ b/helmcharts/services/config-api/values.yaml @@ -116,7 +116,7 @@ serviceMonitor: # interval: 30s # scrapeTimeout: 10s # honorLabels: true - - port: http + - port: "4000" path: /metrics interval: 30s scrapeTimeout: 10s diff --git a/helmcharts/services/dataset-api/values.yaml b/helmcharts/services/dataset-api/values.yaml index fa96ad3b..99a9c5cf 100644 --- a/helmcharts/services/dataset-api/values.yaml +++ b/helmcharts/services/dataset-api/values.yaml @@ -176,4 +176,8 @@ env: GRAFANA_ADMIN_URL: http://grafana.monitoring.svc.cluster.local pipeline_merged_job_manager_url: "http://unified-pipeline-jobmanager.{{.Values.global.flink.namespace}}.svc.cluster.local:8081" masterdata_processor_job_manager_url: "http://master-data-processor-ext-jobmanager.{{.Values.global.flink.namespace}}.svc.cluster.local:8081" - prometheus_url: "{{ .Values.global.prometheus.url }}" \ No newline at end of file + prometheus_url: "{{ .Values.global.prometheus.url }}" + user_token_public_key: "" + is_RBAC_enabled: "false" + user_token_keycloak_public_key: "" + keycloak_audience: "" \ No newline at end of file diff --git a/helmcharts/services/grafana-configs/dashboards/s3-backup-metrics.json b/helmcharts/services/grafana-configs/dashboards/s3-backup-metrics.json index 41d17a8b..5b6e677c 100644 --- a/helmcharts/services/grafana-configs/dashboards/s3-backup-metrics.json +++ b/helmcharts/services/grafana-configs/dashboards/s3-backup-metrics.json @@ -38,7 +38,6 @@ }, "id": 13, "panels": [], - "repeat": "app", "repeatDirection": "h", "title": "$app", "type": "row" diff --git a/helmcharts/services/system-rules-ingestor/configs/rules.yaml b/helmcharts/services/system-rules-ingestor/configs/rules.yaml index 29c93f4c..9ba6fc39 100644 --- a/helmcharts/services/system-rules-ingestor/configs/rules.yaml +++ b/helmcharts/services/system-rules-ingestor/configs/rules.yaml @@ -2,7 +2,7 @@ query: max without(label_system_ingestion, pod) (( max by (pod) (rate(container_cpu_usage_seconds_total[$__range]) / on (pod) group_left max by (pod) (kube_pod_container_resource_limits{resource="cpu"})) * 100)* on (pod) group_left (label_system_ingestion)kube_pod_labels{label_system_ingestion="true"}) operator: gt threshold: [80] - category: Ingestion + category: ingestion frequency: 2m interval: 1m labels: {} @@ -14,7 +14,7 @@ query: max without(label_system_ingestion, pod) (( max by (pod) (rate(container_cpu_usage_seconds_total[$__range]) / on (pod) group_left max by (pod) (kube_pod_container_resource_limits{resource="cpu"})) * 100)* on (pod) group_left 
(label_system_ingestion)kube_pod_labels{label_system_ingestion="true"}) operator: within_range threshold: [60, 80] - category: Ingestion + category: ingestion frequency: 2m interval: 1m labels: {} @@ -26,7 +26,7 @@ query: max without(label_system_processing, pod) (( max by (pod) (rate(container_cpu_usage_seconds_total[$__range]) / on (pod) group_left max by (pod) (kube_pod_container_resource_limits{resource="cpu"})) * 100)* on (pod) group_left (label_system_processing)kube_pod_labels{label_system_processing="true"}) operator: gt threshold: [80] - category: Processing + category: processing frequency: 2m interval: 1m labels: {} @@ -38,7 +38,7 @@ query: max without(label_system_processing, pod) (( max by (pod) (rate(container_cpu_usage_seconds_total[$__range]) / on (pod) group_left max by (pod) (kube_pod_container_resource_limits{resource="cpu"})) * 100)* on (pod) group_left (label_system_processing)kube_pod_labels{label_system_processing="true"}) operator: within_range threshold: [60,80] - category: Processing + category: processing frequency: 2m interval: 1m labels: {} @@ -50,7 +50,7 @@ query: max without(label_system_querying, pod) (( max by (pod) (rate(container_cpu_usage_seconds_total[$__range]) / on (pod) group_left max by (pod) (kube_pod_container_resource_limits{resource="cpu"})) * 100)* on (pod) group_left (label_system_querying)kube_pod_labels{label_system_querying="true"}) operator: gt threshold: [80] - category: Querying + category: querying frequency: 2m interval: 1m labels: {} @@ -62,7 +62,7 @@ query: max without(label_system_querying, pod) (( max by (pod) (rate(container_cpu_usage_seconds_total[$__range]) / on (pod) group_left max by (pod) (kube_pod_container_resource_limits{resource="cpu"})) * 100)* on (pod) group_left (label_system_querying)kube_pod_labels{label_system_querying="true"}) operator: within_range threshold: [60,80] - category: Querying + category: querying frequency: 2m interval: 1m labels: {} @@ -74,7 +74,7 @@ query: max without(label_system_monitoring, pod) (( max by (pod) (rate(container_cpu_usage_seconds_total[$__range]) / on (pod) group_left max by (pod) (kube_pod_container_resource_limits{resource="cpu"})) * 100)* on (pod) group_left (label_system_monitoring)kube_pod_labels{label_system_monitoring="true"}) operator: gt threshold: [80] - category: Monitoring + category: monitoring frequency: 2m interval: 1m labels: {} @@ -86,7 +86,7 @@ query: max without(label_system_monitoring, pod) (( max by (pod) (rate(container_cpu_usage_seconds_total[$__range]) / on (pod) group_left max by (pod) (kube_pod_container_resource_limits{resource="cpu"})) * 100)* on (pod) group_left (label_system_monitoring)kube_pod_labels{label_system_monitoring="true"}) operator: within_range threshold: [60,80] - category: Monitoring + category: monitoring frequency: 2m interval: 1m labels: {} @@ -98,7 +98,7 @@ query: max without(label_system_reporting, pod) (( max by (pod) (rate(container_cpu_usage_seconds_total[$__range]) / on (pod) group_left max by (pod) (kube_pod_container_resource_limits{resource="cpu"})) * 100)* on (pod) group_left (label_system_reporting)kube_pod_labels{label_system_reporting="true"}) operator: gt threshold: [80] - category: Reporting + category: reporting frequency: 2m interval: 1m labels: {} @@ -110,7 +110,7 @@ query: max without(label_system_reporting, pod) (( max by (pod) (rate(container_cpu_usage_seconds_total[$__range]) / on (pod) group_left max by (pod) (kube_pod_container_resource_limits{resource="cpu"})) * 100)* on (pod) group_left 
(label_system_reporting)kube_pod_labels{label_system_reporting="true"}) operator: within_range threshold: [60,80] - category: Reporting + category: reporting frequency: 2m interval: 1m labels: {} @@ -122,7 +122,7 @@ query: max without(label_system_storage, pod) (( max by (pod) (rate(container_cpu_usage_seconds_total[$__range]) / on (pod) group_left max by (pod) (kube_pod_container_resource_limits{resource="cpu"})) * 100)* on (pod) group_left (label_system_storage)kube_pod_labels{label_system_storage="true"}) operator: gt threshold: [80] - category: Storage + category: storage frequency: 2m interval: 1m labels: {} @@ -134,7 +134,7 @@ query: max without(label_system_storage, pod) (( max by (pod) (rate(container_cpu_usage_seconds_total[$__range]) / on (pod) group_left max by (pod) (kube_pod_container_resource_limits{resource="cpu"})) * 100)* on (pod) group_left (label_system_storage)kube_pod_labels{label_system_storage="true"}) operator: within_range threshold: [60,80] - category: Storage + category: storage frequency: 2m interval: 1m labels: {} @@ -146,7 +146,7 @@ query: max without(label_system_dataset_management, pod) (( max by (pod) (rate(container_cpu_usage_seconds_total[$__range]) / on (pod) group_left max by (pod) (kube_pod_container_resource_limits{resource="cpu"})) * 100)* on (pod) group_left (label_system_dataset_management)kube_pod_labels{label_system_dataset_management="true"}) operator: gt threshold: [80] - category: Dataset Management + category: dataset management frequency: 2m interval: 1m labels: {} @@ -158,7 +158,7 @@ query: max without(label_system_dataset_management, pod) (( max by (pod) (rate(container_cpu_usage_seconds_total[$__range]) / on (pod) group_left max by (pod) (kube_pod_container_resource_limits{resource="cpu"})) * 100)* on (pod) group_left (label_system_dataset_management)kube_pod_labels{label_system_dataset_management="true"}) operator: within_range threshold: [60,80] - category: Dataset Management + category: dataset management frequency: 2m interval: 1m labels: {} @@ -170,7 +170,7 @@ query: max without(label_system_infra, pod) (( max by (pod) (rate(container_cpu_usage_seconds_total[$__range]) / on (pod) group_left max by (pod) (kube_pod_container_resource_limits{resource="cpu"})) * 100)* on (pod) group_left (label_system_infra)kube_pod_labels{label_system_infra="true"}) operator: gt threshold: [80] - category: Infra + category: infra frequency: 2m interval: 1m labels: {} @@ -182,7 +182,7 @@ query: max without(label_system_infra, pod) (( max by (pod) (rate(container_cpu_usage_seconds_total[$__range]) / on (pod) group_left max by (pod) (kube_pod_container_resource_limits{resource="cpu"})) * 100)* on (pod) group_left (label_system_infra)kube_pod_labels{label_system_infra="true"}) operator: within_range threshold: [60,80] - category: Infra + category: infra frequency: 2m interval: 1m labels: {} @@ -194,7 +194,7 @@ query: max without(label_system_ingestion, pod) (( max by (pod) (avg_over_time(container_memory_usage_bytes[$__range]) / on (pod) group_left max by (pod) (kube_pod_container_resource_limits{resource="memory"})) * 100)* on (pod) group_left (label_system_ingestion)kube_pod_labels{label_system_ingestion="true"}) operator: gt threshold: [80] - category: Ingestion + category: ingestion frequency: 2m interval: 1m labels: {} @@ -206,7 +206,7 @@ query: max without(label_system_ingestion, pod) (( max by (pod) (avg_over_time(container_memory_usage_bytes[$__range]) / on (pod) group_left max by (pod) (kube_pod_container_resource_limits{resource="memory"})) * 
100)* on (pod) group_left (label_system_ingestion)kube_pod_labels{label_system_ingestion="true"}) operator: within_range threshold: [60,80] - category: Ingestion + category: ingestion frequency: 2m interval: 1m labels: {} @@ -218,7 +218,7 @@ query: max without(label_system_processing, pod) (( max by (pod) (avg_over_time(container_memory_usage_bytes[$__range]) / on (pod) group_left max by (pod) (kube_pod_container_resource_limits{resource="memory"})) * 100)* on (pod) group_left (label_system_processing)kube_pod_labels{label_system_processing="true"}) operator: gt threshold: [80] - category: Processing + category: processing frequency: 2m interval: 1m labels: {} @@ -230,7 +230,7 @@ query: max without(label_system_processing, pod) (( max by (pod) (avg_over_time(container_memory_usage_bytes[$__range]) / on (pod) group_left max by (pod) (kube_pod_container_resource_limits{resource="memory"})) * 100)* on (pod) group_left (label_system_processing)kube_pod_labels{label_system_processing="true"}) operator: within_range threshold: [60,80] - category: Processing + category: processing frequency: 2m interval: 1m labels: {} @@ -242,7 +242,7 @@ query: max without(label_system_querying, pod) (( max by (pod) (avg_over_time(container_memory_usage_bytes[$__range]) / on (pod) group_left max by (pod) (kube_pod_container_resource_limits{resource="memory"})) * 100)* on (pod) group_left (label_system_querying)kube_pod_labels{label_system_querying="true"}) operator: gt threshold: [80] - category: Querying + category: querying frequency: 2m interval: 1m labels: {} @@ -254,7 +254,7 @@ query: max without(label_system_querying, pod) (( max by (pod) (avg_over_time(container_memory_usage_bytes[$__range]) / on (pod) group_left max by (pod) (kube_pod_container_resource_limits{resource="memory"})) * 100)* on (pod) group_left (label_system_querying)kube_pod_labels{label_system_querying="true"}) operator: within_range threshold: [60,80] - category: Querying + category: querying frequency: 2m interval: 1m labels: {} @@ -266,7 +266,7 @@ query: max without(label_system_monitoring, pod) (( max by (pod) (avg_over_time(container_memory_usage_bytes[$__range]) / on (pod) group_left max by (pod) (kube_pod_container_resource_limits{resource="memory"})) * 100)* on (pod) group_left (label_system_monitoring)kube_pod_labels{label_system_monitoring="true"}) operator: gt threshold: [80] - category: Monitoring + category: monitoring frequency: 2m interval: 1m labels: {} @@ -278,7 +278,7 @@ query: max without(label_system_monitoring, pod) (( max by (pod) (avg_over_time(container_memory_usage_bytes[$__range]) / on (pod) group_left max by (pod) (kube_pod_container_resource_limits{resource="memory"})) * 100)* on (pod) group_left (label_system_monitoring)kube_pod_labels{label_system_monitoring="true"}) operator: within_range threshold: [60,80] - category: Monitoring + category: monitoring frequency: 2m interval: 1m labels: {} @@ -290,7 +290,7 @@ query: max without(label_system_reporting, pod) (( max by (pod) (avg_over_time(container_memory_usage_bytes[$__range]) / on (pod) group_left max by (pod) (kube_pod_container_resource_limits{resource="memory"})) * 100)* on (pod) group_left (label_system_reporting)kube_pod_labels{label_system_reporting="true"}) operator: gt threshold: [80] - category: Reporting + category: reporting frequency: 2m interval: 1m labels: {} @@ -302,7 +302,7 @@ query: max without(label_system_reporting, pod) (( max by (pod) (avg_over_time(container_memory_usage_bytes[$__range]) / on (pod) group_left max by (pod) 
(kube_pod_container_resource_limits{resource="memory"})) * 100)* on (pod) group_left (label_system_reporting)kube_pod_labels{label_system_reporting="true"})
       operator: within_range
       threshold: [60,80]
-      category: Reporting
+      category: reporting
       frequency: 2m
       interval: 1m
       labels: {}
@@ -314,7 +314,7 @@
       query: max without(label_system_storage, pod) (( max by (pod) (avg_over_time(container_memory_usage_bytes[$__range]) / on (pod) group_left max by (pod) (kube_pod_container_resource_limits{resource="memory"})) * 100)* on (pod) group_left (label_system_storage)kube_pod_labels{label_system_storage="true"})
       operator: gt
       threshold: [80]
-      category: Storage
+      category: storage
       frequency: 2m
       interval: 1m
       labels: {}
@@ -326,7 +326,7 @@
       query: max without(label_system_storage, pod) (( max by (pod) (avg_over_time(container_memory_usage_bytes[$__range]) / on (pod) group_left max by (pod) (kube_pod_container_resource_limits{resource="memory"})) * 100)* on (pod) group_left (label_system_storage)kube_pod_labels{label_system_storage="true"})
       operator: within_range
       threshold: [60,80]
-      category: Storage
+      category: storage
       frequency: 2m
       interval: 1m
       labels: {}
@@ -338,7 +338,7 @@
       query: max without(label_system_dataset_management, pod) (( max by (pod) (avg_over_time(container_memory_usage_bytes[$__range]) / on (pod) group_left max by (pod) (kube_pod_container_resource_limits{resource="memory"})) * 100)* on (pod) group_left (label_system_dataset_management)kube_pod_labels{label_system_dataset_management="true"})
       operator: gt
       threshold: [80]
-      category: Dataset Management
+      category: dataset management
       frequency: 2m
       interval: 1m
       labels: {}
@@ -350,7 +350,7 @@
       query: max without(label_system_dataset_management, pod) (( max by (pod) (avg_over_time(container_memory_usage_bytes[$__range]) / on (pod) group_left max by (pod) (kube_pod_container_resource_limits{resource="memory"})) * 100)* on (pod) group_left (label_system_dataset_management)kube_pod_labels{label_system_dataset_management="true"})
       operator: within_range
       threshold: [60,80]
-      category: Dataset Management
+      category: dataset management
       frequency: 2m
       interval: 1m
       labels: {}
@@ -362,7 +362,7 @@
       query: max without(label_system_infra, pod) (( max by (pod) (avg_over_time(container_memory_usage_bytes[$__range]) / on (pod) group_left max by (pod) (kube_pod_container_resource_limits{resource="memory"})) * 100)* on (pod) group_left (label_system_infra)kube_pod_labels{label_system_infra="true"})
       operator: gt
       threshold: [80]
-      category: Infra
+      category: infra
       frequency: 2m
       interval: 1m
       labels: {}
@@ -374,7 +374,7 @@
       query: max without(label_system_infra, pod) (( max by (pod) (avg_over_time(container_memory_usage_bytes[$__range]) / on (pod) group_left max by (pod) (kube_pod_container_resource_limits{resource="memory"})) * 100)* on (pod) group_left (label_system_infra)kube_pod_labels{label_system_infra="true"})
       operator: within_range
       threshold: [60,80]
-      category: Infra
+      category: infra
       frequency: 2m
       interval: 1m
       labels: {}
@@ -386,7 +386,7 @@
       query: (sum without(pod) (sum by (pod) (ceil(increase(kube_pod_container_status_restarts_total[$__range]))) * on(pod) kube_pod_labels{label_system_infra='true'}))
       operator: within_range
       threshold: [1,3]
-      category: Infra
+      category: infra
       frequency: 2m
       interval: 1m
       labels: {}
@@ -398,7 +398,7 @@
       query: (sum without(pod) ((sum by (pod) (ceil(increase(kube_pod_container_status_restarts_total[$__range]))) > 0) * on(pod) kube_pod_labels{label_system_infra='true'}))
       operator: gt
       threshold: [3]
-      category: Infra
+      category: infra
       frequency: 2m
       interval: 1m
       labels: {}
@@ -410,7 +410,7 @@
       query: (sum without(pod) (sum by (pod) (ceil(increase(kube_pod_container_status_restarts_total[$__range]))) * on(pod) kube_pod_labels{label_system_ingestion='true'}))
       operator: within_range
       threshold: [1,3]
-      category: Ingestion
+      category: ingestion
       frequency: 2m
       interval: 1m
       labels: {}
@@ -422,7 +422,7 @@
       query: (sum without(pod) ((sum by (pod) (ceil(increase(kube_pod_container_status_restarts_total[$__range]))) > 0) * on(pod) kube_pod_labels{label_system_ingestion='true'}))
       operator: gt
       threshold: [3]
-      category: Ingestion
+      category: ingestion
       frequency: 2m
       interval: 1m
       labels: {}
@@ -434,7 +434,7 @@
       query: (sum without(pod) (sum by (pod) (ceil(increase(kube_pod_container_status_restarts_total[$__range]))) * on(pod) kube_pod_labels{label_system_querying='true'}))
       operator: within_range
       threshold: [1,3]
-      category: Querying
+      category: querying
       frequency: 2m
       interval: 1m
       labels: {}
@@ -446,7 +446,7 @@
       query: (sum without(pod) ((sum by (pod) (ceil(increase(kube_pod_container_status_restarts_total[$__range]))) > 0) * on(pod) kube_pod_labels{label_system_querying='true'}))
       operator: gt
       threshold: [3]
-      category: Querying
+      category: querying
       frequency: 2m
       interval: 1m
       labels: {}
@@ -458,7 +458,7 @@
       query: (sum without(pod) (sum by (pod) (ceil(increase(kube_pod_container_status_restarts_total[$__range]))) * on(pod) kube_pod_labels{label_system_storage='true'}))
       operator: within_range
       threshold: [1,3]
-      category: Storage
+      category: storage
       frequency: 2m
       interval: 1m
       labels: {}
@@ -470,7 +470,7 @@
       query: (sum without(pod) ((sum by (pod) (ceil(increase(kube_pod_container_status_restarts_total[$__range]))) > 0) * on(pod) kube_pod_labels{label_system_storage='true'}))
       operator: gt
       threshold: [3]
-      category: Storage
+      category: storage
       frequency: 2m
       interval: 1m
       labels: {}
@@ -482,7 +482,7 @@
       query: (sum without(pod) (sum by (pod) (ceil(increase(kube_pod_container_status_restarts_total[$__range]))) * on(pod) kube_pod_labels{label_system_processing='true'}))
       operator: within_range
       threshold: [1,3]
-      category: Processing
+      category: processing
       frequency: 2m
       interval: 1m
       labels: {}
@@ -494,7 +494,7 @@
       query: (sum without(pod) ((sum by (pod) (ceil(increase(kube_pod_container_status_restarts_total[$__range]))) > 0) * on(pod) kube_pod_labels{label_system_processing='true'}))
       operator: gt
       threshold: [3]
-      category: Processing
+      category: processing
       frequency: 2m
       interval: 1m
       labels: {}
@@ -506,7 +506,7 @@
       query: (sum without(pod) (sum by (pod) (ceil(increase(kube_pod_container_status_restarts_total[$__range]))) * on(pod) kube_pod_labels{label_system_dataset_management='true'}))
       operator: within_range
       threshold: [1,3]
-      category: Dataset Management
+      category: dataset management
       frequency: 2m
       interval: 1m
       labels: {}
@@ -518,7 +518,7 @@
       query: (sum without(pod) ((sum by (pod) (ceil(increase(kube_pod_container_status_restarts_total[$__range]))) > 0) * on(pod) kube_pod_labels{label_system_dataset_management='true'}))
       operator: gt
       threshold: [3]
-      category: Dataset Management
+      category: dataset management
       frequency: 2m
       interval: 1m
       labels: {}
@@ -530,7 +530,7 @@
       query: (sum without(pod) (sum by (pod) (ceil(increase(kube_pod_container_status_restarts_total[$__range]))) * on(pod) kube_pod_labels{label_system_monitoring='true'}))
       operator: within_range
       threshold: [1,3]
-      category: Monitoring
+      category: monitoring
       frequency: 2m
       interval: 1m
       labels: {}
@@ -542,7 +542,7 @@
       query: (sum without(pod) ((sum by (pod) (ceil(increase(kube_pod_container_status_restarts_total[$__range]))) > 0) * on(pod) kube_pod_labels{label_system_monitoring='true'}))
       operator: gt
       threshold: [3]
-      category: Monitoring
+      category: monitoring
       frequency: 2m
       interval: 1m
       labels: {}
@@ -554,7 +554,7 @@
       query: (sum without(pod) (sum by (pod) (ceil(increase(kube_pod_container_status_restarts_total[$__range]))) * on(pod) kube_pod_labels{label_system_reporting='true'}))
       operator: within_range
       threshold: [1,3]
-      category: Reporting
+      category: reporting
       frequency: 2m
       interval: 1m
       labels: {}
@@ -566,7 +566,7 @@
       query: (sum without(pod) ((sum by (pod) (ceil(increase(kube_pod_container_status_restarts_total[$__range]))) > 0) * on(pod) kube_pod_labels{label_system_reporting='true'}))
       operator: gt
       threshold: [3]
-      category: Reporting
+      category: reporting
       frequency: 2m
       interval: 1m
       labels: {}
@@ -578,7 +578,7 @@
       query: (time() - s3_last_modified_object_date{job="s3-backups", prefix=~"postgresql"})
       operator: gt
       threshold: [86400]
-      category: DB Backup
+      category: db backup
       frequency: 1h
       interval: 1h
       labels: {}
@@ -590,7 +590,7 @@
       query: (time() - s3_last_modified_object_date{job="s3-backups", prefix=~"dedup-redis"})
       operator: gt
       threshold: [86400]
-      category: DB Backup
+      category: db backup
       frequency: 1h
       interval: 1h
       labels: {}
@@ -602,7 +602,7 @@
       query: (time() - s3_last_modified_object_date{job="s3-backups", prefix=~"denorm-redis"})
       operator: gt
       threshold: [86400]
-      category: DB Backup
+      category: db backup
       frequency: 1h
       interval: 1h
       labels: {}
@@ -611,10 +611,10 @@
       severity: critical
 
     - name: "Critical Alert: Velero (Kubernetes Cluster) Backup Failure"
-      query: (time() - s3_last_modified_object_date{job="s3-backups", bucket=~"velero.*"})
+      query: (time() - s3_last_modified_object_date{job="s3-common-backups", bucket=~"velero.*"})
       operator: gt
       threshold: [86400]
-      category: DB Backup
+      category: db backup
       frequency: 1h
       interval: 1h
       labels: {}
@@ -626,7 +626,7 @@
       query: min by (instance) (max by (instance, mountpoint) (max_over_time(node_filesystem_avail_bytes{mountpoint!~".*/tmp.*"}[$__range]) > 0) / on (instance, mountpoint) max by (instance, mountpoint) (node_filesystem_size_bytes{mountpoint!~".*/tmp.*"}) * 100)
       operator: lt
       threshold: [20]
-      category: Disk Space
+      category: disk space
       frequency: 2m
       interval: 1m
       labels: {}
@@ -638,7 +638,7 @@
       query: volume_autoscaler_resize_failure_total
       operator: gt
       threshold: [0]
-      category: Disk Space
+      category: disk space
       frequency: 2m
       interval: 1m
       labels: {}
@@ -650,32 +650,34 @@
       query: count(kube_persistentvolumeclaim_info) - min(volume_autoscaler_num_valid_pvcs)
       operator: gt
       threshold: [0]
-      category: Disk Space
+      category: disk space
       frequency: 2m
       interval: 1m
       labels: {}
       description: There are a few Persistent Volume(PV)s ignored by the volume autoscaler considering the total available PVs in the Kubernetes cluster. Please review the ignored PVs and to ensure they are expected to not auto scale. Please investigate immediately or contact administrative support for assistance.
       annotations: {}
       severity: warning
+
     - name: "Critical Alert: Postgres Backup Job Failure"
       query: kube_job_status_failed{job_name=~".*postgresql-backup.*"}
       operator: gt
       threshold: [0]
-      category: DB Backup
+      category: db backup
       frequency: 4h
       interval: 1h
       labels: {}
       description: The Postgres backup has encountered an issue. Please Investigate immediately or contact administrative support for assistance.
       annotations: {}
       severity: critical
+
     - name: "Critical Alert: Velero Backup Job Failure"
       query: increase(velero_backup_failure_total{schedule=~".*obsrv-daily-backup.*"}[4h])
       operator: gt
       threshold: [0]
-      category: DB Backup
+      category: db backup
       frequency: 4h
       interval: 1h
       labels: {}
       description: The Velero (Cluster) backup has encountered an issue. Please Investigate immediately or contact administrative support for assistance.
       annotations: {}
-      severity: critical
+      severity: critical
\ No newline at end of file
diff --git a/helmcharts/services/web-console/values.yaml b/helmcharts/services/web-console/values.yaml
index 54e1351a..71b42d75 100644
--- a/helmcharts/services/web-console/values.yaml
+++ b/helmcharts/services/web-console/values.yaml
@@ -12,7 +12,7 @@
 repository: "obsrv-web-console"
 tag: "1.0.0-GA"
 digest: ""
-imagePullPolicy: IfNotPresent
+imagePullPolicy: Always
 imagePullSecrets: []
 
 commonLabels:
@@ -174,8 +174,6 @@ env:
   GRAFANA_ADMIN_URL: "http://grafana.monitoring.svc.cluster.local"
   GRAFANA_URL: "http://{{.Values.global.domain}}/grafana"
   SUPERSET_URL: "http://{{.Values.global.domain}}"
-  REACT_APP_GRAFANA_URL: "http://{{.Values.global.domain}}/grafana"
-  REACT_APP_SUPERSET_URL: "{{.Values.global.domain}}"
   SESSION_SECRET: "backend-session"
   POSTGRES_CONNECTION_STRING: "postgres://{{.Values.global.postgresql.obsrv.user}}:{{.Values.global.postgresql.obsrv.password}}@{{.Values.global.postgresql.host}}:{{.Values.global.postgresql.port}}/{{.Values.global.postgresql.obsrv.name}}"
   OAUTH_WEB_CONSOLE_URL: "{{.Values.global.domain}}/console"
@@ -202,6 +200,13 @@ env:
   AUTH_OIDC_CLIENT_ID: "oidctestclient"
   AUTH_OIDC_CLIENT_SECRET: "CsfLrFQwdRjZXhKr0t806BGVTWnN7M4k"
   AUTH_TOKEN: "YWRtaW46cHJvbS1vcGVyYXRvcg=="
+  USER_TOKEN_PRIVATE_KEY: |-
+  AUTHENTICATION_TYPE: "basic"
+  KEYCLOAK_REALM: ""
+  KEYCLOAK_CLIENT_ID: ""
+  KEYCLOAK_PUBLIC_CLIENT: ""
+  KEYCLOAK_SSL_REQUIRED: ""
+
   # Seems to be unused
   # grafana_secret_allowed_namespaces: "dataset-api,web-console"
   # grafana_secret_name: "grafana-secret"
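The Keycloak keys added above ship empty and AUTHENTICATION_TYPE defaults to "basic". A minimal sketch of how the same block might look when switching to Keycloak-backed login; the realm, client id, and SSL setting below are illustrative placeholders, not values shipped by this patch:

    env:
      AUTHENTICATION_TYPE: "keycloak"
      KEYCLOAK_REALM: "obsrv"                  # illustrative realm name
      KEYCLOAK_CLIENT_ID: "obsrv-web-console"  # illustrative client id
      KEYCLOAK_PUBLIC_CLIENT: "true"
      KEYCLOAK_SSL_REQUIRED: "external"        # common Keycloak convention; verify for your realm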
-variable "velero_aws_secret_access_key" { - type = string - description = "AWS Secret access key to access bucket" - default = "" -} variable "service_type" { type = string description = "Kubernetes service type either NodePort or LoadBalancer. It is LoadBalancer by default" @@ -73,54 +63,12 @@ variable "cluster_logs_enabled" { description = "Toggle to enable eks cluster logs" default = true } -variable "flink_checkpoint_store_type" { - type = string - description = "Flink checkpoint store type." - default = "s3" -} - -variable "druid_deepstorage_type" { - type = string - description = "Druid deep strorage type." - default = "s3" -} variable "kubernetes_storage_class" { type = string description = "Storage class name for the Kubernetes cluster" default = "gp2" } - -variable "dataset_api_container_registry" { - type = string - description = "Container registry. For example docker.io/obsrv" - default = "sanketikahub" -} - -variable "dataset_api_image_name" { - type = string - description = "Dataset api image name." - default = "config-service-ext" -} - -variable "dataset_api_image_tag" { - type = string - description = "Dataset api image tag." - default = "1.0.2-GA" -} - -variable "flink_container_registry" { - type = string - description = "Container registry. For example docker.io/obsrv" - default = "sanketikahub" -} - -variable "flink_image_tag" { - type = string - description = "Flink kubernetes service name." - default = "1.0.2-GA" -} - variable "monitoring_grafana_oauth_configs" { type = map(any) description = "Grafana oauth related variables. See below commented code for values that need to be passed" @@ -138,13 +86,6 @@ variable "web_console_image_repository" { default = "sanketikahub/obsrv-web-console" } -variable "web_console_image_tag" { - type = string - description = "web console image tag." - default = "1.0.2-GA" -} -# web console variables end. - variable "docker_registry_secret_dockerconfigjson" { type = string description = "The dockerconfigjson encoded in base64 format." @@ -156,12 +97,6 @@ variable "docker_registry_secret_name" { default = "docker-registry-secret" } -variable "command_service_image_tag" { - type = string - description = "CommandService image tag." - default = "1.0.2-GA" -} - variable "oauth_configs" { type = map(any) description = "Superset config variables. See the below commented code to know values to be passed " @@ -184,34 +119,6 @@ variable "flowlogs_retention_in_days" { default = 7 } -variable "flink_release_names" { - description = "Create release names" - type = map(string) - default = { - extractor = "extractor" - preprocessor = "preprocessor" - denormalizer = "denormalizer" - transformer-ext = "transformer-ext" - druid-router = "druid-router" - master-data-processor-ext = "master-data-processor-ext" - } -} - -variable "flink_unified_pipeline_release_names" { - description = "Create release names" - type = map(string) - default = { - unified-pipeline = "unified-pipeline" - master-data-processor-ext = "master-data-processor-ext" - } -} - -variable "unified_pipeline_enabled" { - description = "Toggle to deploy unified pipeline" - type = bool - default = true -} - variable "kong_loadbalancer_annotations" { type = string description = "Kong ingress loadbalancer annotations." @@ -226,79 +133,6 @@ variable "kong_ingress_domain" { description = "Kong ingress domain. Leave it empty if you dont have a domain name. 
If you have a domain, provide value such as obsrv.ai" default = "" } - -variable "letsencrypt_ssl_admin_email" { - type = string - description = "Letsencrypt ssl domain admin email." - default = "anandp@sanketika.in" -} -variable "grafana_secret_name" { - type = string - description = "The name of the secret. This will be sent back as an output which can be used in other modules" - default = "grafana-secret" -} - - -variable "system_rules_ingestor_container_registry" { - type = string - description = "Container registry. For example docker.io/obsrv" - default = "sanketikahub" -} - -variable "system_rules_ingestor_image_name" { - type = string - description = "Dataset api image name." - default = "system-rules-ingestor" -} - -variable "system_rules_ingestor_image_tag" { - type = string - description = "Dataset api image tag." - default = "1.0.2-GA" -} - -variable "grafana_url" { - type = string - description = "grafana url" - default = "http://monitoring-grafana.monitoring.svc:80" -} - -variable "spark_image_repository" { - type = string - description = "spark image repository." - default = "sanketikahub/spark" -} - -variable "spark_image_tag" { - type = string - description = "spark image tag." - default = "3.3.0-debian-11-r26_1.0.3-GA" -} - -variable "secor_image_tag" { - type = string - description = "secor image version" - default = "1.0.0-GA" -} - -variable "superset_image_tag" { - type = string - description = "Superset image tag." - default = "3.0.2" -} - -variable "redis_backup_image_tag" { - type = string - description = "Redis backup image tag." - default = "1.0.0" -} - -variable "postgresql_backup_image_tag" { - type = string - description = "Postgresql backup image tag." - default = "0.5" -} - variable "eks_node_group_scaling_config" { type = map(number) description = "EKS node group auto scaling configuration." @@ -319,33 +153,7 @@ variable "eks_node_group_capacity_type" { description = "EKS node group type. 
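Note on the two hunks above: Kubernetes rejects a container whose memory request exceeds its memory limit, and the previous values requested 1536Mi against a 1024Mi limit. A sketch of the resulting shape for each trino component (keys outside the hunks elided):

    resources:
      limits:
        memory: 1024Mi   # ceiling enforced by the kubelet
      requests:
        cpu: 1
        memory: 1024Mi   # must be <= the limit; previously 1536Mi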
From c50ad34461a3d620a15a28931bcc213f89b31c2f Mon Sep 17 00:00:00 2001
From: divyagovindaiah
Date: Mon, 11 Nov 2024 16:57:34 +0530
Subject: [PATCH 8/8] #OBS-I293: fix:automated the cors for s3-backup bucket

---
 terraform/aws/variables.tf                  | 251 --------------------
 terraform/aws/vars/cluster_overrides.tfvars |  44 +---
 terraform/modules/aws/s3/main.tf            |  18 +-
 3 files changed, 21 insertions(+), 292 deletions(-)

diff --git a/terraform/aws/variables.tf b/terraform/aws/variables.tf
index f795232b..df6e4d57 100644
--- a/terraform/aws/variables.tf
+++ b/terraform/aws/variables.tf
@@ -48,16 +48,6 @@ variable "eks_master_subnet_ids" {
   default = [""]
 }
 
-variable "velero_aws_access_key_id" {
-  type = string
-  description = "AWS Access key to access bucket"
-  default = ""
-}
-variable "velero_aws_secret_access_key" {
-  type = string
-  description = "AWS Secret access key to access bucket"
-  default = ""
-}
 variable "service_type" {
   type = string
   description = "Kubernetes service type either NodePort or LoadBalancer. It is LoadBalancer by default"
@@ -73,54 +63,12 @@ variable "cluster_logs_enabled" {
   description = "Toggle to enable eks cluster logs"
   default = true
 }
-variable "flink_checkpoint_store_type" {
-  type = string
-  description = "Flink checkpoint store type."
-  default = "s3"
-}
-
-variable "druid_deepstorage_type" {
-  type = string
-  description = "Druid deep strorage type."
-  default = "s3"
-}
 
 variable "kubernetes_storage_class" {
   type = string
   description = "Storage class name for the Kubernetes cluster"
   default = "gp2"
 }
-
-variable "dataset_api_container_registry" {
-  type = string
-  description = "Container registry. For example docker.io/obsrv"
-  default = "sanketikahub"
-}
-
-variable "dataset_api_image_name" {
-  type = string
-  description = "Dataset api image name."
-  default = "config-service-ext"
-}
-
-variable "dataset_api_image_tag" {
-  type = string
-  description = "Dataset api image tag."
-  default = "1.0.2-GA"
-}
-
-variable "flink_container_registry" {
-  type = string
-  description = "Container registry. For example docker.io/obsrv"
-  default = "sanketikahub"
-}
-
-variable "flink_image_tag" {
-  type = string
-  description = "Flink kubernetes service name."
-  default = "1.0.2-GA"
-}
-
 variable "monitoring_grafana_oauth_configs" {
   type = map(any)
   description = "Grafana oauth related variables. See below commented code for values that need to be passed"
@@ -138,13 +86,6 @@ variable "web_console_image_repository" {
   default = "sanketikahub/obsrv-web-console"
 }
 
-variable "web_console_image_tag" {
-  type = string
-  description = "web console image tag."
-  default = "1.0.2-GA"
-}
-# web console variables end.
-
 variable "docker_registry_secret_dockerconfigjson" {
   type = string
   description = "The dockerconfigjson encoded in base64 format."
@@ -156,12 +97,6 @@ variable "docker_registry_secret_name" {
   default = "docker-registry-secret"
 }
 
-variable "command_service_image_tag" {
-  type = string
-  description = "CommandService image tag."
-  default = "1.0.2-GA"
-}
-
 variable "oauth_configs" {
   type = map(any)
   description = "Superset config variables. See the below commented code to know values to be passed "
@@ -184,34 +119,6 @@ variable "flowlogs_retention_in_days" {
   default = 7
 }
 
-variable "flink_release_names" {
-  description = "Create release names"
-  type = map(string)
-  default = {
-    extractor                 = "extractor"
-    preprocessor              = "preprocessor"
-    denormalizer              = "denormalizer"
-    transformer-ext           = "transformer-ext"
-    druid-router              = "druid-router"
-    master-data-processor-ext = "master-data-processor-ext"
-  }
-}
-
-variable "flink_unified_pipeline_release_names" {
-  description = "Create release names"
-  type = map(string)
-  default = {
-    unified-pipeline          = "unified-pipeline"
-    master-data-processor-ext = "master-data-processor-ext"
-  }
-}
-
-variable "unified_pipeline_enabled" {
-  description = "Toggle to deploy unified pipeline"
-  type = bool
-  default = true
-}
-
 variable "kong_loadbalancer_annotations" {
   type = string
   description = "Kong ingress loadbalancer annotations."
@@ -226,79 +133,6 @@ variable "kong_ingress_domain" {
   type = string
   description = "Kong ingress domain. Leave it empty if you dont have a domain name. If you have a domain, provide value such as obsrv.ai"
   default = ""
 }
-
-variable "letsencrypt_ssl_admin_email" {
-  type = string
-  description = "Letsencrypt ssl domain admin email."
-  default = "anandp@sanketika.in"
-}
-variable "grafana_secret_name" {
-  type = string
-  description = "The name of the secret. This will be sent back as an output which can be used in other modules"
-  default = "grafana-secret"
-}
-
-
-variable "system_rules_ingestor_container_registry" {
-  type = string
-  description = "Container registry. For example docker.io/obsrv"
-  default = "sanketikahub"
-}
-
-variable "system_rules_ingestor_image_name" {
-  type = string
-  description = "Dataset api image name."
-  default = "system-rules-ingestor"
-}
-
-variable "system_rules_ingestor_image_tag" {
-  type = string
-  description = "Dataset api image tag."
-  default = "1.0.2-GA"
-}
-
-variable "grafana_url" {
-  type = string
-  description = "grafana url"
-  default = "http://monitoring-grafana.monitoring.svc:80"
-}
-
-variable "spark_image_repository" {
-  type = string
-  description = "spark image repository."
-  default = "sanketikahub/spark"
-}
-
-variable "spark_image_tag" {
-  type = string
-  description = "spark image tag."
-  default = "3.3.0-debian-11-r26_1.0.3-GA"
-}
-
-variable "secor_image_tag" {
-  type = string
-  description = "secor image version"
-  default = "1.0.0-GA"
-}
-
-variable "superset_image_tag" {
-  type = string
-  description = "Superset image tag."
-  default = "3.0.2"
-}
-
-variable "redis_backup_image_tag" {
-  type = string
-  description = "Redis backup image tag."
-  default = "1.0.0"
-}
-
-variable "postgresql_backup_image_tag" {
-  type = string
-  description = "Postgresql backup image tag."
-  default = "0.5"
-}
-
 variable "eks_node_group_scaling_config" {
   type = map(number)
   description = "EKS node group auto scaling configuration."
@@ -319,33 +153,7 @@
 variable "eks_node_group_capacity_type" {
   type = string
   description = "EKS node group type. Either SPOT or ON_DEMAND can be used"
 }
-variable "redirection_auth_path" {
-  type = string
-  description = "Either obsrv or keycloak"
-}
-
-variable "smtp_enabled" {
-  type = bool
-  description = "enable the smtp server"
-  default = false
-}
-variable "smtp_config" {
-  type = map(string)
-  description = "smtp server configuration"
-  default = {
-    host            = ""
-    user            = ""
-    password        = ""
-    from_address    = ""
-    cert_file       = ""
-    key_file        = ""
-    ehlo_identity   = ""
-    startTLS_policy = ""
-    skip_verify     = "true"
-    from_name       = "obsrv"
-  }
-}
 
 variable "eks_endpoint_private_access" {
   type = bool
@@ -375,62 +183,3 @@ variable "storage_provider" {
   default = "aws"
 }
 
-variable "hudi_namespace" {
-  type = string
-  default = "hudi"
-  description = "Apache Hudi namespace"
-}
-
-variable "hudi_prefix_path" {
-  type = string
-  description = "Hudi prefix path"
-  default = "hudi"
-}
-
-variable "enable_lakehouse" {
-  type = bool
-  description = "Toggle to install hudi components (hms, trino and flink job)"
-}
-
-variable "lakehouse_host" {
-  type = string
-  description = "Lakehouse Host"
-  default = "http://trino.hudi.svc.cluster.local"
-}
-
-variable "lakehouse_port" {
-  type = string
-  description = "Trino port"
-  default = "8080"
-}
-
-variable "lakehouse_catalog" {
-  type = string
-  description = "Lakehouse Catalog name"
-  default = "lakehouse"
-}
-
-variable "lakehouse_schema" {
-  type = string
-  description = "Lakehouse Schema name"
-  default = "hms"
-}
-
-variable "lakehouse_default_user" {
-  type = string
-  description = "Lakehouse default user"
-  default = "admin"
-}
-
-
-variable "flink_image_name" {
-  type = string
-  description = "Flink image name."
-  default = "lakehouse-connector"
-}
-
-variable "flink_lakehouse_image_tag" {
-  type = string
-  description = "Flink lakehouse image tag."
-  default = "1.0.0-GA"
-}
-
diff --git a/terraform/aws/vars/cluster_overrides.tfvars b/terraform/aws/vars/cluster_overrides.tfvars
index 9c2a19aa..a156913a 100644
--- a/terraform/aws/vars/cluster_overrides.tfvars
+++ b/terraform/aws/vars/cluster_overrides.tfvars
@@ -9,21 +9,19 @@ create_velero_user = "true"
 vpc_id                = ""
 eks_nodes_subnet_ids  = [""]
 eks_master_subnet_ids = [""]
-velero_aws_access_key_id     = ""
-velero_aws_secret_access_key = ""
-enable_lakehouse             = false
+
 # cluster sizing
 eks_endpoint_private_access  = false
 eks_node_group_instance_type = ["t2.2xlarge"]
 eks_node_group_capacity_type = "ON_DEMAND"
 eks_node_group_scaling_config = {
-  desired_size = 2
-  max_size     = 2
+  desired_size = 3
+  max_size     = 3
   min_size     = 1
 }
 
 # Disk node size in gb
-eks_node_disk_size = 30
+eks_node_disk_size = 100
 
 create_s3_buckets = true
 s3_buckets = {
@@ -32,37 +30,3 @@ s3_buckets = {
   "checkpoint_storage_bucket" = "",
   "s3_backups_bucket"         = ""
 }
-
-# Image Tags
-command_service_image_tag       = "1.0.6-GA"
-web_console_image_tag           = "1.0.6-GA"
-system_rules_ingestor_image_tag = "1.0.2-GA"
-spark_image_tag                 = "3.3.0-debian-11-r26"
-dataset_api_image_tag           = "1.0.6-GA"
-flink_image_tag                 = "1.0.6-GA"
-flink_lakehouse_image_tag       = "1.0.1"
-secor_image_tag                 = "1.0.0-GA"
-superset_image_tag              = "3.0.2"
-
-
-# Backup image tags
-redis_backup_image_tag      = "1.0.5-GA"
-postgresql_backup_image_tag = "1.0.5-GA"
-
-redirection_auth_path = "keycloak"
-
-#smtp server confuration
-smtp_enabled = "false"
-smtp_config = {
-  host            = ""
-  user            = ""
-  password        = ""
-  from_address    = ""
-  cert_file       = ""
-  key_file        = ""
-  ehlo_identity   = ""
-  startTLS_policy = ""
-  skip_verify     = "true"
-  from_name       = "obsrv"
-
-}
diff --git a/terraform/modules/aws/s3/main.tf b/terraform/modules/aws/s3/main.tf
index ca2cd559..0ffeaa6d 100644
--- a/terraform/modules/aws/s3/main.tf
+++ b/terraform/modules/aws/s3/main.tf
@@ -57,9 +57,25 @@ resource "aws_s3_bucket" "s3_backups_bucket" {
       Name = "backups-${local.storage_bucket}"
     },
     local.common_tags,
-    var.additional_tags)
+    var.additional_tags
+  )
 }
 
+# Define CORS configuration separately for the bucket
+resource "aws_s3_bucket_cors_configuration" "s3_backups_bucket_cors" {
+  bucket = aws_s3_bucket.s3_backups_bucket.bucket
+
+  cors_rule {
+    allowed_headers = ["*"]
+    allowed_methods = ["GET", "POST", "PUT", "DELETE"]
+    allowed_origins = ["*"]
+    expose_headers  = ["ETag"]
+    max_age_seconds = 3000
+  }
+}
+
+
+
 # resource "aws_s3_object" "object" {
 #   for_each = fileset("../sample-data/", "*")
 #   bucket   = local.storage_bucket