diff --git a/charts/akash-provider/Chart.yaml b/charts/akash-provider/Chart.yaml
index 889606e..2026e68 100644
--- a/charts/akash-provider/Chart.yaml
+++ b/charts/akash-provider/Chart.yaml
@@ -17,7 +17,7 @@ type: application
 
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
 # Major version bit highlights the mainnet release (e.g. mainnet4 = 4.x.x, mainnet5 = 5.x.x, ...)
-version: 11.1.0
+version: 11.1.1
 
 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to
diff --git a/charts/akash-provider/templates/statefulset.yaml b/charts/akash-provider/templates/statefulset.yaml
index 7c2d00b..3e69c0c 100644
--- a/charts/akash-provider/templates/statefulset.yaml
+++ b/charts/akash-provider/templates/statefulset.yaml
@@ -249,6 +249,18 @@ spec:
               value: "{{ .Values.minimumbalance }}"
             - name: AKASH_BID_DEPOSIT
               value: "{{ .Values.bidmindeposit }}"
+            # hasKey (not `default`) is used for maxRetries so an explicit 0 is honoured instead of
+            # being replaced by 40; int64 keeps large values out of scientific notation ("1e+06").
+            - name: AKASH_MONITOR_MAX_RETRIES
+              value: {{ if (hasKey (.Values.monitor | default dict) "maxRetries") }}{{ (.Values.monitor | default dict).maxRetries | int64 | quote }}{{ else }}"40"{{ end }}
+            - name: AKASH_MONITOR_RETRY_PERIOD
+              value: {{ (.Values.monitor | default dict).retryPeriod | default "4s" | quote }}
+            - name: AKASH_MONITOR_RETRY_PERIOD_JITTER
+              value: {{ (.Values.monitor | default dict).retryPeriodJitter | default "15s" | quote }}
+            - name: AKASH_MONITOR_HEALTHCHECK_PERIOD
+              value: {{ (.Values.monitor | default dict).healthcheckPeriod | default "10s" | quote }}
+            - name: AKASH_MONITOR_HEALTHCHECK_PERIOD_JITTER
+              value: {{ (.Values.monitor | default dict).healthcheckPeriodJitter | default "5s" | quote }}
 
           ports:
             - name: api
diff --git a/charts/akash-provider/values.yaml b/charts/akash-provider/values.yaml
index 0f607bf..2b966fb 100644
--- a/charts/akash-provider/values.yaml
+++ b/charts/akash-provider/values.yaml
@@ -93,6 +93,37 @@ ipoperator: false
 
 debug: "false"
 
+# monitor
+##
+
+# These monitor settings control the retry behavior for deployment checks and health monitoring.
+#
+# When a user deployment cannot start for any reason - an issue with the deployment itself,
+# a provider issue (e.g., a node lost to a disk or networking failure), or scheduled maintenance -
+# the monitor starts counting retry attempts. These settings determine how many times and how
+# frequently the provider retries before it marks the deployment as failed and closes the lease.
+#
+# With the default settings (maxRetries = 40, retryPeriod = 4s, retryPeriodJitter = 15s), the total retry time is ~8 minutes (40 * 11.5s = 460s).
+#
+# Setting maxRetries to 0 marks the deployment as failed immediately, without attempting any retries.
+#
+# Formula: Total Time = maxRetries * (retryPeriod + average retryPeriodJitter)
+# Example for 1 hour (3600 seconds): maxRetries = 3600 / (retryPeriod + average retryPeriodJitter)
+# Example for 24 hours (86400 seconds): maxRetries = 86400 / (retryPeriod + average retryPeriodJitter)
+# Example for 48 hours (172800 seconds): maxRetries = 172800 / (retryPeriod + average retryPeriodJitter)
+
+# Worked examples with the defaults: retryPeriod = 4s, retryPeriodJitter = 15s (average retryPeriodJitter = 7.5s)
+# Example for 1 hour: maxRetries = 3600 / (4 + 7.5) = 3600 / 11.5 ≈ 313
+# Example for 24 hours: maxRetries = 86400 / (4 + 7.5) = 86400 / 11.5 ≈ 7513
+# Example for 48 hours: maxRetries = 172800 / (4 + 7.5) = 172800 / 11.5 ≈ 15026
+
+monitor:
+  maxRetries: 40  # Maximum retry attempts before the lease is closed (0 = fail immediately, no retries)
+  retryPeriod: 4s  # Base time interval between retries
+  retryPeriodJitter: 15s  # Jitter added on top of retryPeriod (averages half this value, i.e. 7.5s)
+  healthcheckPeriod: 10s  # Interval between health checks
+  healthcheckPeriodJitter: 5s  # Jitter for the health check period
+
 # Percentage of CPU overcommit
 overcommit_pct_cpu: 0
 # Percentage of memory overcommit