diff --git a/contrib/kubespray/quick-start/services-configuration.yaml.template b/contrib/kubespray/quick-start/services-configuration.yaml.template
index 27df33f3e4..2e48fb0585 100644
--- a/contrib/kubespray/quick-start/services-configuration.yaml.template
+++ b/contrib/kubespray/quick-start/services-configuration.yaml.template
@@ -232,6 +232,9 @@ authentication:
# - receiver: pai-email-admin-user-and-stop-job
# match:
# alertname: PAIJobGpuPercentLowerThan0_3For1h
+# - receiver: pai-email-admin-and-fix-nvidia-gpu-low-perf
+# match:
+# alertname: NodeGpuLowPerfState
# customized-receivers: # receivers are combination of several actions
# - name: "pai-email-admin-user-and-stop-job"
# actions:
@@ -244,6 +247,11 @@ authentication:
# tag-jobs:
# tags:
# - 'stopped-by-alert-manager'
+# - name: "pai-email-admin-and-fix-nvidia-gpu-low-perf"
+# actions:
+# email-admin:
+# fix-nvidia-gpu-low-perf:
+
# uncomment following if you want to customize prometheus
# prometheus:
diff --git a/deployment/quick-start/services-configuration.yaml.template b/deployment/quick-start/services-configuration.yaml.template
index 2a30de3fbe..577cb262dc 100644
--- a/deployment/quick-start/services-configuration.yaml.template
+++ b/deployment/quick-start/services-configuration.yaml.template
@@ -92,6 +92,9 @@ rest-server:
# - receiver: pai-email-admin-user-and-stop-job
# match:
# alertname: PAIJobGpuPercentLowerThan0_3For1h
+# - receiver: pai-email-admin-and-fix-nvidia-gpu-low-perf
+# match:
+# alertname: NodeGpuLowPerfState
# customized-receivers: # receivers are combination of several actions
# - name: "pai-email-admin-user-and-stop-job"
# actions:
@@ -104,6 +107,11 @@ rest-server:
# tag-jobs:
# tags:
# - 'stopped-by-alert-manager'
+# - name: "pai-email-admin-and-fix-nvidia-gpu-low-perf"
+# actions:
+# email-admin:
+# fix-nvidia-gpu-low-perf:
+
# uncomment following if you want to customize prometheus
# prometheus:
diff --git a/docs/manual/cluster-admin/how-to-use-alert-system.md b/docs/manual/cluster-admin/how-to-use-alert-system.md
index 1a986bb05b..19953b5221 100644
--- a/docs/manual/cluster-admin/how-to-use-alert-system.md
+++ b/docs/manual/cluster-admin/how-to-use-alert-system.md
@@ -114,26 +114,29 @@ We have provided so far these following actions:
- `stop-jobs`: Stop jobs by calling OpenPAI REST API. **Be careful about this action because it stops jobs without notifying related users.**
- `tag-jobs`: Add a tag to jobs by calling OpenPAI REST API.
- `cordon-nodes`: Call Kubernetes API to cordon the corresponding nodes.
+- `fix-nvidia-gpu-low-perf`: Start a privileged container to fix the NVIDIA GPU Low Performance State issue.

But before you use them, you have to add proper configuration in the `alert-handler` field. For example, `email-admin` requires an SMTP account to send the email and an admin email address to receive it. Also, the `tag-jobs` and `stop-jobs` actions call the OpenPAI REST API, so you should set a REST server token for them. To get the token, you should go to your profile page (in the top-right corner of Webportal, click `View my profile`), and use `Create application token` to create one. Generally speaking, the configuration in the `alert-handler` field has two parts: `email-configs` and `pai-bearer-token`.
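For illustration only (the exact schema is defined by your cluster's `services-configuration.yaml`, and the addresses, SMTP settings, and token below are placeholders you must replace), an `alert-handler` section with both parts configured might look like:

```yaml
alert-manager:
  alert-handler:
    email-configs:                    # needed by email-admin / email-user
      admin-receiver: admin@example.com
      smtp-host: smtp.example.com
      smtp-port: 587
      smtp-from: alert-sender@example.com
      smtp-auth-username: alert-sender@example.com
      smtp-auth-password: password-for-alert-sender
    pai-bearer-token: 'your-application-token'   # needed by stop-jobs / tag-jobs
```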
The requirements for different actions are shown in the following table:

-| | email-configs | pai-bearer-token |
-| :-----------:| :-----------: | :--------------: |
-| cordon-nodes | - | - |
-| email-admin | required | - |
-| email-user | required | required |
-| stop-jobs | - | required |
-| tag-jobs | - | required |
+| | email-configs | pai-bearer-token |
+| :-------------------------: | :-----------: | :--------------: |
+| cordon-nodes | - | - |
+| email-admin | required | - |
+| email-user | required | required |
+| stop-jobs | - | required |
+| tag-jobs | - | required |
+| fix-nvidia-gpu-low-perf | - | - |

In addition, some actions may depend on certain fields in the `labels` of alert instances. The labels of the `alert instance` are generated based on the expression in the alert rule. For example, the expression of the `PAIJobGpuPercentLowerThan0_3For1h` alert we mentioned in the previous section is `avg(task_gpu_percent{virtual_cluster=~"default"}) by (job_name) < 0.3`. This expression returns a list, each element of which contains a `job_name` field. So there will also be a `job_name` field in the labels of the alert instance. The `stop-jobs` action depends on the `job_name` field and stops the corresponding job based on it. To inspect the labels of an alert, you can visit `http(s):///prometheus/alerts`. If the alert is firing, you can see its labels on this page. For the label fields each pre-defined action depends on, please refer to the following table:

-| | depended on label field |
-| :-----------:| :------------------: |
-| cordon-nodes | node_name |
-| email-admin | - |
-| email-user | - |
-| stop-jobs | job_name |
-| tag-jobs | job_name |
+| | depends on label field(s) |
+| :-------------------------: | :-----------------------: |
+| cordon-nodes | node_name |
+| email-admin | - |
+| email-user | - |
+| stop-jobs | job_name |
+| tag-jobs | job_name |
+| fix-nvidia-gpu-low-perf | node_name, minor_number |

The matching rules between alerts and actions are defined using `receivers` and `routes`.
diff --git a/examples/cluster-configuration/services-configuration.yaml b/examples/cluster-configuration/services-configuration.yaml
index 4933c0d602..6b4c1259ea 100644
--- a/examples/cluster-configuration/services-configuration.yaml
+++ b/examples/cluster-configuration/services-configuration.yaml
@@ -82,7 +82,6 @@ rest-server:
#github-path: marketplace
# Job Debugging Reservation Seconds.
#debugging-reservation-seconds: 604800
-
# uncomment following section if you want to customize the port of web portal
# webportal:
# server-port: 9286
@@ -125,6 +124,9 @@ rest-server:
# - receiver: pai-email-admin-user-and-stop-job
# match:
# alertname: PAIJobGpuPercentLowerThan0_3For1h
+# - receiver: pai-email-admin-and-fix-nvidia-gpu-low-perf
+# match:
+# alertname: NodeGpuLowPerfState
# customized-receivers: # receivers are combination of several actions
# - name: "pai-email-admin-user-and-stop-job"
# actions:
@@ -137,6 +139,10 @@ rest-server:
# tag-jobs:
# tags:
# - 'stopped-by-alert-manager'
+# - name: "pai-email-admin-and-fix-nvidia-gpu-low-perf"
+# actions:
+# email-admin:
+# fix-nvidia-gpu-low-perf:
# uncomment following if you want to customize prometheus
# prometheus:
@@ -172,8 +178,6 @@ rest-server:
# # key_name: yyyyyy
# # key_path: /path/to/yyyyyy
-
-
# uncomment following section if you want to customize the threshold of cleaner
# cleaner:
# threshold: 90
@@ -185,65 +189,65 @@ rest-server:
# uncomment following section, if you want to customize the authentication solution.
#authentication: - #OIDC: false - - # If OIDC is set as the value true, you will have to configure the following properties. - #OIDC-type: AAD - # - #AAD: - # # If you wanna configure AAD-OIDC for OpenPAI, the following configuration is mandatory. - # # National Clouds endpoint list https://docs.microsoft.com/en-us/azure/active-directory/develop/authentication-national-cloud - # # AZURE: https://login.microsoftonline.com/{tenantID}/v2.0/.well-known/openid-configuration - # # China: https://login.partner.microsoftonline.cn/{tenantID}/v2.0/.well-known/openid-configuration - # # Germany: https://login.microsoftonline.de/{tenantID}/v2.0/.well-known/openid-configuration - # wellKnownURL: https://login.microsoftonline.com/{tenantID}/v2.0/.well-known/openid-configuration - # - # # If you wanna configure AAD-OIDC for OpenPAI, the following configuration is mandatory. - # tenantID: ${tenat_id} - # - # # Required, the client ID of your app in AAD - # clientID: ${your_client_id} - # - # # Required if `responseType` is 'code', 'id_token code' or 'code id_token'. - # # If app key contains '\', replace it with '\\'. - # clientSecret: '${your_client_secret}' - # - # # Optional. The lifetime of nonce in session or cookie, the default value is 3600 (seconds). - # nonceLifetime: null - # - # # Optional. The max amount of nonce saved in session or cookie, the default value is 10. - # nonceMaxAmount: 5 - # - # # Optional. The clock skew allowed in token validation, the default value is 300 seconds. - # clockSkew: null - # - #group-manager: - # # basic: If you set group-data-source as the value basic, admin should manually modify user's grouplist. - # # winbind: If you set group-data-source as the value winbind, the user's grouplist will get from winbind server based on your configuration. - # group-data-source: basic - # - # # If you set winbind as your data source, you should configure this configuration. - # # winbind-server-address: xxxxxxx - # - # # Admin group name and its user list - # admin-group: - # groupname: admingroup - # description: "admin's group" - # externalName: "" - # - # # Group for default vc. - # # For yarn default queue hack. - # default-group: - # groupname: default - # description: "group for default vc" - # externalName: "" - # - # # If the following groups are not in the data store, it will be created by default. - # grouplist: - # - groupname: forexample - # # internal name - # description: forexample - # # description of the group - # externalName: "" - # # external name, it should be set if your group-data-source is winbind. And the name will be used to query and match the group from - # # the result of winbind. If the group-data-source is basic, this field is useless. +#OIDC: false + +# If OIDC is set as the value true, you will have to configure the following properties. +#OIDC-type: AAD +# +#AAD: +# # If you wanna configure AAD-OIDC for OpenPAI, the following configuration is mandatory. 
+# # National Clouds endpoint list https://docs.microsoft.com/en-us/azure/active-directory/develop/authentication-national-cloud +# # AZURE: https://login.microsoftonline.com/{tenantID}/v2.0/.well-known/openid-configuration +# # China: https://login.partner.microsoftonline.cn/{tenantID}/v2.0/.well-known/openid-configuration +# # Germany: https://login.microsoftonline.de/{tenantID}/v2.0/.well-known/openid-configuration +# wellKnownURL: https://login.microsoftonline.com/{tenantID}/v2.0/.well-known/openid-configuration +# +# # If you wanna configure AAD-OIDC for OpenPAI, the following configuration is mandatory. +# tenantID: ${tenat_id} +# +# # Required, the client ID of your app in AAD +# clientID: ${your_client_id} +# +# # Required if `responseType` is 'code', 'id_token code' or 'code id_token'. +# # If app key contains '\', replace it with '\\'. +# clientSecret: '${your_client_secret}' +# +# # Optional. The lifetime of nonce in session or cookie, the default value is 3600 (seconds). +# nonceLifetime: null +# +# # Optional. The max amount of nonce saved in session or cookie, the default value is 10. +# nonceMaxAmount: 5 +# +# # Optional. The clock skew allowed in token validation, the default value is 300 seconds. +# clockSkew: null +# +#group-manager: +# # basic: If you set group-data-source as the value basic, admin should manually modify user's grouplist. +# # winbind: If you set group-data-source as the value winbind, the user's grouplist will get from winbind server based on your configuration. +# group-data-source: basic +# +# # If you set winbind as your data source, you should configure this configuration. +# # winbind-server-address: xxxxxxx +# +# # Admin group name and its user list +# admin-group: +# groupname: admingroup +# description: "admin's group" +# externalName: "" +# +# # Group for default vc. +# # For yarn default queue hack. +# default-group: +# groupname: default +# description: "group for default vc" +# externalName: "" +# +# # If the following groups are not in the data store, it will be created by default. +# grouplist: +# - groupname: forexample +# # internal name +# description: forexample +# # description of the group +# externalName: "" +# # external name, it should be set if your group-data-source is winbind. And the name will be used to query and match the group from +# # the result of winbind. If the group-data-source is basic, this field is useless. diff --git a/src/alert-manager/build/nvidia-gpu-low-perf-fixer.common.dockerfile b/src/alert-manager/build/nvidia-gpu-low-perf-fixer.common.dockerfile new file mode 100644 index 0000000000..dd8050d05b --- /dev/null +++ b/src/alert-manager/build/nvidia-gpu-low-perf-fixer.common.dockerfile @@ -0,0 +1,22 @@ +# Copyright (c) Microsoft Corporation +# All rights reserved. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and +# to permit persons to whom the Software is furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING +# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +FROM nvidia/cuda:11.2.2-base-ubuntu16.04 + +COPY ./src/nvidia-gpu-low-perf-fixer . + +ENTRYPOINT /bin/bash nvidia-gpu-low-perf-fixer.sh diff --git a/src/alert-manager/config/alert_manager.py b/src/alert-manager/config/alert_manager.py index 6b33c15435..63ba1468ee 100644 --- a/src/alert-manager/config/alert_manager.py +++ b/src/alert-manager/config/alert_manager.py @@ -74,17 +74,14 @@ def run(self): else: token_configured = False + result["alert-handler"]["configured"] = True + result["actions-available"] = ["fix-nvidia-gpu-low-perf"] if email_configured and token_configured: - result["alert-handler"]["configured"] = True result["actions-available"].extend(["email-admin", "email-user", "stop-jobs", "tag-jobs"]) elif email_configured: - result["alert-handler"]["configured"] = True result["actions-available"].append("email-admin") elif token_configured: - result["alert-handler"]["configured"] = True result["actions-available"].extend(["stop-jobs", "tag-jobs"]) - else: - result["alert-handler"]["configured"] = False if result.get("cluster-utilization") is not None and \ result["cluster-utilization"].get("schedule") is not None and \ diff --git a/src/alert-manager/deploy/alert-manager-configmap.yaml.template b/src/alert-manager/deploy/alert-manager-configmap.yaml.template index 0169673ba5..28ed7d7a5a 100644 --- a/src/alert-manager/deploy/alert-manager-configmap.yaml.template +++ b/src/alert-manager/deploy/alert-manager-configmap.yaml.template @@ -122,6 +122,11 @@ data: - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/cordon-nodes' send_resolved: false {% endif %} + + {% if (receiver["actions"]["fix-nvidia-gpu-low-perf"] is defined) and ('fix-nvidia-gpu-low-perf' in cluster_cfg["alert-manager"]["actions-available"]) %} + - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/fix-nvidia-gpu-low-perf' + send_resolved: false + {% endif %} {% endfor %} diff --git a/src/alert-manager/deploy/alert-manager-deployment.yaml.template b/src/alert-manager/deploy/alert-manager-deployment.yaml.template index a9fce2333a..43ffb4d87c 100755 --- a/src/alert-manager/deploy/alert-manager-deployment.yaml.template +++ b/src/alert-manager/deploy/alert-manager-deployment.yaml.template @@ -67,6 +67,10 @@ spec: value: {{ cluster_cfg["cluster"]["common"]["cluster-id"] }} - name: REST_SERVER_URI value: {{ cluster_cfg['rest-server']['uri'] }} + - name: DOCKER_REGISTRY_PREFIX + value: {{ cluster_cfg['cluster']['docker-registry']['prefix'] }} + - name: DOCKER_REGISTRY_TAG + value: {{ cluster_cfg['cluster']['docker-registry']['tag'] }} - name: WEBPORTAL_URI {%- if "ssl" in cluster_cfg["pylon"] and cluster_cfg["pylon"]["ssl"] %} value: "{{ cluster_cfg['pylon']['uri-https']}}" diff --git a/src/alert-manager/deploy/rbac.yaml b/src/alert-manager/deploy/rbac.yaml index 1afbaf0109..89073ff43b 100644 --- a/src/alert-manager/deploy/rbac.yaml +++ b/src/alert-manager/deploy/rbac.yaml @@ -15,6 +15,9 @@ rules: - apiGroups: [""] resources: ["nodes"] verbs: ["patch"] + - 
apiGroups: ["batch"] + resources: ["jobs"] + verbs: ["create", "list", "delete"] --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding diff --git a/src/alert-manager/src/alert-handler/controllers/kubernetes.js b/src/alert-manager/src/alert-handler/controllers/kubernetes.js new file mode 100644 index 0000000000..ca41879726 --- /dev/null +++ b/src/alert-manager/src/alert-handler/controllers/kubernetes.js @@ -0,0 +1,63 @@ +// Copyright (c) Microsoft Corporation +// All rights reserved. +// +// MIT License +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +// documentation files (the "Software"), to deal in the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and +// to permit persons to whom the Software is furnished to do so, subject to the following conditions: +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING +// BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +const k8s = require('@kubernetes/client-node'); +const kc = new k8s.KubeConfig(); +const logger = require('@alert-handler/common/logger'); + +// clean TTL 24 hours jobs created by alert-handler +const cleanTTL24HJobs = () => { + logger.info('Cleaning completed TTL 24h jobs...'); + + const k8sApi = kc.makeApiClient(k8s.BatchV1Api); + k8sApi + .listNamespacedJob( + 'default', + undefined, + undefined, + undefined, + undefined, + 'created-by=alert-handler,time-to-live=24h', // labelSelector + ) + .then((response) => { + logger.info(`Successfully get job list.`); + const jobs = response.body.items; + jobs.forEach((job) => { + const jobName = job.metadata.name; + if ( + (job.status.succeeded === 1 || jobs.status.failed === 1) && // check if the job has completed + new Date() - new Date(job.status.completionTime) > 24 * 60 * 60 * 1000 // completed for more than 24h + ) + k8sApi + .deleteNamespacedJob(jobName, 'default') + .then((response) => { + logger.info(`Successfully deleted job ${jobName}`); + }) + .catch((error) => { + logger.info(`Failed to delete job ${jobName}`, error); + }); + }); + }) + .catch((error) => { + logger.error('Failed to list jobs:', error); + }); +}; + +// module exports +module.exports = { + cleanTTL24HJobs, +}; diff --git a/src/alert-manager/src/alert-handler/controllers/mail.js b/src/alert-manager/src/alert-handler/controllers/mail.js index 52f80a5070..ddc96a1369 100755 --- a/src/alert-manager/src/alert-handler/controllers/mail.js +++ b/src/alert-manager/src/alert-handler/controllers/mail.js @@ -88,19 +88,6 @@ const sendEmailToAdmin = (req, res) => { }); }; -const getUserNameByJobName = async (jobName, token) => { - return axios - .get(`${process.env.REST_SERVER_URI}/api/v2/jobs/${jobName}`, { - headers: { - Authorization: `Bearer ${token}`, - 'Content-Type': 'application/json', - }, - }) - .then((response) => { - return response.data.jobStatus.username; - }); -}; - const getUserEmail = async (username, 
token) => {
  return axios
    .get(`${process.env.REST_SERVER_URI}/api/v2/users/${username}`, {
@@ -132,7 +119,7 @@ const sendEmailToUser = async (req, res) => {
  // group alerts by username
  const alertsGrouped = {};
  alerts.map((alert, index) => {
-    let userName = alert.labels.job_name.split('~')[0];
+    const userName = alert.labels.job_name.split('~')[0];
    if (userName in alertsGrouped) {
      alertsGrouped[userName].push(alerts[index]);
    } else {
diff --git a/src/alert-manager/src/alert-handler/controllers/node.js b/src/alert-manager/src/alert-handler/controllers/node.js
index 856b47a81f..39dd01132d 100644
--- a/src/alert-manager/src/alert-handler/controllers/node.js
+++ b/src/alert-manager/src/alert-handler/controllers/node.js
@@ -18,15 +18,16 @@
const k8s = require('@kubernetes/client-node');
const kc = new k8s.KubeConfig();
const logger = require('@alert-handler/common/logger');
+const crypto = require('crypto');

kc.loadFromDefault();
-const k8sApi = kc.makeApiClient(k8s.CoreV1Api);

const cordonNode = async (nodeName) => {
  const headers = {
    'content-type': 'application/strategic-merge-patch+json',
  };
  // set the node unschedulable
+  const k8sApi = kc.makeApiClient(k8s.CoreV1Api);
  return k8sApi.patchNode(
    nodeName,
    { spec: { unschedulable: true } },
@@ -72,7 +73,108 @@ const cordonNodes = (req, res) => {
    });
};

+const getK8sV1Job = (jobName, nodeName, minorNumber) => {
+  const DOCKER_REGISTRY_PREFIX = process.env.DOCKER_REGISTRY_PREFIX;
+  const DOCKER_REGISTRY_TAG = process.env.DOCKER_REGISTRY_TAG;
+  const job = {
+    apiVersion: 'batch/v1',
+    kind: 'Job',
+    metadata: {
+      name: jobName,
+      labels: {
+        'created-by': 'alert-handler',
+        'time-to-live': '24h',
+      },
+    },
+    spec: {
+      // The TTL feature is currently alpha (Kubernetes 1.15).
+      // To avoid relying on this feature, jobs with the labels `time-to-live=24h` & `created-by=alert-handler` are cleaned up regularly by `cleanTTL24HJobs`.
+      // ttlSecondsAfterFinished: 86400,
+      template: {
+        spec: {
+          containers: [
+            {
+              name: 'nvidia-gpu-low-perf-fixer',
+              image: `${DOCKER_REGISTRY_PREFIX}nvidia-gpu-low-perf-fixer:${DOCKER_REGISTRY_TAG}`,
+              imagePullPolicy: 'Always',
+              env: [
+                {
+                  name: 'MINOR_NUMBER',
+                  value: `${minorNumber}`,
+                },
+              ],
+              securityContext: {
+                privileged: true,
+              },
+            },
+          ],
+          restartPolicy: 'Never',
+          nodeSelector: {
+            'kubernetes.io/hostname': nodeName,
+          },
+        },
+      },
+    },
+  };
+  return job;
+};
+
+// start a k8s job for each GPU card to fix the NvidiaGPULowPerf issue
+const fixNvidiaGPULowPerf = (req, res) => {
+  logger.info(
+    'Received `fixNvidiaGPULowPerf` post request from alert-manager.',
+  );
+  // filter alerts which are firing and contain `node_name` & `minor_number` as labels
+  const jobsInfo = req.body.alerts
+    .filter(
+      (alert) =>
+        alert.status === 'firing' &&
+        'node_name' in alert.labels &&
+        'minor_number' in alert.labels,
+    )
+    // map each alert to a job
+    .map((alert) => ({
+      jobName: `nvidia-gpu-low-perf-fixer-${crypto
+        .createHash('md5')
+        .update(alert.labels.node_name + alert.labels.minor_number)
+        .digest('hex')}`, // unique job per GPU card
+      nodeName: alert.labels.node_name,
+      minorNumber: alert.labels.minor_number,
+      DOCKER_REGISTRY_PREFIX: process.env.DOCKER_REGISTRY_PREFIX,
+      DOCKER_REGISTRY_TAG: process.env.DOCKER_REGISTRY_TAG,
+    }));
+
+  const k8sApi = kc.makeApiClient(k8s.BatchV1Api);
+  jobsInfo.forEach(async (jobInfo) => {
+    // get k8s V1Job
+    const job = getK8sV1Job(
+      jobInfo.jobName,
+      jobInfo.nodeName,
+      jobInfo.minorNumber,
+    );
+    k8sApi
+      .createNamespacedJob('default', job)
+      .then((response) => {
+        logger.info(
+          `Successfully started job ${jobInfo.jobName} for GPU Low Performance issue in node: ${jobInfo.nodeName}, minor number: ${jobInfo.minorNumber}`,
+        );
+      })
+      .catch((error) => {
+        // ignore the job creation if it already exists
+        if (error.response && error.response.statusCode === 409) {
+          logger.warn(`Kubernetes job ${jobInfo.jobName} already exists.`);
+        } else {
+          logger.error(error);
+          res.status(500).json({
+            message: `Failed to start job to fix NvidiaGPULowPerf`,
+          });
+        }
+      });
+  });
+};
+
// module exports
module.exports = {
  cordonNodes,
+  fixNvidiaGPULowPerf,
};
diff --git a/src/alert-manager/src/alert-handler/index.js b/src/alert-manager/src/alert-handler/index.js
index d0d78a279b..bc0d121c89 100755
--- a/src/alert-manager/src/alert-handler/index.js
+++ b/src/alert-manager/src/alert-handler/index.js
@@ -23,6 +23,7 @@ require('module-alias/register');
const express = require('express');
const bearerToken = require('express-bearer-token');
const actions = require('@alert-handler/routes/actions');
+const k8sController = require('@alert-handler/controllers/kubernetes');
const logger = require('@alert-handler/common/logger');

const app = express();
@@ -36,3 +37,6 @@ const port = parseInt(process.env.SERVER_PORT);
app.listen(port, () => {
  logger.info(`alert-handler listening at http://localhost:${port}`);
});
+
+// every hour, clean up completed jobs that were used to fix the NvidiaGPULowPerf issue
+setInterval(k8sController.cleanTTL24HJobs, 60 * 60 * 1000);
diff --git a/src/alert-manager/src/alert-handler/routes/actions.js b/src/alert-manager/src/alert-handler/routes/actions.js
index 6442f2056e..734eedad7f 100644
--- a/src/alert-manager/src/alert-handler/routes/actions.js
+++ b/src/alert-manager/src/alert-handler/routes/actions.js
@@ -50,4 +50,9 @@ router
  /** POST /alert-handler/cordon-nodes */
  .post(nodeController.cordonNodes);

+router
+  .route('/alert-handler/fix-nvidia-gpu-low-perf')
+  /** POST /alert-handler/fix-nvidia-gpu-low-perf */
+  .post(nodeController.fixNvidiaGPULowPerf);
+
module.exports = router;
diff --git a/src/alert-manager/src/nvidia-gpu-low-perf-fixer/nvidia-gpu-low-perf-fixer.sh b/src/alert-manager/src/nvidia-gpu-low-perf-fixer/nvidia-gpu-low-perf-fixer.sh
new file mode 100644
index 0000000000..8903f09f3f
--- /dev/null
+++ b/src/alert-manager/src/nvidia-gpu-low-perf-fixer/nvidia-gpu-low-perf-fixer.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+set -ex
+
+echo "MINOR_NUMBER: ${MINOR_NUMBER}"
+
+nvidia-smi -pm ENABLED -i "${MINOR_NUMBER}"
+
+MAX_MEMORY_CLOCK=$(nvidia-smi -q -d SUPPORTED_CLOCKS | grep Memory | awk -v max=0 '{if($3>max){max=$3}}END{print max}')
+MAX_GRAPHICS_CLOCK=$(nvidia-smi -q -d SUPPORTED_CLOCKS | grep Graphics | awk -v max=0 '{if($3>max){max=$3}}END{print max}')
+echo "MAX_MEMORY_CLOCK: ${MAX_MEMORY_CLOCK}, MAX_GRAPHICS_CLOCK: ${MAX_GRAPHICS_CLOCK}"
+
+nvidia-smi -ac "${MAX_MEMORY_CLOCK},${MAX_GRAPHICS_CLOCK}" -i "${MINOR_NUMBER}"
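For reference, below is a hand-rendered sketch of the Kubernetes Job that `fixNvidiaGPULowPerf` submits for each firing alert. Values in angle brackets are placeholders: the real name is an MD5 hash of `node_name` + `minor_number`, and the image comes from `DOCKER_REGISTRY_PREFIX` / `DOCKER_REGISTRY_TAG`.

```yaml
# Sketch of the Job built by getK8sV1Job() (placeholder values shown).
apiVersion: batch/v1
kind: Job
metadata:
  name: nvidia-gpu-low-perf-fixer-<md5-of-node-name-and-minor-number>
  labels:
    created-by: alert-handler      # used by cleanTTL24HJobs for cleanup
    time-to-live: 24h
spec:
  template:
    spec:
      containers:
        - name: nvidia-gpu-low-perf-fixer
          image: <docker-registry-prefix>nvidia-gpu-low-perf-fixer:<tag>
          imagePullPolicy: Always
          env:
            - name: MINOR_NUMBER   # GPU index passed to nvidia-gpu-low-perf-fixer.sh
              value: "1"
          securityContext:
            privileged: true       # required so nvidia-smi can change clocks on the host GPU
      restartPolicy: Never
      nodeSelector:
        kubernetes.io/hostname: <node-name>
```

The privileged container runs `nvidia-gpu-low-perf-fixer.sh` on the selected node, and `cleanTTL24HJobs` deletes the Job once it has been completed for more than 24 hours.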