microsoft · suiguoxin · Mar 31, 2021 · Mar 17, 2021 · Mar 19, 2021 · Mar 19, 2021
diff --git a/contrib/kubespray/quick-start/services-configuration.yaml.template b/contrib/kubespray/quick-start/services-configuration.yaml.template
@@ -232,6 +232,9 @@ authentication:
 #     - receiver: pai-email-admin-user-and-stop-job
 #       match:
 #         alertname: PAIJobGpuPercentLowerThan0_3For1h
+#     - receiver: pai-email-admin-and-fix-nvidia-gpu-low-perf
+#       match:
+#         alertname: NodeGpuLowPerfState
 #   customized-receivers: # receivers are combination of several actions
 #   - name: "pai-email-admin-user-and-stop-job"
 #     actions:
@@ -244,6 +247,11 @@ authentication:
 #       tag-jobs:
 #         tags:
 #         - 'stopped-by-alert-manager'
+#   - name: "pai-email-admin-and-fix-nvidia-gpu-low-perf"
+#     actions:
+#       email-admin:
+#       fix-nvidia-gpu-low-perf:
+
 
 # uncomment following if you want to customize prometheus
 # prometheus:

diff --git a/deployment/quick-start/services-configuration.yaml.template b/deployment/quick-start/services-configuration.yaml.template
@@ -92,6 +92,9 @@ rest-server:
 #     - receiver: pai-email-admin-user-and-stop-job
 #       match:
 #         alertname: PAIJobGpuPercentLowerThan0_3For1h
+#     - receiver: pai-email-admin-and-fix-nvidia-gpu-low-perf
+#       match:
+#         alertname: NodeGpuLowPerfState
 #   customized-receivers: # receivers are combination of several actions
 #   - name: "pai-email-admin-user-and-stop-job"
 #     actions:
@@ -104,6 +107,11 @@ rest-server:
 #       tag-jobs:
 #         tags:
 #         - 'stopped-by-alert-manager'
+#   - name: "pai-email-admin-and-fix-nvidia-gpu-low-perf"
+#     actions:
+#       email-admin:
+#       fix-nvidia-gpu-low-perf:
+
 
 # uncomment following if you want to customize prometheus
 # prometheus:

diff --git a/docs/manual/cluster-admin/how-to-use-alert-system.md b/docs/manual/cluster-admin/how-to-use-alert-system.md
@@ -114,26 +114,29 @@ We have provided so far these following actions:
   - `stop-jobs`: Stop jobs by calling OpenPAI REST API. **Be careful about this action because it stops jobs without notifying related users.**
   - `tag-jobs`: Add a tag to jobs by calling OpenPAI REST API.
   - `cordon-nodes`: Call Kubernetes API to cordon the corresponding nodes.
+  - `fix-nvidia-gpu-low-perf`: Start a privileged container to fix the NVIDIA GPU low-performance-state issue.
 
 But before you use them, you have to add proper configuration in the `alert-handler` field. For example, `email-admin` needs you to set up an SMTP account to send the email and an admin email address to receive the email. Also, the `tag-jobs` and `stop-jobs` actions call the OpenPAI REST API, so you should set a REST server token for them. To get the token, you should go to your profile page (in the top-right corner on Webportal, click `View my profile`), and use `Create application token` to create one. Generally speaking, there are two parts of the configuration in the `alert-handler` field. One is `email-configs`. The other is `pai-bearer-token`. The requirements for different actions are shown in the following table:
 
-|              | email-configs | pai-bearer-token |
-| :-----------:| :-----------: | :--------------: |
-| cordon-nodes | -             | -                |
-| email-admin  | required      | -                |
-| email-user   | required      | required         |
-| stop-jobs    | -             | required         |
-| tag-jobs     | -             | required         |
+|                             | email-configs | pai-bearer-token |
+| :-------------------------: | :-----------: | :--------------: |
+| cordon-nodes                | -             | -                |
+| email-admin                 | required      | -                |
+| email-user                  | required      | required         |
+| stop-jobs                   | -             | required         |
+| tag-jobs                    | -             | required         |
+| fix-nvidia-gpu-low-perf     | -             | -                |
 
 In addition, some actions may depend on certain fields in the `labels` of alert instances. The labels of the `alert instance` are generated based on the expression in the alert rule. For example, the expression of the `PAIJobGpuPercentLowerThan0_3For1h` alert we mentioned in the previous section is `avg(task_gpu_percent{virtual_cluster=~"default"}) by (job_name) < 0.3`. This expression returns a list in which each element contains the `job_name` field, so there will also be a `job_name` field in the labels of the alert instance. The `stop-jobs` action depends on the `job_name` field and stops the corresponding job based on it. To inspect the labels of an alert, you can visit `http(s)://<your master IP>/prometheus/alerts`. If the alert is firing, you can see its labels on this page. For the label fields that each pre-defined action depends on, please refer to the following table:
 
-|              | depended on label field |
-| :-----------:| :------------------: |
-| cordon-nodes | node_name            |
-| email-admin  | -                    | 
-| email-user   | -                    |
-| stop-jobs    | job_name             |
-| tag-jobs     | job_name             |
+|                             | depends on label fields |
+| :-------------------------: | :---------------------: |
+| cordon-nodes                | node_name               |
+| email-admin                 | -                       | 
+| email-user                  | -                       |
+| stop-jobs                   | job_name                |
+| tag-jobs                    | job_name                |
+| fix-nvidia-gpu-low-perf     | node_name, minor_number |
 
 
 The matching rules between alerts and actions are defined using `receivers` and `routes`.

diff --git a/examples/cluster-configuration/services-configuration.yaml b/examples/cluster-configuration/services-configuration.yaml
@@ -82,7 +82,6 @@ rest-server:
   #github-path: marketplace
   # Job Debugging Reservation Seconds.
   #debugging-reservation-seconds: 604800
-
 # uncomment following section if you want to customize the port of web portal
 # webportal:
 #   server-port: 9286
@@ -125,6 +124,9 @@ rest-server:
 #     - receiver: pai-email-admin-user-and-stop-job
 #       match:
 #         alertname: PAIJobGpuPercentLowerThan0_3For1h
+#     - receiver: pai-email-admin-and-fix-nvidia-gpu-low-perf
+#       match:
+#         alertname: NodeGpuLowPerfState
 #   customized-receivers: # receivers are combination of several actions
 #   - name: "pai-email-admin-user-and-stop-job"
 #     actions: 
@@ -137,6 +139,10 @@ rest-server:
 #       tag-jobs:
 #         tags: 
 #         - 'stopped-by-alert-manager'
+#   - name: "pai-email-admin-and-fix-nvidia-gpu-low-perf"
+#     actions:
+#       email-admin:
+#       fix-nvidia-gpu-low-perf:
 
 # uncomment following if you want to customize prometheus
 # prometheus:
@@ -172,8 +178,6 @@ rest-server:
 #  # key_name: yyyyyy
 #  # key_path: /path/to/yyyyyy
 
-
-
 # uncomment following section if you want to customize the threshold of cleaner
 # cleaner:
 #  threshold: 90
@@ -185,65 +189,65 @@ rest-server:
 
 # uncomment following section, if you want to customize the authentication solution.
 #authentication:
-  #OIDC: false
-
-  # If OIDC is set as the value true, you will have to configure the following properties.
-  #OIDC-type: AAD
-  #
-  #AAD:
-  #  # If you wanna configure AAD-OIDC for OpenPAI, the following configuration is mandatory.
-  #  # National Clouds endpoint list https://docs.microsoft.com/en-us/azure/active-directory/develop/authentication-national-cloud
-  #  # AZURE: https://login.microsoftonline.com/{tenantID}/v2.0/.well-known/openid-configuration
-  #  # China: https://login.partner.microsoftonline.cn/{tenantID}/v2.0/.well-known/openid-configuration
-  #  # Germany: https://login.microsoftonline.de/{tenantID}/v2.0/.well-known/openid-configuration
-  #  wellKnownURL: https://login.microsoftonline.com/{tenantID}/v2.0/.well-known/openid-configuration
-  #
-  #  # If you wanna configure AAD-OIDC for OpenPAI, the following configuration is mandatory.
-  #  tenantID: ${tenat_id}
-  #
-  #  # Required, the client ID of your app in AAD
-  #  clientID: ${your_client_id}
-  #
-  #  # Required if `responseType` is 'code', 'id_token code' or 'code id_token'.
-  #  # If app key contains '\', replace it with '\\'.
-  #  clientSecret: '${your_client_secret}'
-  #
-  #  # Optional. The lifetime of nonce in session or cookie, the default value is 3600 (seconds).
-  #  nonceLifetime: null
-  #
-  #  # Optional. The max amount of nonce saved in session or cookie, the default value is 10.
-  #  nonceMaxAmount: 5
-  #
-  #  # Optional. The clock skew allowed in token validation, the default value is 300 seconds.
-  #  clockSkew: null
-  #
-  #group-manager:
-  #  # basic: If you set group-data-source as the value basic, admin should manually modify user's grouplist.
-  #  # winbind: If you set group-data-source as the value winbind, the user's grouplist will get from winbind server based on your configuration.
-  #  group-data-source: basic
-  #
-  #  # If you set winbind as your data source, you should configure this configuration.
-  #  # winbind-server-address: xxxxxxx
-  #
-  #  # Admin group name and its user list
-  #  admin-group:
-  #    groupname: admingroup
-  #    description: "admin's group"
-  #    externalName: ""
-  #
-  #  # Group for default vc.
-  #  # For yarn default queue hack.
-  #  default-group:
-  #    groupname: default
-  #    description: "group for default vc"
-  #    externalName: ""
-  #
-  #  # If the following groups are not in the data store, it will be created by default.
-  #  grouplist:
-  #    - groupname: forexample
-  #      # internal name
-  #      description: forexample
-  #      # description of the group
-  #      externalName: ""
-  #      # external name, it should be set if your group-data-source is winbind. And the name will be used to query and match the group from
-  #      # the result of winbind. If the group-data-source is basic, this field is useless.
+#OIDC: false
+
+# If OIDC is set as the value true, you will have to configure the following properties.
+#OIDC-type: AAD
+#
+#AAD:
+#  # If you want to configure AAD-OIDC for OpenPAI, the following configuration is mandatory.
+#  # National Clouds endpoint list https://docs.microsoft.com/en-us/azure/active-directory/develop/authentication-national-cloud
+#  # AZURE: https://login.microsoftonline.com/{tenantID}/v2.0/.well-known/openid-configuration
+#  # China: https://login.partner.microsoftonline.cn/{tenantID}/v2.0/.well-known/openid-configuration
+#  # Germany: https://login.microsoftonline.de/{tenantID}/v2.0/.well-known/openid-configuration
+#  wellKnownURL: https://login.microsoftonline.com/{tenantID}/v2.0/.well-known/openid-configuration
+#
+#  # If you want to configure AAD-OIDC for OpenPAI, the following configuration is mandatory.
+#  tenantID: ${tenant_id}
+#
+#  # Required, the client ID of your app in AAD
+#  clientID: ${your_client_id}
+#
+#  # Required if `responseType` is 'code', 'id_token code' or 'code id_token'.
+#  # If app key contains '\', replace it with '\\'.
+#  clientSecret: '${your_client_secret}'
+#
+#  # Optional. The lifetime of nonce in session or cookie, the default value is 3600 (seconds).
+#  nonceLifetime: null
+#
+#  # Optional. The max amount of nonce saved in session or cookie, the default value is 10.
+#  nonceMaxAmount: 5
+#
+#  # Optional. The clock skew allowed in token validation, the default value is 300 seconds.
+#  clockSkew: null
+#
+#group-manager:
+#  # basic: If you set group-data-source as the value basic, the admin should manually modify each user's grouplist.
+#  # winbind: If you set group-data-source as the value winbind, the user's grouplist will be fetched from the winbind server based on your configuration.
+#  group-data-source: basic
+#
+#  # If you set winbind as your data source, you should configure the following field.
+#  # winbind-server-address: xxxxxxx
+#
+#  # Admin group name and its user list
+#  admin-group:
+#    groupname: admingroup
+#    description: "admin's group"
+#    externalName: ""
+#
+#  # Group for default vc.
+#  # For yarn default queue hack.
+#  default-group:
+#    groupname: default
+#    description: "group for default vc"
+#    externalName: ""
+#
+#  # If the following groups are not in the data store, they will be created by default.
+#  grouplist:
+#    - groupname: forexample
+#      # internal name
+#      description: forexample
+#      # description of the group
+#      externalName: ""
+#      # External name. It should be set if your group-data-source is winbind; the name will be used to query and match the group
+#      # in the result from winbind. If the group-data-source is basic, this field is not used.
diff --git a/src/alert-manager/build/nvidia-gpu-low-perf-fixer.common.dockerfile b/src/alert-manager/build/nvidia-gpu-low-perf-fixer.common.dockerfile
@@ -0,0 +1,25 @@
+# Copyright (c) Microsoft Corporation
+# All rights reserved.
+#
+# MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
+# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
+# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+FROM nvidia/cuda:11.2.2-base-ubuntu16.04
+# Install sudo; -y keeps the install non-interactive so the build never waits on an apt prompt.
+RUN apt-get -y update && \
+    apt-get install -y sudo
+# Copy the contents of the fixer source directory into the image's working directory.
+COPY ./src/nvidia-gpu-low-perf-fixer .
+# Exec form: bash is started directly (no /bin/sh -c wrapper), so it receives signals and arguments.
+ENTRYPOINT ["/bin/bash", "nvidia-gpu-low-perf-fixer.sh"]
diff --git a/src/alert-manager/config/alert_manager.py b/src/alert-manager/config/alert_manager.py
@@ -74,17 +74,14 @@ def run(self):
         else:
             token_configured = False
 
+        result["alert-handler"]["configured"] = True
+        result["actions-available"] = ["fix-nvidia-gpu-low-perf"]
         if email_configured and token_configured:
-            result["alert-handler"]["configured"] = True
             result["actions-available"].extend(["email-admin", "email-user", "stop-jobs", "tag-jobs"])
         elif email_configured:
-            result["alert-handler"]["configured"] = True
             result["actions-available"].append("email-admin")
         elif token_configured:
-            result["alert-handler"]["configured"] = True
             result["actions-available"].extend(["stop-jobs", "tag-jobs"])
-        else:
-            result["alert-handler"]["configured"] = False
 
         if result.get("cluster-utilization") is not None and \
             result["cluster-utilization"].get("schedule") is not None and \

diff --git a/src/alert-manager/deploy/alert-manager-configmap.yaml.template b/src/alert-manager/deploy/alert-manager-configmap.yaml.template
@@ -122,6 +122,11 @@ data:
       - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/cordon-nodes'
         send_resolved: false
       {% endif %}
+
+      {% if (receiver["actions"]["fix-nvidia-gpu-low-perf"] is defined) and ('fix-nvidia-gpu-low-perf' in cluster_cfg["alert-manager"]["actions-available"]) %}
+      - url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/fix-nvidia-gpu-low-perf'
+        send_resolved: false
+      {% endif %}
 
     {% endfor %}
 

diff --git a/src/alert-manager/deploy/alert-manager-deployment.yaml.template b/src/alert-manager/deploy/alert-manager-deployment.yaml.template
@@ -67,6 +67,10 @@ spec:
           value: {{ cluster_cfg["cluster"]["common"]["cluster-id"] }}
         - name: REST_SERVER_URI
           value: {{ cluster_cfg['rest-server']['uri'] }}
+        - name: DOCKER_REGISTRY_PREFIX
+          value: {{ cluster_cfg['cluster']['docker-registry']['prefix'] }}
+        - name: DOCKER_REGISTRY_TAG
+          value: {{ cluster_cfg['cluster']['docker-registry']['tag'] }}
         - name: WEBPORTAL_URI
 {%- if "ssl" in cluster_cfg["pylon"] and cluster_cfg["pylon"]["ssl"] %}
           value: "{{ cluster_cfg['pylon']['uri-https']}}"

diff --git a/src/alert-manager/deploy/rbac.yaml b/src/alert-manager/deploy/rbac.yaml
@@ -15,6 +15,9 @@ rules:
   - apiGroups: [""]
     resources: ["nodes"]
     verbs: ["patch"]
+  - apiGroups: ["batch"]
+    resources: ["jobs"]
+    verbs: ["create", "list", "delete"]
 ---
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRoleBinding

diff --git a/src/alert-manager/src/alert-handler/controllers/mail.js b/src/alert-manager/src/alert-handler/controllers/mail.js
@@ -88,19 +88,6 @@ const sendEmailToAdmin = (req, res) => {
     });
 };
 
-const getUserNameByJobName = async (jobName, token) => {
-  return axios
-    .get(`${process.env.REST_SERVER_URI}/api/v2/jobs/${jobName}`, {
-      headers: {
-        Authorization: `Bearer ${token}`,
-        'Content-Type': 'application/json',
-      },
-    })
-    .then((response) => {
-      return response.data.jobStatus.username;
-    });
-};
-
 const getUserEmail = async (username, token) => {
   return axios
     .get(`${process.env.REST_SERVER_URI}/api/v2/users/${username}`, {
@@ -132,7 +119,7 @@ const sendEmailToUser = async (req, res) => {
   // group alerts by username
   const alertsGrouped = {};
   alerts.map((alert, index) => {
-    let userName = alert.labels.job_name.split('~')[0];
+    const userName = alert.labels.job_name.split('~')[0];
     if (userName in alertsGrouped) {
       alertsGrouped[userName].push(alerts[index]);
     } else {