From 0f8f3bb568788d9ca424a3f208581a8d1d033210 Mon Sep 17 00:00:00 2001 From: Uday Bhaskar Date: Sat, 7 Mar 2026 03:00:24 +0530 Subject: [PATCH 1/2] update NPD documentation page (#1179) * update node problem detector documentation * update NPD documentation page (cherry picked from commit fa9da1534ff254d59fd3baa0f82d68c9a898cfcc) --- .wordlist.txt | 3 + docs/npd/node-problem-detector.md | 237 +++++++++++++++++++++--------- 2 files changed, 167 insertions(+), 73 deletions(-) diff --git a/.wordlist.txt b/.wordlist.txt index cbb3ae1bc..2f587f58a 100644 --- a/.wordlist.txt +++ b/.wordlist.txt @@ -15,6 +15,7 @@ CEL CheckUnitStatus CleanupPreState CLI +ClusterRole CN CNI computePartition @@ -132,6 +133,7 @@ NodeRemediationLabels NodeRemediationTaints NoExecute NPD +NPD's NotReady numGPUsAssigned Observability @@ -1106,6 +1108,7 @@ skipRebootStep slurm spx staticAuthorization +stdout svc symlinks sys diff --git a/docs/npd/node-problem-detector.md b/docs/npd/node-problem-detector.md index dda335d73..01210057a 100644 --- a/docs/npd/node-problem-detector.md +++ b/docs/npd/node-problem-detector.md @@ -1,74 +1,166 @@ - # Node Problem Detector Integration -Node-problem-detector(NPD) aims to make various node problems visible to the upstream layers in the cluster management stack. It is a daemon that runs on each node, detects node problems and reports them to apiserver. NPD can be extended to detect AMD GPU problems. +**Node Problem Detector (NPD)** surfaces node problems to the rest of the cluster management stack. It runs as a daemon on each node, detects issues, and reports them to the API server. NPD can be extended to detect AMD GPU problems. -## Node-Problem-Detector Installation +## Installation -Many Kubernetes clusters like GKE, AKS, etc. come with NPD enabled by default. If not already present, easiest way to install is to use Helm chart. Follow the official [Node-problem-detector installation guide](https://github.com/kubernetes/node-problem-detector?tab=readme-ov-file#installation) for more information about installation. +Many Kubernetes clusters (for example, GKE and AKS) ship with NPD enabled by default. If it is not present, the simplest option is to install it via a Helm chart. For details, see the official [Node Problem Detector installation guide](https://github.com/kubernetes/node-problem-detector?tab=readme-ov-file#installation). - -```{note} -**For OpenShift users:** To install NPD on OpenShift clusters, follow these commands: +### Node Problem Detector Installation Steps - Kubernetes - 1. Create namespace and service account: +NPD is typically installed in the `kube-system` namespace. Use the same namespace for consistency with common deployments. - ```bash - oc create namespace node-problem-detector - oc create serviceaccount npd -n node-problem-detector - ``` +#### Create a service account - 2. Grant required access to the service account: +The service account must be able to access metrics exporter endpoints. See the example [RBAC config](https://github.com/ROCm/gpu-operator/blob/main/tests/e2e/yamls/config/npd/node-problem-detector-rbac.yaml). - ```bash - oc create clusterrolebinding npd-privileged-scc \ - --clusterrole=system:openshift:scc:privileged \ - --serviceaccount=node-problem-detector:npd +The ClusterRole must include the following permissions: - oc create clusterrole npd-pod-endpoint-access \ - --verb=get,list,watch --resource=pods,endpoints +```yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: node-problem-detector +rules: +- apiGroups: [""] + resources: ["nodes", "pods", "services"] + verbs: ["get", "list", "watch"] +- apiGroups: [""] + resources: ["events"] + verbs: ["create", "patch"] +- apiGroups: [""] + resources: ["nodes/status"] + verbs: ["patch"] +- nonResourceURLs: ["/metrics", "/gpumetrics", "/inbandraserrors"] + verbs: ["get"] +``` + +```bash +kubectl apply -f node-problem-detector-rbac.yaml +``` - oc create clusterrolebinding npd-pod-endpoint-access-binding \ - --clusterrole=npd-pod-endpoint-access \ - --serviceaccount=node-problem-detector:npd - ``` +#### Create a custom plugin monitor config - 3. Install NPD using Helm chart: +Add a config file that defines the custom plugin monitor rules used for AMD GPU checks. See the [Custom Plugin Monitor](#custom-plugin-monitor) section and the [example DaemonSet config](https://github.com/ROCm/gpu-operator/blob/main/tests/e2e/yamls/config/npd/node-problem-detector.yaml) for the structure and mount details. - ```bash - helm install npd oci://ghcr.io/deliveryhero/helm-charts/node-problem-detector \ - --version 2.4.0 \ - -n node-problem-detector \ - --set serviceAccount.name=npd \ - --set serviceAccount.create=false - ``` +#### Install NPD +#### Option A — Helm chart + +```bash +helm repo add deliveryhero https://charts.deliveryhero.io/ +helm install --generate-name deliveryhero/node-problem-detector ``` - -## Custom Plugin Monitor +#### Option B — Standalone YAML -Custom plugin monitor is a plugin mechanism for node-problem-detector. It will extend node-problem-detector to execute any monitor scripts written in any language. The monitor scripts must conform to the plugin protocol in exit code and standard output. For more info about the plugin protocol, please refer to the [node-problem-detector plugin interface](https://docs.google.com/document/d/1jK_5YloSYtboj-DtfjmYKxfNnUxCAvohLnsH5aGCAYQ/edit#). +Use the [node-problem-detector deployment manifest](https://github.com/kubernetes/node-problem-detector/blob/master/deployment/node-problem-detector.yaml): -Exit codes 0, 1, and 2 are used for plugin monitor. Exit code 0 is treated as working state. Exit code 1 is treated as problem state. Exit code 2 is used for any unknown error. When plugin monitor detects exit code 1, it sets NodeCondition based on the rules defined in custom plugin monitor config file +```bash +kubectl create -f node-problem-detector.yaml +``` -## Node-Problem-Detector Integration +### Node Problem Detector Installation Steps - OpenShift -We provide a small utility, `amdgpuhealth`, queries various AMD GPU metrics from `device-metrics-exporter` and `Prometheus` endpoint. Based on user-configured thresholds, it determines if any AMD GPU is in problem state. NPD custom plugin monitor can invoke this program at configurable intervals to monitor various metrics and assess overall health of AMD GPUs. +#### Create namespace and service account -The utility `amdgpuhealth` is packaged with device-metrics-exporter docker image and will be copied to host path `/var/lib/amd-metrics-exporter`. NPD needs to mount this host path to be able to use the utility via custom plugin monitor. +```bash +oc create namespace node-problem-detector +oc create serviceaccount npd -n node-problem-detector +``` -Example usage of amdgpuhealth CLI: +#### Grant required access to the service account ```bash -./amdgpuhealth query counter-metric -m -t +oc create clusterrolebinding npd-privileged-scc \ + --clusterrole=system:openshift:scc:privileged \ + --serviceaccount=node-problem-detector:npd + +oc create clusterrole npd-pod-endpoint-access \ + --verb=get,list,watch --resource=pods,endpoints + +oc create clusterrolebinding npd-pod-endpoint-access-binding \ + --clusterrole=npd-pod-endpoint-access \ + --serviceaccount=node-problem-detector:npd +``` + +#### Install NPD using Helm + +```bash +helm install npd oci://ghcr.io/deliveryhero/helm-charts/node-problem-detector \ + --version 2.4.0 \ + -n node-problem-detector \ + --set serviceAccount.name=npd \ + --set serviceAccount.create=false +``` + +## Custom Plugin Monitor +The **custom plugin monitor** is NPD's plugin mechanism. It lets you run monitor scripts in any language. Scripts must follow the [NPD plugin interface](https://docs.google.com/document/d/1jK_5YloSYtboj-DtfjmYKxfNnUxCAvohLnsH5aGCAYQ/edit#) for exit codes and standard output. + +| Exit code | Meaning | +| --------- | ------- | +| 0 | Healthy | +| 1 | Problem | +| 2 | Unknown error | + +When a plugin returns exit code 1, NPD updates the node condition according to the rules in your custom plugin monitor config. + +## AMD GPU integration + +The **`amdgpuhealth`** utility queries AMD GPU metrics from the device metrics exporter and (optionally) Prometheus. You configure thresholds; if a metric exceeds its threshold, the tool reports a problem. NPD's custom plugin monitor can run `amdgpuhealth` on a schedule to assess AMD GPU health. + +- **Location:** `amdgpuhealth` is included in the device-metrics-exporter image and is written to the host at `/var/lib/amd-metrics-exporter`. +- **NPD:** The NPD DaemonSet must mount that host path so the `amdgpuhealth` binary is available inside the NPD container. + +For a full example, see the [NPD DaemonSet config](https://github.com/ROCm/gpu-operator/blob/main/tests/e2e/yamls/config/npd/node-problem-detector.yaml). The snippet below shows only the relevant mounts: + +```yaml +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: node-problem-detector + namespace: kube-system +spec: + template: + spec: + containers: + - name: node-problem-detector + command: + - /node-problem-detector + - --logtostderr + - --config.custom-plugin-monitor=/config/custom-plugin-monitor.json + volumeMounts: + - name: config + mountPath: /config + readOnly: true + - name: amdexporter + mountPath: /var/lib/amd-metrics-exporter + volumes: + - name: config + configMap: + name: node-problem-detector-config + items: + - key: custom-plugin-monitor.json + path: custom-plugin-monitor.json + - name: amdexporter + hostPath: + path: /var/lib/amd-metrics-exporter +``` + +### Using the `amdgpuhealth` CLI + +```bash +./amdgpuhealth query counter-metric -m -t ./amdgpuhealth query gauge-metric -m -d -t ``` -In the above examples, the program queries either a counter or gauge metric. You can define a threshold for each metric. If the reported AMD GPU metric value exceeds the threshold, `amdgpuhealth` prints an error message to standard output and exits with code 1. The NPD plugin uses this exit code and output to update the node condition's status and message respectively, indicating problem with AMD GPU. +- **Counter metrics:** Use a threshold; if the metric value exceeds it, the tool exits with code 1. +- **Gauge metrics:** Use a threshold and optional duration; the tool can evaluate the average over that period. + +When `amdgpuhealth` exits with code 1, it prints an error to stdout. NPD uses that exit code and message to set the node condition status and message. -Example custom plugin monitor config: +### Example custom plugin monitor config ```json { @@ -122,31 +214,30 @@ Example custom plugin monitor config: } ``` -The above NPD config rule #1 queries counter metric `GPU_ECC_UNCORRECT_UMC` value every 30 seconds. If the value crosses threshold(set to 1), then NPD marks the node condition as True. - -If you want to query average value of gauge metrics over a period of time, you need to setup Prometheus to scrape metrics from `amd-device-metrics-exporter` endpoint and store them. Rule #2 in above config queries gauge metric `GPUMetricField_GPU_EDGE_TEMPERATURE` value from Promethues server. It queries average temperature for the last 1 hour from the Prometheus endpoint mentioned in CLI argument. +- **Rule 1:** Runs every 30 seconds and checks the counter metric `GPU_ECC_UNCORRECT_UMC`. If the value exceeds the threshold (1), NPD sets the node condition to indicate a problem. +- **Rule 2:** Checks the gauge metric `GPUMetricField_GPU_EDGE_TEMPERATURE` from Prometheus. To use gauge metrics over a time window, Prometheus must be scraping the `amd-device-metrics-exporter` endpoint. This rule evaluates the average temperature over the last hour using the Prometheus endpoint given in the CLI args. -## Handling Authorization Tokens +## Authorization tokens -If your AMD Device Metrics Exporter or Prometheus endpoints require token-based authorization, Node Problem Detector(NPD) must include the token as an HTTP header in its requests. Since authorization tokens are sensitive, they should be stored in secure way. We recommend using **Kubernetes Secrets** to store the token information and mount them as volumes in the NPD pod. +If the AMD Device Metrics Exporter or Prometheus endpoints use token-based authorization, NPD must send the token in an HTTP header. Store the token in a **Kubernetes Secret** and mount it into the NPD pod. -1. **Creating a Authorization token Secret for AMD Device Metrics Exporter endpoint:** +### Token for AMD Device Metrics Exporter -You can create a Kubernetes Secret to store the token for the AMD Device Metrics Exporter endpoint in two ways: +Create a Secret in the NPD namespace: -- From a file: +**From a file:** ```bash kubectl create secret generic -n amd-exporter-auth-token --from-file=token= ``` -- From a string literal: +**From a literal value:** ```bash -kubectl create secret genreic -n amd-exporter-auth-token --from-literal=token= +kubectl create secret generic -n amd-exporter-auth-token --from-literal=token= ``` -Mount this secret as a volume in your NPD deployment yaml. The same path must be specified as CLI argument in the NPD custom plugin monitor config yaml. +Mount this Secret as a volume in the NPD deployment. In the custom plugin monitor config, pass the mount path as the CLI argument for the exporter token. ```json "rules": [ @@ -167,23 +258,23 @@ Mount this secret as a volume in your NPD deployment yaml. The same path must be ] ``` -2. **Creating a Authorization token Secret for Prometheus endpoint:** +### Token for Prometheus -Similarly create secret for Prometheus endpoint. This will be needed for gauge metrics +Required when querying gauge metrics from Prometheus. Create a Secret the same way: -- From a file: +**From a file:** ```bash kubectl create secret generic -n prometheus-auth-token --from-file=token= ``` -- From a string literal: +**From a literal value:** ```bash -kubectl create secret genreic -n prometheus-auth-token --from-literal=token= +kubectl create secret generic -n prometheus-auth-token --from-literal=token= ``` -Mount this secret as a volume in your NPD deployment yaml. Pass the mount path in NPD custom plgin monitor json as CLI argument. +Mount the Secret in the NPD pod and pass the mount path in the custom plugin monitor config as the Prometheus token argument. ```json "rules": [ @@ -204,21 +295,21 @@ Mount this secret as a volume in your NPD deployment yaml. Pass the mount path i ] ``` -## Handling Mutual TLS (mTLS) Authentication +## Mutual TLS (mTLS) authentication -If your AMD Device Metrics Exporter or Prometheus endpoints require TLS/mTLS, Node Problem Detector(NPD) must have necessary certificates to be able to communicate with the endpoints. +If the AMD Device Metrics Exporter or Prometheus endpoints use TLS or mTLS, NPD must have the right certificates to connect. -For TLS, NPD needs to have server endpoint's Root CA certificate to authenticate the server's certificate. Root CA certificate must be stored as Kubernetes Secrets and mounted as volumes in the NPD pod. +**TLS (server authorization):** NPD needs the server’s Root CA certificate to verify the server. Store it in a Kubernetes Secret and mount it into the NPD pod. -1. **Creating Secret for AMD Device Metrics Exporter endpoint Root CA** +### Root CA for AMD Device Metrics Exporter -Please make sure the key in the secret is set to `ca.crt` +The Secret key must be `ca.crt`: ```bash kubectl create secret generic -n amd-exporter-rootca --from-file=ca.crt= ``` -Mount this secret as a volume in your NPD deployment yaml. Same mount path needs to be passed as CLI argument. Example: +Mount this Secret in the NPD deployment and pass the mount path as the CLI argument. Example: ```json "rules": [ @@ -239,13 +330,13 @@ Mount this secret as a volume in your NPD deployment yaml. Same mount path needs ] ``` -2. **Creating Secret for Prometheus endpoint Root CA** +### Root CA for Prometheus ```bash -kubectl create secret generic -n prometheus-rootca --from-file=ca.crt= +kubectl create secret generic -n prometheus-rootca --from-file=ca.crt= ``` -Mount this secret as a volume in your NPD deployment yaml. Pass the mount path in CLI argument followed by the key `ca.crt`. Example below: +Mount the Secret in the NPD pod and pass the path (including `ca.crt`) in the CLI argument. Example: ```json "rules": [ @@ -266,17 +357,17 @@ Mount this secret as a volume in your NPD deployment yaml. Pass the mount path i ] ``` -For mTLS, NPD needs to have a certificate and it's corresponding private key. Certificate information can be stored as Kubernetes TLS Secret and mounted as colume in the NPD pod. +**mTLS (client authorization):** NPD also needs a client certificate and its private key. Store them in a Kubernetes TLS Secret and mount it into the NPD pod. -3. **Creating Secret for NPD identity certificate** +### NPD client identity (mTLS) -Please make sure you use the keys `tls.crt` and `tls.key` for certificate and key respectively +Use the keys `tls.crt` and `tls.key` for the certificate and private key: ```bash -kubectl create secret tls -n npd-identity --tls.crt= --tls.key= +kubectl create secret tls -n npd-identity --cert= --key= ``` -Mount the secret as a volume in your NPD deployment yaml. Pass the mount path as CLI argument. Example below: +Mount the Secret in the NPD deployment and pass the mount path in the CLI arguments. Example: ```json "rules": [ From 6784d9a8d33fc56d2e38e47094d778f02378db73 Mon Sep 17 00:00:00 2001 From: Uday Bhaskar Date: Tue, 10 Mar 2026 21:23:49 +0530 Subject: [PATCH 2/2] auto node remediation fixes (#1168) * anr fixes * enable sim e2es * modify maxparallel workflows data type * handle delete remediation reconcile when enable is false * address review comments (cherry picked from commit 84d07524b2e40f7f790e461627c1f9e06b9a2a19) --- api/v1alpha1/deviceconfig_types.go | 4 +- ...md-gpu-operator.clusterserviceversion.yaml | 2 +- bundle/manifests/amd.com_deviceconfigs.yaml | 3 + config/crd/bases/amd.com_deviceconfigs.yaml | 3 + .../airgapped-install-openshift.md | 1 + .../specialized_networks/airgapped-install.md | 3 + helm-charts-k8s/Chart.lock | 2 +- helm-charts-k8s/crds/deviceconfig-crd.yaml | 3 + .../remediation/scripts/applylabels.sh | 4 +- .../controllers/remediation/scripts/drain.sh | 5 +- .../controllers/remediation/scripts/notify.sh | 6 +- .../remediation/scripts/removelabels.sh | 4 +- .../controllers/remediation/scripts/taint.sh | 4 +- .../controllers/remediation/scripts/test.sh | 26 ++-- .../remediation/scripts/untaint.sh | 4 +- .../controllers/remediation/scripts/wait.sh | 12 +- internal/controllers/remediation_handler.go | 9 +- internal/validator/specValidators.go | 46 ++++++- tests/e2e/client/client.go | 51 ++++++++ tests/e2e/remediation_test.go | 118 +++++++++++++++++- 20 files changed, 267 insertions(+), 43 deletions(-) diff --git a/api/v1alpha1/deviceconfig_types.go b/api/v1alpha1/deviceconfig_types.go index d512665c1..6ab2b32eb 100644 --- a/api/v1alpha1/deviceconfig_types.go +++ b/api/v1alpha1/deviceconfig_types.go @@ -113,7 +113,9 @@ type RemediationWorkflowSpec struct { // MaxParallelWorkflows specifies limit on how many remediation workflows can be executed in parallel. 0 is the default value and it means no limit. //+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="MaxParallelWorkflows",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:maxParallelWorkflows"} // +optional - MaxParallelWorkflows int `json:"maxParallelWorkflows"` + // +kubebuilder:default:=0 + // +kubebuilder:validation:Minimum:=0 + MaxParallelWorkflows int32 `json:"maxParallelWorkflows"` // Node Remediation taints are custom taints that we can apply on the node to specify that the node is undergoing remediation or needs attention by the administrator. // If user does not specify any taints, the operator will apply a taint with key "amd-gpu-unhealthy" and effect "NoSchedule" to the node under remediation. diff --git a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml index 142a5319e..d7e5acfbf 100644 --- a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml +++ b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml @@ -36,7 +36,7 @@ metadata: capabilities: Seamless Upgrades categories: AI/Machine Learning,Monitoring containerImage: docker.io/rocm/gpu-operator:v1.4.0 - createdAt: "2026-03-06T04:57:05Z" + createdAt: "2026-03-10T05:41:17Z" description: |- Operator responsible for deploying AMD GPU kernel drivers, device plugin, device test runner and device metrics exporter For more information, visit [documentation](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/) diff --git a/bundle/manifests/amd.com_deviceconfigs.yaml b/bundle/manifests/amd.com_deviceconfigs.yaml index 83b429c02..a2e37c3e3 100644 --- a/bundle/manifests/amd.com_deviceconfigs.yaml +++ b/bundle/manifests/amd.com_deviceconfigs.yaml @@ -1529,9 +1529,12 @@ spec: enable if operator should automatically handle remediation of node incase of gpu issues type: boolean maxParallelWorkflows: + default: 0 description: MaxParallelWorkflows specifies limit on how many remediation workflows can be executed in parallel. 0 is the default value and it means no limit. + format: int32 + minimum: 0 type: integer nodeDrainPolicy: description: Node drain policy during remediation workflow execution diff --git a/config/crd/bases/amd.com_deviceconfigs.yaml b/config/crd/bases/amd.com_deviceconfigs.yaml index b7f6dd71c..b29e2a26c 100644 --- a/config/crd/bases/amd.com_deviceconfigs.yaml +++ b/config/crd/bases/amd.com_deviceconfigs.yaml @@ -1525,9 +1525,12 @@ spec: enable if operator should automatically handle remediation of node incase of gpu issues type: boolean maxParallelWorkflows: + default: 0 description: MaxParallelWorkflows specifies limit on how many remediation workflows can be executed in parallel. 0 is the default value and it means no limit. + format: int32 + minimum: 0 type: integer nodeDrainPolicy: description: Node drain policy during remediation workflow execution diff --git a/docs/specialized_networks/airgapped-install-openshift.md b/docs/specialized_networks/airgapped-install-openshift.md index 301a98997..3d0c32b73 100644 --- a/docs/specialized_networks/airgapped-install-openshift.md +++ b/docs/specialized_networks/airgapped-install-openshift.md @@ -78,6 +78,7 @@ mirror: - name: docker.io/rocm/rocm-terminal:latest - name: docker.io/rocm/k8s-device-plugin:latest - name: docker.io/rocm/k8s-node-labeller:latest + - name: quay.io/argoproj/workflow-controller:v3.6.5 # adjust RHEL version and ROCm version if needed for source image # image tag format for CoreOS is coreos-- - name: docker.io/rocm/amdgpu-driver:coreos-9.6-7.0.2 diff --git a/docs/specialized_networks/airgapped-install.md b/docs/specialized_networks/airgapped-install.md index f74636473..b745e12c4 100644 --- a/docs/specialized_networks/airgapped-install.md +++ b/docs/specialized_networks/airgapped-install.md @@ -45,6 +45,9 @@ quay.io/jetstack/cert-manager-controller:v1.15.1 quay.io/jetstack/cert-manager-webhook:v1.15.1 quay.io/jetstack/cert-manager-cainjector:v1.15.1 quay.io/jetstack/cert-manager-acmesolver:v1.15.1 + +# Argo workflow controller image (for Auto Node Remediation) +quay.io/argoproj/workflow-controller:v3.6.5 ``` ### Required RPM/DEB Packages diff --git a/helm-charts-k8s/Chart.lock b/helm-charts-k8s/Chart.lock index b61db3516..e5f145931 100644 --- a/helm-charts-k8s/Chart.lock +++ b/helm-charts-k8s/Chart.lock @@ -9,4 +9,4 @@ dependencies: repository: file://./charts/remediation-crds version: v1.0.0 digest: sha256:4c6b1f3224839e54d1523759be597d20ca2fc6508eb17fda2992a95a00e1fd70 -generated: "2026-03-05T00:12:04.97865576Z" +generated: "2026-03-10T05:41:14.760044744Z" diff --git a/helm-charts-k8s/crds/deviceconfig-crd.yaml b/helm-charts-k8s/crds/deviceconfig-crd.yaml index 486fbfcfc..6db394587 100644 --- a/helm-charts-k8s/crds/deviceconfig-crd.yaml +++ b/helm-charts-k8s/crds/deviceconfig-crd.yaml @@ -1531,9 +1531,12 @@ spec: enable if operator should automatically handle remediation of node incase of gpu issues type: boolean maxParallelWorkflows: + default: 0 description: MaxParallelWorkflows specifies limit on how many remediation workflows can be executed in parallel. 0 is the default value and it means no limit. + format: int32 + minimum: 0 type: integer nodeDrainPolicy: description: Node drain policy during remediation workflow execution diff --git a/internal/controllers/remediation/scripts/applylabels.sh b/internal/controllers/remediation/scripts/applylabels.sh index ffe072d75..2a265f3be 100644 --- a/internal/controllers/remediation/scripts/applylabels.sh +++ b/internal/controllers/remediation/scripts/applylabels.sh @@ -1,6 +1,6 @@ set -e -NODE_NAME="{{inputs.parameters.node_name}}" -NODE_LABELS="{{inputs.parameters.node_labels}}" +NODE_NAME='{{inputs.parameters.node_name}}' +NODE_LABELS='{{inputs.parameters.node_labels}}' # Check if jq is installed if ! command -v jq &> /dev/null; then diff --git a/internal/controllers/remediation/scripts/drain.sh b/internal/controllers/remediation/scripts/drain.sh index 9267b92a1..414134777 100644 --- a/internal/controllers/remediation/scripts/drain.sh +++ b/internal/controllers/remediation/scripts/drain.sh @@ -1,7 +1,6 @@ set -e -echo "Fetching node name..." -NODE_NAME="{{inputs.parameters.node_name}}" -DRAIN_POLICY="{{inputs.parameters.drain_policy}}" +NODE_NAME='{{inputs.parameters.node_name}}' +DRAIN_POLICY='{{inputs.parameters.drain_policy}}' # Check if jq is installed if ! command -v jq &> /dev/null; then diff --git a/internal/controllers/remediation/scripts/notify.sh b/internal/controllers/remediation/scripts/notify.sh index 39f013e74..af6ff85d0 100644 --- a/internal/controllers/remediation/scripts/notify.sh +++ b/internal/controllers/remediation/scripts/notify.sh @@ -1,7 +1,7 @@ set -e -NODE_NAME="{{inputs.parameters.nodeName}}" -NOTIFY_MESSAGE="{{inputs.parameters.notifyMessage}}" -EVENT_NAME="{{inputs.parameters.eventName}}" +NODE_NAME='{{inputs.parameters.nodeName}}' +NOTIFY_MESSAGE='{{inputs.parameters.notifyMessage}}' +EVENT_NAME='{{inputs.parameters.eventName}}' kubectl create -f - < /dev/null; then diff --git a/internal/controllers/remediation/scripts/taint.sh b/internal/controllers/remediation/scripts/taint.sh index 999bac212..130532ec8 100644 --- a/internal/controllers/remediation/scripts/taint.sh +++ b/internal/controllers/remediation/scripts/taint.sh @@ -1,6 +1,6 @@ set -e -NODE_NAME="{{inputs.parameters.node_name}}" -NODE_TAINTS="{{inputs.parameters.node_taints}}" +NODE_NAME='{{inputs.parameters.node_name}}' +NODE_TAINTS='{{inputs.parameters.node_taints}}' # Check if jq is installed if ! command -v jq &> /dev/null; then diff --git a/internal/controllers/remediation/scripts/test.sh b/internal/controllers/remediation/scripts/test.sh index 5a4c947e6..4981f9b48 100644 --- a/internal/controllers/remediation/scripts/test.sh +++ b/internal/controllers/remediation/scripts/test.sh @@ -1,18 +1,18 @@ set -e -NODE_NAME="{{inputs.parameters.node_name}}" +NODE_NAME='{{inputs.parameters.node_name}}' JOB_NAME="{{workflow.name}}-test-run" -CM_NAME="{{workflow.name}}-test-configmap" -FRAMEWORK="{{inputs.parameters.framework}}" -RECIPE="{{inputs.parameters.recipe}}" -ITERATIONS="{{inputs.parameters.iterations}}" -STOPONFAILURE="{{inputs.parameters.stopOnFailure}}" -TIMEOUTSECONDS="{{inputs.parameters.timeoutSeconds}}" -TESTRUNNERIMAGE="{{inputs.parameters.testRunnerImage}}" -TESTRUNNERSA="{{inputs.parameters.testRunnerServiceAccount}}" -NAMESPACE="{{inputs.parameters.namespace}}" -INITCONTAINERIMAGE="{{inputs.parameters.initContainerImage}}" -WFNAME="{{workflow.name}}" -WFUID="{{workflow.uid}}" +CM_NAME='{{workflow.name}}-test-configmap' +FRAMEWORK='{{inputs.parameters.framework}}' +RECIPE='{{inputs.parameters.recipe}}' +ITERATIONS='{{inputs.parameters.iterations}}' +STOPONFAILURE='{{inputs.parameters.stopOnFailure}}' +TIMEOUTSECONDS='{{inputs.parameters.timeoutSeconds}}' +TESTRUNNERIMAGE='{{inputs.parameters.testRunnerImage}}' +TESTRUNNERSA='{{inputs.parameters.testRunnerServiceAccount}}' +NAMESPACE='{{inputs.parameters.namespace}}' +INITCONTAINERIMAGE='{{inputs.parameters.initContainerImage}}' +WFNAME='{{workflow.name}}' +WFUID='{{workflow.uid}}' if [ -z "$FRAMEWORK" ] || [ -z "$RECIPE" ] || [ -z "$ITERATIONS" ] || [ -z "$STOPONFAILURE" ] || [ -z "$TIMEOUTSECONDS" ]; then echo "Validation profile incomplete, skipping configmap and job creation. Please enter framework, recipe, iterations, stopOnFailure, timeoutSeconds as per testrunner requirements" diff --git a/internal/controllers/remediation/scripts/untaint.sh b/internal/controllers/remediation/scripts/untaint.sh index 4c3240f2a..9fccb067b 100644 --- a/internal/controllers/remediation/scripts/untaint.sh +++ b/internal/controllers/remediation/scripts/untaint.sh @@ -1,6 +1,6 @@ set -e -NODE_NAME="{{inputs.parameters.node_name}}" -NODE_TAINTS="{{inputs.parameters.node_taints}}" +NODE_NAME='{{inputs.parameters.node_name}}' +NODE_TAINTS='{{inputs.parameters.node_taints}}' # Check if jq is installed if ! command -v jq &> /dev/null; then diff --git a/internal/controllers/remediation/scripts/wait.sh b/internal/controllers/remediation/scripts/wait.sh index 3b4522c75..e06103e9b 100644 --- a/internal/controllers/remediation/scripts/wait.sh +++ b/internal/controllers/remediation/scripts/wait.sh @@ -1,11 +1,13 @@ set -e -NODE_NAME="{{inputs.parameters.node_name}}" -echo "Waiting for {{inputs.parameters.node_condition}} condition to be False on node $NODE_NAME for 2 consecutive minutes (timeout: 15 minutes)" +NODE_NAME='{{inputs.parameters.node_name}}' +NODE_CONDITION='{{inputs.parameters.node_condition}}' + +echo "Waiting for $NODE_CONDITION condition to be False on node $NODE_NAME for 2 consecutive minutes (timeout: 15 minutes)" STABLE_COUNT=0 TOTAL_WAIT=0 while [ "$TOTAL_WAIT" -lt 15 ]; do - STATUS=$(kubectl get node "$NODE_NAME" -o jsonpath="{.status.conditions[?(@.type=='{{inputs.parameters.node_condition}}')].status}") - echo "[$(date)] {{inputs.parameters.node_condition}} status: $STATUS" + STATUS=$(kubectl get node "$NODE_NAME" -o jsonpath="{.status.conditions[?(@.type==\"$NODE_CONDITION\")].status}") + echo "[$(date)] $NODE_CONDITION status: $STATUS" if [ "$STATUS" = "False" ]; then STABLE_COUNT=$((STABLE_COUNT + 1)) echo "Condition is stable (False) for $STABLE_COUNT minute(s)" @@ -20,5 +22,5 @@ while [ "$TOTAL_WAIT" -lt 15 ]; do sleep 60 TOTAL_WAIT=$((TOTAL_WAIT + 1)) done -echo "{{inputs.parameters.node_condition}} did not remain False for 2 consecutive minutes within 15 minutes. Exiting with failure." +echo "$NODE_CONDITION did not remain False for 2 consecutive minutes within 15 minutes. Exiting with failure." exit 1 \ No newline at end of file diff --git a/internal/controllers/remediation_handler.go b/internal/controllers/remediation_handler.go index 25a619f0c..6013ad5e4 100644 --- a/internal/controllers/remediation_handler.go +++ b/internal/controllers/remediation_handler.go @@ -247,6 +247,11 @@ func (n *remediationMgr) HandleRemediation(ctx context.Context, devConfig *amdv1 // HandleDelete handles the delete operations during remediation process func (n *remediationMgr) HandleDelete(ctx context.Context, deviceConfig *amdv1alpha1.DeviceConfig, nodeList *v1.NodeList) (res ctrl.Result, err error) { + res = ctrl.Result{Requeue: true, RequeueAfter: time.Second * 20} + remediationDisabled, err := n.helper.isRemediationDisabled(ctx, deviceConfig) + if err != nil || remediationDisabled { + return res, err + } wfList, err := n.helper.getWorkflowList(ctx, deviceConfig.Namespace) if err != nil { @@ -337,7 +342,7 @@ type remediationMgrHelper struct { k8sInterface kubernetes.Interface recoveryTracker *sync.Map serviceAccountName string - maxParallelWorkflows int + maxParallelWorkflows int32 tolerationsCache *sync.Map } @@ -968,7 +973,7 @@ func (h *remediationMgrHelper) updateMaxParallelWorkflows(ctx context.Context, d } // Update parallelism in Argo workflow controller configmap. // https://github.com/argoproj/argo-workflows/blob/main/config/config.go#L69 - acm.Data["namespaceParallelism"] = strconv.Itoa(devConfig.Spec.RemediationWorkflow.MaxParallelWorkflows) + acm.Data["namespaceParallelism"] = strconv.Itoa(int(devConfig.Spec.RemediationWorkflow.MaxParallelWorkflows)) return h.client.Update(ctx, acm) }) if err != nil { diff --git a/internal/validator/specValidators.go b/internal/validator/specValidators.go index aed597be2..bf62dbb5e 100644 --- a/internal/validator/specValidators.go +++ b/internal/validator/specValidators.go @@ -19,10 +19,12 @@ package validator import ( "context" "fmt" - "time" + "strings" amdv1alpha1 "github.com/ROCm/gpu-operator/api/v1alpha1" utils "github.com/ROCm/gpu-operator/internal" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/util/validation" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -176,6 +178,32 @@ func ValidateDevicePluginSpec(ctx context.Context, client client.Client, devConf return nil } +func validateTaintEffect(effect v1.TaintEffect) error { + if effect != v1.TaintEffectNoSchedule && effect != v1.TaintEffectPreferNoSchedule && effect != v1.TaintEffectNoExecute { + return fmt.Errorf("unsupported taint effect %v", effect) + } + + return nil +} + +func checkTaintValidation(taint v1.Taint) error { + if errs := validation.IsQualifiedName(taint.Key); len(errs) > 0 { + return fmt.Errorf("invalid taint key: %s", strings.Join(errs, "; ")) + } + if taint.Value != "" { + if errs := validation.IsValidLabelValue(taint.Value); len(errs) > 0 { + return fmt.Errorf("invalid taint value: %s", strings.Join(errs, "; ")) + } + } + if taint.Effect != "" { + if err := validateTaintEffect(taint.Effect); err != nil { + return err + } + } + + return nil +} + func ValidateRemediationWorkflowSpec(ctx context.Context, client client.Client, devConfig *amdv1alpha1.DeviceConfig) error { rSpec := devConfig.Spec.RemediationWorkflow @@ -189,9 +217,19 @@ func ValidateRemediationWorkflowSpec(ctx context.Context, client client.Client, } } - if rSpec.TtlForFailedWorkflows != "" { - if _, err := time.ParseDuration(rSpec.TtlForFailedWorkflows); err != nil { - return fmt.Errorf("parsing ttlForFailedWorkflows: %v", err) + for key, value := range rSpec.NodeRemediationLabels { + if len(validation.IsQualifiedName(key)) > 0 { + return fmt.Errorf("invalid label key: %s", key) + } + if len(validation.IsValidLabelValue(value)) > 0 { + return fmt.Errorf("invalid label value: %s", value) + } + } + + for _, taint := range rSpec.NodeRemediationTaints { + err := checkTaintValidation(taint) + if err != nil { + return err } } diff --git a/tests/e2e/client/client.go b/tests/e2e/client/client.go index b5a95f7a4..4792dcad3 100644 --- a/tests/e2e/client/client.go +++ b/tests/e2e/client/client.go @@ -77,6 +77,8 @@ type DeviceConfigsInterface interface { PatchMetricsExporterImage(config *v1alpha1.DeviceConfig) (*v1alpha1.DeviceConfig, error) PatchDRADriverEnablement(config *v1alpha1.DeviceConfig) (*v1alpha1.DeviceConfig, error) PatchDevicePluginEnablement(config *v1alpha1.DeviceConfig) (*v1alpha1.DeviceConfig, error) + PatchAutoRemediationEnablement(config *v1alpha1.DeviceConfig) (*v1alpha1.DeviceConfig, error) + PatchRemediationWorkflowSpec(config *v1alpha1.DeviceConfig, patch map[string]interface{}) (*v1alpha1.DeviceConfig, error) Get(name string, options metav1.GetOptions) (*v1alpha1.DeviceConfig, error) Delete(name string) (*v1alpha1.DeviceConfig, error) } @@ -425,6 +427,55 @@ func (c *deviceConfigsClient) PatchDevicePluginEnablement(devCfg *v1alpha1.Devic return &result, err } +func (c *deviceConfigsClient) PatchAutoRemediationEnablement(devCfg *v1alpha1.DeviceConfig) (*v1alpha1.DeviceConfig, error) { + result := v1alpha1.DeviceConfig{} + devCfg.TypeMeta = metav1.TypeMeta{ + Kind: "DeviceConfig", + APIVersion: "amd.com/v1alpha1", + } + + patch := map[string]interface{}{ + "spec": map[string]interface{}{ + "remediationWorkflow": map[string]bool{ + "enable": *devCfg.Spec.RemediationWorkflow.Enable, + }, + }, + } + patchBytes, _ := json.Marshal(patch) + + err := c.restClient. + Patch(types.MergePatchType). + Namespace(devCfg.Namespace). + Resource("deviceConfigs"). + Name(devCfg.Name). + Body(patchBytes). + Do(context.TODO()). + Into(&result) + + return &result, err +} + +func (c *deviceConfigsClient) PatchRemediationWorkflowSpec(devCfg *v1alpha1.DeviceConfig, patch map[string]interface{}) (*v1alpha1.DeviceConfig, error) { + result := v1alpha1.DeviceConfig{} + devCfg.TypeMeta = metav1.TypeMeta{ + Kind: "DeviceConfig", + APIVersion: "amd.com/v1alpha1", + } + + patchBytes, _ := json.Marshal(patch) + + err := c.restClient. + Patch(types.MergePatchType). + Namespace(devCfg.Namespace). + Resource("deviceConfigs"). + Name(devCfg.Name). + Body(patchBytes). + Do(context.TODO()). + Into(&result) + + return &result, err +} + func (c *deviceConfigsClient) Delete(name string) (*v1alpha1.DeviceConfig, error) { result := v1alpha1.DeviceConfig{} err := c.restClient. diff --git a/tests/e2e/remediation_test.go b/tests/e2e/remediation_test.go index 21137d7fe..d00312f04 100644 --- a/tests/e2e/remediation_test.go +++ b/tests/e2e/remediation_test.go @@ -285,11 +285,12 @@ func (s *E2ESuite) TestAutoNodeRemediationWithPhysicalAction(c *C) { } func (s *E2ESuite) TestAutoNodeRemediationAbortWorkflow(c *C) { - logger.Infof("Starting Auto Node Remediation abort workflow test") if s.simEnable { skipTest(c, "Skipping for non amd gpu testbed") } + logger.Infof("Starting Auto Node Remediation abort workflow test") + nodes, err := s.clientSet.CoreV1().Nodes().List(context.Background(), metav1.ListOptions{ LabelSelector: "feature.node.kubernetes.io/amd-gpu=true", }) @@ -311,6 +312,7 @@ func (s *E2ESuite) TestAutoNodeRemediationAbortWorkflow(c *C) { // Wait for cluster to be up logger.Infof("Waiting for device config to be applied") time.Sleep(5 * time.Second) + s.clearRemediationWorkflowStatusMetaData(devCfg.Namespace, c) // Setup NPD logger.Infof("Setting up Node Problem Detector (NPD)") @@ -326,6 +328,10 @@ func (s *E2ESuite) TestAutoNodeRemediationAbortWorkflow(c *C) { // Trigger error condition by modifying NPD config logger.Infof("Edit Node Problem Detector (NPD) thresholds to simulate error condition") s.updateConfigForNPD(c, npdInbandRASConfigPath, npdInband2RASErrorConfigPath) + defer func() { + s.untaintNode(nodeName) + s.clearRemediationWorkflowStatusMetaData(devCfg.Namespace, c) + }() s.verifyNodeCondition(c, conditionInternalError, corev1.ConditionTrue) @@ -351,7 +357,6 @@ func (s *E2ESuite) TestAutoNodeRemediationAbortWorkflow(c *C) { assert.Eventually(c, func() bool { return s.checkWorkflowExistence(c, nodeName, false) }, 1*time.Minute, 10*time.Second, "Remediation workflow was not aborted and deleted") - s.untaintNode(nodeName) } func (s *E2ESuite) TestAutoNodeRemediationRecoveryPolicy(c *C) { @@ -444,3 +449,112 @@ func (s *E2ESuite) TestAutoNodeRemediationRecoveryPolicy(c *C) { }, 1*time.Minute, 10*time.Second, "Remediation workflow was not aborted and deleted") s.untaintNode(nodeName) } + +func (s *E2ESuite) verifyDeviceConfigErrorStatus(devCfg *v1alpha1.DeviceConfig, c *C, errStr string) { + assert.Eventually(c, func() bool { + devCfg, err := s.dClient.DeviceConfigs(s.ns).Get(devCfg.Name, metav1.GetOptions{}) + if err != nil { + logger.Errorf("failed to get deviceConfig %v", err) + return false + } + readyCondition := false + errorCondition := false + for _, condition := range devCfg.Status.Conditions { + if condition.Type == "Ready" && condition.Status == "False" { + readyCondition = true + } + if condition.Type == "Error" && condition.Status == "True" && strings.Contains(condition.Message, errStr) { + errorCondition = true + } + } + return readyCondition && errorCondition + }, 2*time.Minute, 5*time.Second) +} + +func (s *E2ESuite) verifyDeviceConfigReadyStatus(devCfg *v1alpha1.DeviceConfig, c *C) { + assert.Eventually(c, func() bool { + devCfg, err := s.dClient.DeviceConfigs(s.ns).Get(devCfg.Name, metav1.GetOptions{}) + if err != nil { + logger.Errorf("failed to get deviceConfig %v", err) + return false + } + readyCondition := false + for _, condition := range devCfg.Status.Conditions { + if condition.Type == "Ready" && condition.Status == "True" { + readyCondition = true + } + } + return readyCondition + }, 2*time.Minute, 5*time.Second) +} + +func (s *E2ESuite) TestRemediationSpecValidation(c *C) { + logger.Infof("Starting RemediationSpec validation test") + _, err := s.dClient.DeviceConfigs(s.ns).Get(s.cfgName, metav1.GetOptions{}) + assert.Errorf(c, err, fmt.Sprintf("expected no config to be present. but config %v exists", s.cfgName)) + + devCfg := s.getDeviceConfig(c) + // set remediation workflow spec to invalid values + trueValue := true + + devCfg.Spec.RemediationWorkflow.Enable = &trueValue + devCfg.Spec.RemediationWorkflow.Config = &corev1.LocalObjectReference{ + Name: "invalid-config", + } + + logger.Infof("creating DeviceConfig with remediation workflow spec with invalid config map") + s.createDeviceConfig(devCfg, c) + s.verifyDeviceConfigErrorStatus(devCfg, c, "validating remediation workflow config map") + + logger.Infof("patching DeviceConfig with remediation workflow spec with invalid nodeRemediationTaints") + patch := map[string]interface{}{ + "spec": map[string]interface{}{ + "remediationWorkflow": map[string]interface{}{ + "enable": true, + "config": nil, + "ttlForFailedWorkflows": "24h", + "nodeRemediationTaints": []interface{}{ + map[string]interface{}{ + "key": "amd-gpu-unhealthy", + "value": "test", + "effect": "Invalid taint effect", + }, + }, + }, + }, + } + _, err = s.dClient.DeviceConfigs(s.ns).PatchRemediationWorkflowSpec(devCfg, patch) + assert.NoError(c, err, "failed to patch DeviceConfig with remediation workflow spec with invalid nodeRemediationTaints") + s.verifyDeviceConfigErrorStatus(devCfg, c, "unsupported taint effect") + + logger.Infof("patching DeviceConfig with remediation workflow spec with invalid nodeRemediationLabels") + patch = map[string]interface{}{ + "spec": map[string]interface{}{ + "remediationWorkflow": map[string]interface{}{ + "enable": true, + "nodeRemediationTaints": nil, + "nodeRemediationLabels": map[string]string{ + "invalid key": "value", + }, + }, + }, + } + _, err = s.dClient.DeviceConfigs(s.ns).PatchRemediationWorkflowSpec(devCfg, patch) + assert.NoError(c, err, "failed to patch DeviceConfig with remediation workflow spec with invalid nodeRemediationLabels") + s.verifyDeviceConfigErrorStatus(devCfg, c, "invalid label key") + + logger.Infof("patching DeviceConfig with remediation workflow spec with accepted values") + patch = map[string]interface{}{ + "spec": map[string]interface{}{ + "remediationWorkflow": map[string]interface{}{ + "enable": true, + "nodeRemediationLabels": nil, + }, + }, + } + _, err = s.dClient.DeviceConfigs(s.ns).PatchRemediationWorkflowSpec(devCfg, patch) + assert.NoError(c, err, "failed to patch DeviceConfig with remediation workflow spec with accepted values") + s.verifyDeviceConfigReadyStatus(devCfg, c) + + s.deleteDeviceConfig(devCfg, c) +}