Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .wordlist.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ CEL
CheckUnitStatus
CleanupPreState
CLI
ClusterRole
CN
CNI
computePartition
Expand Down Expand Up @@ -132,6 +133,7 @@ NodeRemediationLabels
NodeRemediationTaints
NoExecute
NPD
NPD's
NotReady
numGPUsAssigned
Observability
Expand Down Expand Up @@ -1106,6 +1108,7 @@ skipRebootStep
slurm
spx
staticAuthorization
stdout
svc
symlinks
sys
Expand Down
4 changes: 3 additions & 1 deletion api/v1alpha1/deviceconfig_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,9 @@ type RemediationWorkflowSpec struct {
// MaxParallelWorkflows specifies limit on how many remediation workflows can be executed in parallel. 0 is the default value and it means no limit.
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="MaxParallelWorkflows",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:maxParallelWorkflows"}
// +optional
MaxParallelWorkflows int `json:"maxParallelWorkflows"`
// +kubebuilder:default:=0
// +kubebuilder:validation:Minimum:=0
MaxParallelWorkflows int32 `json:"maxParallelWorkflows"`

// Node Remediation taints are custom taints that we can apply on the node to specify that the node is undergoing remediation or needs attention by the administrator.
// If user does not specify any taints, the operator will apply a taint with key "amd-gpu-unhealthy" and effect "NoSchedule" to the node under remediation.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ metadata:
capabilities: Seamless Upgrades
categories: AI/Machine Learning,Monitoring
containerImage: docker.io/rocm/gpu-operator:v1.4.0
createdAt: "2026-03-06T04:57:05Z"
createdAt: "2026-03-10T05:41:17Z"
description: |-
Operator responsible for deploying AMD GPU kernel drivers, device plugin, device test runner and device metrics exporter
For more information, visit [documentation](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/)
Expand Down
3 changes: 3 additions & 0 deletions bundle/manifests/amd.com_deviceconfigs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1529,9 +1529,12 @@ spec:
enable if operator should automatically handle remediation of node incase of gpu issues
type: boolean
maxParallelWorkflows:
default: 0
description: MaxParallelWorkflows specifies limit on how many
remediation workflows can be executed in parallel. 0 is the
default value and it means no limit.
format: int32
minimum: 0
type: integer
nodeDrainPolicy:
description: Node drain policy during remediation workflow execution
Expand Down
3 changes: 3 additions & 0 deletions config/crd/bases/amd.com_deviceconfigs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1525,9 +1525,12 @@ spec:
enable if operator should automatically handle remediation of node incase of gpu issues
type: boolean
maxParallelWorkflows:
default: 0
description: MaxParallelWorkflows specifies limit on how many
remediation workflows can be executed in parallel. 0 is the
default value and it means no limit.
format: int32
minimum: 0
type: integer
nodeDrainPolicy:
description: Node drain policy during remediation workflow execution
Expand Down
237 changes: 164 additions & 73 deletions docs/npd/node-problem-detector.md

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions docs/specialized_networks/airgapped-install-openshift.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ mirror:
- name: docker.io/rocm/rocm-terminal:latest
- name: docker.io/rocm/k8s-device-plugin:latest
- name: docker.io/rocm/k8s-node-labeller:latest
- name: quay.io/argoproj/workflow-controller:v3.6.5
# adjust RHEL version and ROCm version if needed for source image
# image tag format for CoreOS is coreos-<RHEL version>-<ROCm version>
- name: docker.io/rocm/amdgpu-driver:coreos-9.6-7.0.2
Expand Down
3 changes: 3 additions & 0 deletions docs/specialized_networks/airgapped-install.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@ quay.io/jetstack/cert-manager-controller:v1.15.1
quay.io/jetstack/cert-manager-webhook:v1.15.1
quay.io/jetstack/cert-manager-cainjector:v1.15.1
quay.io/jetstack/cert-manager-acmesolver:v1.15.1

# Argo workflow controller image (for Auto Node Remediation)
quay.io/argoproj/workflow-controller:v3.6.5
```

### Required RPM/DEB Packages
Expand Down
2 changes: 1 addition & 1 deletion helm-charts-k8s/Chart.lock
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,4 @@ dependencies:
repository: file://./charts/remediation-crds
version: v1.0.0
digest: sha256:4c6b1f3224839e54d1523759be597d20ca2fc6508eb17fda2992a95a00e1fd70
generated: "2026-03-05T00:12:04.97865576Z"
generated: "2026-03-10T05:41:14.760044744Z"
3 changes: 3 additions & 0 deletions helm-charts-k8s/crds/deviceconfig-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1531,9 +1531,12 @@ spec:
enable if operator should automatically handle remediation of node incase of gpu issues
type: boolean
maxParallelWorkflows:
default: 0
description: MaxParallelWorkflows specifies limit on how many remediation
workflows can be executed in parallel. 0 is the default value
and it means no limit.
format: int32
minimum: 0
type: integer
nodeDrainPolicy:
description: Node drain policy during remediation workflow execution
Expand Down
4 changes: 2 additions & 2 deletions internal/controllers/remediation/scripts/applylabels.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
set -e
NODE_NAME="{{inputs.parameters.node_name}}"
NODE_LABELS="{{inputs.parameters.node_labels}}"
NODE_NAME='{{inputs.parameters.node_name}}'
NODE_LABELS='{{inputs.parameters.node_labels}}'

# Check if jq is installed
if ! command -v jq &> /dev/null; then
Expand Down
5 changes: 2 additions & 3 deletions internal/controllers/remediation/scripts/drain.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
set -e
echo "Fetching node name..."
NODE_NAME="{{inputs.parameters.node_name}}"
DRAIN_POLICY="{{inputs.parameters.drain_policy}}"
NODE_NAME='{{inputs.parameters.node_name}}'
DRAIN_POLICY='{{inputs.parameters.drain_policy}}'

# Check if jq is installed
if ! command -v jq &> /dev/null; then
Expand Down
6 changes: 3 additions & 3 deletions internal/controllers/remediation/scripts/notify.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
set -e
NODE_NAME="{{inputs.parameters.nodeName}}"
NOTIFY_MESSAGE="{{inputs.parameters.notifyMessage}}"
EVENT_NAME="{{inputs.parameters.eventName}}"
NODE_NAME='{{inputs.parameters.nodeName}}'
NOTIFY_MESSAGE='{{inputs.parameters.notifyMessage}}'
EVENT_NAME='{{inputs.parameters.eventName}}'

kubectl create -f - <<EOF
apiVersion: v1
Expand Down
4 changes: 2 additions & 2 deletions internal/controllers/remediation/scripts/removelabels.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
set -e
NODE_NAME="{{inputs.parameters.node_name}}"
NODE_LABELS="{{inputs.parameters.node_labels}}"
NODE_NAME='{{inputs.parameters.node_name}}'
NODE_LABELS='{{inputs.parameters.node_labels}}'

# Check if jq is installed
if ! command -v jq &> /dev/null; then
Expand Down
4 changes: 2 additions & 2 deletions internal/controllers/remediation/scripts/taint.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
set -e
NODE_NAME="{{inputs.parameters.node_name}}"
NODE_TAINTS="{{inputs.parameters.node_taints}}"
NODE_NAME='{{inputs.parameters.node_name}}'
NODE_TAINTS='{{inputs.parameters.node_taints}}'

# Check if jq is installed
if ! command -v jq &> /dev/null; then
Expand Down
26 changes: 13 additions & 13 deletions internal/controllers/remediation/scripts/test.sh
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
set -e
NODE_NAME="{{inputs.parameters.node_name}}"
NODE_NAME='{{inputs.parameters.node_name}}'
JOB_NAME="{{workflow.name}}-test-run"
CM_NAME="{{workflow.name}}-test-configmap"
FRAMEWORK="{{inputs.parameters.framework}}"
RECIPE="{{inputs.parameters.recipe}}"
ITERATIONS="{{inputs.parameters.iterations}}"
STOPONFAILURE="{{inputs.parameters.stopOnFailure}}"
TIMEOUTSECONDS="{{inputs.parameters.timeoutSeconds}}"
TESTRUNNERIMAGE="{{inputs.parameters.testRunnerImage}}"
TESTRUNNERSA="{{inputs.parameters.testRunnerServiceAccount}}"
NAMESPACE="{{inputs.parameters.namespace}}"
INITCONTAINERIMAGE="{{inputs.parameters.initContainerImage}}"
WFNAME="{{workflow.name}}"
WFUID="{{workflow.uid}}"
CM_NAME='{{workflow.name}}-test-configmap'
FRAMEWORK='{{inputs.parameters.framework}}'
RECIPE='{{inputs.parameters.recipe}}'
ITERATIONS='{{inputs.parameters.iterations}}'
STOPONFAILURE='{{inputs.parameters.stopOnFailure}}'
TIMEOUTSECONDS='{{inputs.parameters.timeoutSeconds}}'
TESTRUNNERIMAGE='{{inputs.parameters.testRunnerImage}}'
TESTRUNNERSA='{{inputs.parameters.testRunnerServiceAccount}}'
NAMESPACE='{{inputs.parameters.namespace}}'
INITCONTAINERIMAGE='{{inputs.parameters.initContainerImage}}'
WFNAME='{{workflow.name}}'
WFUID='{{workflow.uid}}'

if [ -z "$FRAMEWORK" ] || [ -z "$RECIPE" ] || [ -z "$ITERATIONS" ] || [ -z "$STOPONFAILURE" ] || [ -z "$TIMEOUTSECONDS" ]; then
echo "Validation profile incomplete, skipping configmap and job creation. Please enter framework, recipe, iterations, stopOnFailure, timeoutSeconds as per testrunner requirements"
Expand Down
4 changes: 2 additions & 2 deletions internal/controllers/remediation/scripts/untaint.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
set -e
NODE_NAME="{{inputs.parameters.node_name}}"
NODE_TAINTS="{{inputs.parameters.node_taints}}"
NODE_NAME='{{inputs.parameters.node_name}}'
NODE_TAINTS='{{inputs.parameters.node_taints}}'

# Check if jq is installed
if ! command -v jq &> /dev/null; then
Expand Down
12 changes: 7 additions & 5 deletions internal/controllers/remediation/scripts/wait.sh
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
set -e
NODE_NAME="{{inputs.parameters.node_name}}"
echo "Waiting for {{inputs.parameters.node_condition}} condition to be False on node $NODE_NAME for 2 consecutive minutes (timeout: 15 minutes)"
NODE_NAME='{{inputs.parameters.node_name}}'
NODE_CONDITION='{{inputs.parameters.node_condition}}'

echo "Waiting for $NODE_CONDITION condition to be False on node $NODE_NAME for 2 consecutive minutes (timeout: 15 minutes)"
STABLE_COUNT=0
TOTAL_WAIT=0
while [ "$TOTAL_WAIT" -lt 15 ]; do
STATUS=$(kubectl get node "$NODE_NAME" -o jsonpath="{.status.conditions[?(@.type=='{{inputs.parameters.node_condition}}')].status}")
echo "[$(date)] {{inputs.parameters.node_condition}} status: $STATUS"
STATUS=$(kubectl get node "$NODE_NAME" -o jsonpath="{.status.conditions[?(@.type==\"$NODE_CONDITION\")].status}")
echo "[$(date)] $NODE_CONDITION status: $STATUS"
if [ "$STATUS" = "False" ]; then
STABLE_COUNT=$((STABLE_COUNT + 1))
echo "Condition is stable (False) for $STABLE_COUNT minute(s)"
Expand All @@ -20,5 +22,5 @@ while [ "$TOTAL_WAIT" -lt 15 ]; do
sleep 60
TOTAL_WAIT=$((TOTAL_WAIT + 1))
done
echo "{{inputs.parameters.node_condition}} did not remain False for 2 consecutive minutes within 15 minutes. Exiting with failure."
echo "$NODE_CONDITION did not remain False for 2 consecutive minutes within 15 minutes. Exiting with failure."
exit 1
9 changes: 7 additions & 2 deletions internal/controllers/remediation_handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,11 @@ func (n *remediationMgr) HandleRemediation(ctx context.Context, devConfig *amdv1

// HandleDelete handles the delete operations during remediation process
func (n *remediationMgr) HandleDelete(ctx context.Context, deviceConfig *amdv1alpha1.DeviceConfig, nodeList *v1.NodeList) (res ctrl.Result, err error) {
res = ctrl.Result{Requeue: true, RequeueAfter: time.Second * 20}
remediationDisabled, err := n.helper.isRemediationDisabled(ctx, deviceConfig)
if err != nil || remediationDisabled {
return res, err
}

wfList, err := n.helper.getWorkflowList(ctx, deviceConfig.Namespace)
if err != nil {
Expand Down Expand Up @@ -337,7 +342,7 @@ type remediationMgrHelper struct {
k8sInterface kubernetes.Interface
recoveryTracker *sync.Map
serviceAccountName string
maxParallelWorkflows int
maxParallelWorkflows int32
tolerationsCache *sync.Map
}

Expand Down Expand Up @@ -968,7 +973,7 @@ func (h *remediationMgrHelper) updateMaxParallelWorkflows(ctx context.Context, d
}
// Update parallelism in Argo workflow controller configmap.
// https://github.com/argoproj/argo-workflows/blob/main/config/config.go#L69
acm.Data["namespaceParallelism"] = strconv.Itoa(devConfig.Spec.RemediationWorkflow.MaxParallelWorkflows)
acm.Data["namespaceParallelism"] = strconv.Itoa(int(devConfig.Spec.RemediationWorkflow.MaxParallelWorkflows))
return h.client.Update(ctx, acm)
})
if err != nil {
Expand Down
46 changes: 42 additions & 4 deletions internal/validator/specValidators.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,12 @@ package validator
import (
"context"
"fmt"
"time"
"strings"

amdv1alpha1 "github.com/ROCm/gpu-operator/api/v1alpha1"
utils "github.com/ROCm/gpu-operator/internal"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/validation"
"sigs.k8s.io/controller-runtime/pkg/client"
)

Expand Down Expand Up @@ -176,6 +178,32 @@ func ValidateDevicePluginSpec(ctx context.Context, client client.Client, devConf
return nil
}

// validateTaintEffect returns nil when effect is NoSchedule, PreferNoSchedule
// or NoExecute, and an error naming the offending value otherwise.
func validateTaintEffect(effect v1.TaintEffect) error {
	switch effect {
	case v1.TaintEffectNoSchedule, v1.TaintEffectPreferNoSchedule, v1.TaintEffectNoExecute:
		return nil
	default:
		return fmt.Errorf("unsupported taint effect %v", effect)
	}
}

// checkTaintValidation verifies a single taint from the remediation spec.
//
// The key must pass validation.IsQualifiedName. The value and the effect are
// both optional: a non-empty value must pass validation.IsValidLabelValue and
// a non-empty effect must be accepted by validateTaintEffect. The first
// failing check is returned as an error; nil means the taint passed all of
// them.
func checkTaintValidation(taint v1.Taint) error {
	if keyErrs := validation.IsQualifiedName(taint.Key); len(keyErrs) != 0 {
		return fmt.Errorf("invalid taint key: %s", strings.Join(keyErrs, "; "))
	}

	if taint.Value != "" {
		valueErrs := validation.IsValidLabelValue(taint.Value)
		if len(valueErrs) != 0 {
			return fmt.Errorf("invalid taint value: %s", strings.Join(valueErrs, "; "))
		}
	}

	// An unset effect is accepted as-is; only explicit effects are checked.
	if taint.Effect == "" {
		return nil
	}
	return validateTaintEffect(taint.Effect)
}

func ValidateRemediationWorkflowSpec(ctx context.Context, client client.Client, devConfig *amdv1alpha1.DeviceConfig) error {
rSpec := devConfig.Spec.RemediationWorkflow

Expand All @@ -189,9 +217,19 @@ func ValidateRemediationWorkflowSpec(ctx context.Context, client client.Client,
}
}

if rSpec.TtlForFailedWorkflows != "" {
if _, err := time.ParseDuration(rSpec.TtlForFailedWorkflows); err != nil {
return fmt.Errorf("parsing ttlForFailedWorkflows: %v", err)
for key, value := range rSpec.NodeRemediationLabels {
if len(validation.IsQualifiedName(key)) > 0 {
return fmt.Errorf("invalid label key: %s", key)
}
if len(validation.IsValidLabelValue(value)) > 0 {
return fmt.Errorf("invalid label value: %s", value)
}
}

for _, taint := range rSpec.NodeRemediationTaints {
err := checkTaintValidation(taint)
if err != nil {
return err
}
}

Expand Down
51 changes: 51 additions & 0 deletions tests/e2e/client/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ type DeviceConfigsInterface interface {
PatchMetricsExporterImage(config *v1alpha1.DeviceConfig) (*v1alpha1.DeviceConfig, error)
PatchDRADriverEnablement(config *v1alpha1.DeviceConfig) (*v1alpha1.DeviceConfig, error)
PatchDevicePluginEnablement(config *v1alpha1.DeviceConfig) (*v1alpha1.DeviceConfig, error)
PatchAutoRemediationEnablement(config *v1alpha1.DeviceConfig) (*v1alpha1.DeviceConfig, error)
PatchRemediationWorkflowSpec(config *v1alpha1.DeviceConfig, patch map[string]interface{}) (*v1alpha1.DeviceConfig, error)
Get(name string, options metav1.GetOptions) (*v1alpha1.DeviceConfig, error)
Delete(name string) (*v1alpha1.DeviceConfig, error)
}
Expand Down Expand Up @@ -425,6 +427,55 @@ func (c *deviceConfigsClient) PatchDevicePluginEnablement(devCfg *v1alpha1.Devic
return &result, err
}

// PatchAutoRemediationEnablement sends a JSON merge patch that sets
// spec.remediationWorkflow.enable on the DeviceConfig identified by
// devCfg.Namespace and devCfg.Name to the value held in
// devCfg.Spec.RemediationWorkflow.Enable, and returns the object decoded from
// the API server's response.
//
// A nil Enable pointer is marshalled as JSON null rather than dereferenced,
// so the call no longer panics; in a JSON merge patch a null member removes
// the field from the target.
//
// As a side effect devCfg.TypeMeta is overwritten with the DeviceConfig kind
// and API version. The returned pointer is never nil, but its contents are
// only meaningful when the returned error is nil.
func (c *deviceConfigsClient) PatchAutoRemediationEnablement(devCfg *v1alpha1.DeviceConfig) (*v1alpha1.DeviceConfig, error) {
	result := v1alpha1.DeviceConfig{}
	devCfg.TypeMeta = metav1.TypeMeta{
		Kind:       "DeviceConfig",
		APIVersion: "amd.com/v1alpha1",
	}

	patch := map[string]interface{}{
		"spec": map[string]interface{}{
			"remediationWorkflow": map[string]interface{}{
				// Pass the pointer itself: encoding/json emits the pointed-to
				// bool, or null when the pointer is nil.
				"enable": devCfg.Spec.RemediationWorkflow.Enable,
			},
		},
	}
	patchBytes, err := json.Marshal(patch)
	if err != nil {
		// Do not send a request with an empty body.
		return &result, err
	}

	err = c.restClient.
		Patch(types.MergePatchType).
		Namespace(devCfg.Namespace).
		Resource("deviceConfigs").
		Name(devCfg.Name).
		Body(patchBytes).
		Do(context.TODO()).
		Into(&result)

	return &result, err
}

// PatchRemediationWorkflowSpec applies the caller-supplied patch document as a
// JSON merge patch to the DeviceConfig identified by devCfg.Namespace and
// devCfg.Name, and returns the object decoded from the API server's response.
//
// patch is marshalled verbatim, so the caller is responsible for its shape.
// If it contains a value encoding/json cannot encode, the marshal error is
// returned and no request is sent.
//
// As a side effect devCfg.TypeMeta is overwritten with the DeviceConfig kind
// and API version. The returned pointer is never nil, but its contents are
// only meaningful when the returned error is nil.
func (c *deviceConfigsClient) PatchRemediationWorkflowSpec(devCfg *v1alpha1.DeviceConfig, patch map[string]interface{}) (*v1alpha1.DeviceConfig, error) {
	result := v1alpha1.DeviceConfig{}
	devCfg.TypeMeta = metav1.TypeMeta{
		Kind:       "DeviceConfig",
		APIVersion: "amd.com/v1alpha1",
	}

	patchBytes, err := json.Marshal(patch)
	if err != nil {
		// Previously the error was dropped and an empty body was sent.
		return &result, err
	}

	err = c.restClient.
		Patch(types.MergePatchType).
		Namespace(devCfg.Namespace).
		Resource("deviceConfigs").
		Name(devCfg.Name).
		Body(patchBytes).
		Do(context.TODO()).
		Into(&result)

	return &result, err
}

func (c *deviceConfigsClient) Delete(name string) (*v1alpha1.DeviceConfig, error) {
result := v1alpha1.DeviceConfig{}
err := c.restClient.
Expand Down
Loading
Loading