From 17bd373aa5880e9fc5b413cd5ca51440581a84f4 Mon Sep 17 00:00:00 2001 From: Yan Sun Date: Thu, 12 Mar 2026 06:51:22 -0700 Subject: [PATCH] [Fix] fix the AutoNodeRemediation pods check in OpenShift (#1187) Signed-off-by: yansun1996 (cherry picked from commit bb8b6414ca1a512b11aee6446b67a4a29dc36023) --- .../controllers/device_config_reconciler.go | 2 +- internal/controllers/remediation_handler.go | 40 +++++++++++-------- 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/internal/controllers/device_config_reconciler.go b/internal/controllers/device_config_reconciler.go index c2903210..65332e27 100644 --- a/internal/controllers/device_config_reconciler.go +++ b/internal/controllers/device_config_reconciler.go @@ -104,7 +104,7 @@ func NewDeviceConfigReconciler( workerMgr workermgr.WorkerMgrAPI, isOpenShift bool) *DeviceConfigReconciler { upgradeMgrHandler := newUpgradeMgrHandler(client, k8sConfig, isOpenShift) - remediationMgrHandler := newRemediationMgrHandler(client, k8sConfig) + remediationMgrHandler := newRemediationMgrHandler(client, k8sConfig, isOpenShift) helper := newDeviceConfigReconcilerHelper(client, kmmHandler, dpHandler, nlHandler, upgradeMgrHandler, remediationMgrHandler, metricsHandler, testrunnerHandler, configmanagerHandler, workerMgr) podEventHandler := watchers.NewPodEventHandler(client, workerMgr) nodeEventHandler := watchers.NewNodeEventHandler(client, workerMgr) diff --git a/internal/controllers/remediation_handler.go b/internal/controllers/remediation_handler.go index 6013ad5e..760f59ec 100644 --- a/internal/controllers/remediation_handler.go +++ b/internal/controllers/remediation_handler.go @@ -128,13 +128,13 @@ type remediationMgrAPI interface { HandleDelete(ctx context.Context, deviceConfig *amdv1alpha1.DeviceConfig, nodes *v1.NodeList) (ctrl.Result, error) } -func newRemediationMgrHandler(client client.Client, k8sConfig *rest.Config) remediationMgrAPI { +func newRemediationMgrHandler(client client.Client, k8sConfig *rest.Config, isOpenShift bool) remediationMgrAPI { k8sIntf, err := kubernetes.NewForConfig(k8sConfig) if err != nil { return nil } return &remediationMgr{ - helper: newRemediationMgrHelperHandler(client, k8sIntf), + helper: newRemediationMgrHelperHandler(client, k8sIntf, isOpenShift), } } @@ -344,15 +344,17 @@ type remediationMgrHelper struct { serviceAccountName string maxParallelWorkflows int32 tolerationsCache *sync.Map + isOpenShift bool } // Initialize remediation manager helper interface -func newRemediationMgrHelperHandler(client client.Client, k8sInterface kubernetes.Interface) remediationMgrHelperAPI { +func newRemediationMgrHelperHandler(client client.Client, k8sInterface kubernetes.Interface, isOpenShift bool) remediationMgrHelperAPI { return &remediationMgrHelper{ client: client, k8sInterface: k8sInterface, recoveryTracker: new(sync.Map), tolerationsCache: new(sync.Map), + isOpenShift: isOpenShift, } } @@ -378,23 +380,27 @@ func (h *remediationMgrHelper) isRemediationDisabled(ctx context.Context, devCon return true, nil } - podList := &v1.PodList{} - if err := h.client.List(ctx, podList, client.InNamespace(devConfig.Namespace)); err != nil { - logger.Error(err, "failed to list pods") - return false, err - } + // Skip workflow controller pod check for OpenShift clusters + // in OpenShift the argo workflow pods need to be allowed in different namespaces and checking for the controller pod in the operator namespace may not be sufficient to determine if workflow controller is present or not + if !h.isOpenShift { + podList := &v1.PodList{} + if err := h.client.List(ctx, podList, client.InNamespace(devConfig.Namespace)); err != nil { + logger.Error(err, "failed to list pods") + return false, err + } - found := false - for _, pod := range podList.Items { - if strings.HasPrefix(pod.Name, "amd-gpu-operator-workflow-controller") { - found = true - break + found := false + for _, pod := range podList.Items { + if strings.HasPrefix(pod.Name, "amd-gpu-operator-workflow-controller") { + found = true + break + } } - } - if !found { - logger.Info("Workflow controller pod not found. Please check if it was disabled during bringup, skipping remediation") - return true, nil + if !found { + logger.Info("Workflow controller pod not found. Please check if it was disabled during bringup, skipping remediation") + return true, nil + } } return false, nil }