Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion internal/controllers/device_config_reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ func NewDeviceConfigReconciler(
workerMgr workermgr.WorkerMgrAPI,
isOpenShift bool) *DeviceConfigReconciler {
upgradeMgrHandler := newUpgradeMgrHandler(client, k8sConfig, isOpenShift)
remediationMgrHandler := newRemediationMgrHandler(client, k8sConfig)
remediationMgrHandler := newRemediationMgrHandler(client, k8sConfig, isOpenShift)
helper := newDeviceConfigReconcilerHelper(client, kmmHandler, dpHandler, nlHandler, upgradeMgrHandler, remediationMgrHandler, metricsHandler, testrunnerHandler, configmanagerHandler, workerMgr)
podEventHandler := watchers.NewPodEventHandler(client, workerMgr)
nodeEventHandler := watchers.NewNodeEventHandler(client, workerMgr)
Expand Down
40 changes: 23 additions & 17 deletions internal/controllers/remediation_handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,13 +128,13 @@ type remediationMgrAPI interface {
HandleDelete(ctx context.Context, deviceConfig *amdv1alpha1.DeviceConfig, nodes *v1.NodeList) (ctrl.Result, error)
}

func newRemediationMgrHandler(client client.Client, k8sConfig *rest.Config) remediationMgrAPI {
func newRemediationMgrHandler(client client.Client, k8sConfig *rest.Config, isOpenShift bool) remediationMgrAPI {
k8sIntf, err := kubernetes.NewForConfig(k8sConfig)
if err != nil {
return nil
}
return &remediationMgr{
helper: newRemediationMgrHelperHandler(client, k8sIntf),
helper: newRemediationMgrHelperHandler(client, k8sIntf, isOpenShift),
}
}

Expand Down Expand Up @@ -344,15 +344,17 @@ type remediationMgrHelper struct {
serviceAccountName string
maxParallelWorkflows int32
tolerationsCache *sync.Map
isOpenShift bool
}

// Initialize remediation manager helper interface
func newRemediationMgrHelperHandler(client client.Client, k8sInterface kubernetes.Interface) remediationMgrHelperAPI {
func newRemediationMgrHelperHandler(client client.Client, k8sInterface kubernetes.Interface, isOpenShift bool) remediationMgrHelperAPI {
return &remediationMgrHelper{
client: client,
k8sInterface: k8sInterface,
recoveryTracker: new(sync.Map),
tolerationsCache: new(sync.Map),
isOpenShift: isOpenShift,
}
}

Expand All @@ -378,23 +380,27 @@ func (h *remediationMgrHelper) isRemediationDisabled(ctx context.Context, devCon
return true, nil
}

podList := &v1.PodList{}
if err := h.client.List(ctx, podList, client.InNamespace(devConfig.Namespace)); err != nil {
logger.Error(err, "failed to list pods")
return false, err
}
// Skip the workflow controller pod check on OpenShift clusters.
// On OpenShift, the Argo workflow pods need to be allowed in different namespaces, so checking for the controller pod in the operator namespace may not be sufficient to determine whether the workflow controller is present.
if !h.isOpenShift {
podList := &v1.PodList{}
if err := h.client.List(ctx, podList, client.InNamespace(devConfig.Namespace)); err != nil {
logger.Error(err, "failed to list pods")
return false, err
}

found := false
for _, pod := range podList.Items {
if strings.HasPrefix(pod.Name, "amd-gpu-operator-workflow-controller") {
found = true
break
found := false
for _, pod := range podList.Items {
if strings.HasPrefix(pod.Name, "amd-gpu-operator-workflow-controller") {
found = true
break
}
}
}

if !found {
logger.Info("Workflow controller pod not found. Please check if it was disabled during bringup, skipping remediation")
return true, nil
if !found {
logger.Info("Workflow controller pod not found. Please check if it was disabled during bringup, skipping remediation")
return true, nil
}
}
return false, nil
}
Expand Down
Loading