From 2390cb1a4b1f6a4f1c973254577948c3311de3e7 Mon Sep 17 00:00:00 2001 From: Yan Sun Date: Mon, 16 Mar 2026 15:20:30 -0700 Subject: [PATCH] [Feature] Add support to disable watch for KMM resources to totally disable KMM for users (#1171) Signed-off-by: yansun1996 (cherry picked from commit 3733e127bd13be9464a74e24a7aa4125c5d27f46) --- README.md | 1 + cmd/main.go | 25 ++++- docs/installation/kubernetes-helm.md | 8 +- docs/releasenotes.md | 12 +++ hack/k8s-patch/metadata-patch/values.yaml | 4 +- hack/k8s-patch/template-patch/deployment.yaml | 2 + helm-charts-k8s/README.md | 4 +- helm-charts-k8s/templates/deployment.yaml | 2 + helm-charts-k8s/values.yaml | 4 +- .../controllers/device_config_reconciler.go | 95 ++++++++++++++----- .../device_config_reconciler_test.go | 52 ++++++++-- internal/kmmmodule/kmmmodule.go | 19 ++++ .../e2e/yamls/charts/gpu-operator/values.yaml | 3 +- tests/helm-e2e/helm_e2e_test.go | 55 +++++++++++ 14 files changed, 250 insertions(+), 36 deletions(-) diff --git a/README.md b/README.md index d89c7a4ad..dd3cb401d 100644 --- a/README.md +++ b/README.md @@ -78,6 +78,7 @@ helm install amd-gpu-operator rocm/gpu-operator-charts \ Installation Options - Skip NFD installation: `--set node-feature-discovery.enabled=false` - Skip KMM installation: `--set kmm.enabled=false` + - Disable KMM watching/usage: `--set kmm.watch=false` - Skip Auto Node Remediation: `--set remediation.enabled=false` - Enable DRA driver (instead of device plugin): `--set deviceConfig.spec.draDriver.enable=true --set deviceConfig.spec.devicePlugin.enableDevicePlugin=false` - Disable DeviceClass creation: `--set draDriver.deviceClass.create=false` diff --git a/cmd/main.go b/cmd/main.go index 4407ccbae..ff147bdb3 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -34,6 +34,8 @@ package main import ( "flag" + "os" + "strconv" workflowv1alpha1 "github.com/argoproj/argo-workflows/v3/pkg/apis/workflow/v1alpha1" kmmv1beta1 "github.com/rh-ecosystem-edge/kernel-module-management/api/v1beta1" @@ 
-98,7 +100,18 @@ func main() { setupLogger := logger.WithName("setup") - setupLogger.Info("Creating manager", "version", Version, "git commit", GitCommit, "build tag", BuildTag) + // Read KMM watch configuration from environment variable + kmmWatchEnabled := true // default to true for backward compatibility + if kmmWatchEnv := os.Getenv("KMM_WATCH_ENABLED"); kmmWatchEnv != "" { + var err error + kmmWatchEnabled, err = strconv.ParseBool(kmmWatchEnv) + if err != nil { + setupLogger.Error(err, "invalid KMM_WATCH_ENABLED value, defaulting to true", "value", kmmWatchEnv) + kmmWatchEnabled = true + } + } + + setupLogger.Info("Creating manager", "version", Version, "git commit", GitCommit, "build tag", BuildTag, "kmmWatchEnabled", kmmWatchEnabled) setupLogger.Info("Parsing configuration file", "path", configFile) @@ -118,7 +131,12 @@ func main() { // Use manager's client, it may read from a cache. client := mgr.GetClient() isOpenShift := utils.IsOpenShift(setupLogger) - kmmHandler := kmmmodule.NewKMMModule(client, scheme, isOpenShift) + var kmmHandler kmmmodule.KMMModuleAPI + if kmmWatchEnabled { + kmmHandler = kmmmodule.NewKMMModule(client, scheme, isOpenShift) + } else { + kmmHandler = kmmmodule.NewNoOpKMMModule() + } dpHandler := plugin.NewDevicePlugin(client, scheme, isOpenShift) nlHandler := nodelabeller.NewNodeLabeller(scheme, isOpenShift) metricsHandler := metricsexporter.NewMetricsExporter(scheme) @@ -135,7 +153,8 @@ func main() { testrunnerHandler, configmanagerHandler, workerMgr, - isOpenShift) + isOpenShift, + kmmWatchEnabled) if err = dcr.SetupWithManager(mgr); err != nil { cmd.FatalError(setupLogger, err, "unable to create controller", "name", controllers.DeviceConfigReconcilerName) } diff --git a/docs/installation/kubernetes-helm.md b/docs/installation/kubernetes-helm.md index 78f5d301d..b829c6318 100644 --- a/docs/installation/kubernetes-helm.md +++ b/docs/installation/kubernetes-helm.md @@ -126,8 +126,14 @@ helm install amd-gpu-operator 
rocm/gpu-operator-charts \ Installation Options - Skip NFD installation: `--set node-feature-discovery.enabled=false` - Skip KMM installation: `--set kmm.enabled=false`
Although KMM is a [Kubernetes-SIGs](https://github.com/kubernetes-sigs) maintained project, it is strongly recommended to use AMD optimized and published KMM images included in each operator release. + - Disable GPU operator watching KMM resources: `--set kmm.watch=false` - Skip Auto Node Remediation: `--set remediation.enabled=false` - Disable default DeviceConfig installation: `--set crds.defaultCR.install=false` + +**KMM Configuration Examples:** + - Use existing KMM installation: `--set kmm.enabled=false --set kmm.watch=true` + - Skip KMM for alternative driver management: `--set kmm.enabled=false --set kmm.watch=false` + - Install KMM without GPU operator using it: `--set kmm.enabled=true --set kmm.watch=false` ``` ```{tip} @@ -167,6 +173,7 @@ The following parameters are able to be configued when using the Helm Chart. In | controllerManager.nodeSelector | object | `{}` | Node selector for AMD GPU operator controller manager deployment | | installdefaultNFDRule | bool | `true` | Set to true to install default NFD rule for detecting AMD GPU hardware based on pci vendor ID and device ID | | kmm.enabled | bool | `true` | Set to true/false to enable/disable the installation of kernel module management (KMM) operator | +| kmm.watch | bool | `true` | Set to true/false to enable/disable GPU operator watching and using KMM resources | | node-feature-discovery.enabled | bool | `true` | Set to true/false to enable/disable the installation of node feature discovery (NFD) operator | | upgradeCRD | bool | `true` | CRD will be patched as pre-upgrade/pre-rollback hook when doing helm upgrade/rollback to current helm chart | | kmm.controller.affinity | object | `{"nodeAffinity":{"preferredDuringSchedulingIgnoredDuringExecution":[{"preference":{"matchExpressions":[{"key":"node-role.kubernetes.io/control-plane","operator":"Exists"}]},"weight":1}]}}` | Affinity for the KMM controller manager deployment | @@ -206,7 +213,6 @@ The following parameters are able to be configued when 
using the Helm Chart. In | kmm.kubernetesClusterDomain | string | `"cluster.local"` | | | kmm.managerConfig.controllerConfigYaml | string | `"healthProbeBindAddress: :8081\nwebhookPort: 9443\nleaderElection:\n enabled: true\n resourceID: kmm.sigs.x-k8s.io\nmetrics:\n enableAuthnAuthz: true\n bindAddress: 0.0.0.0:8443\n secureServing: true\nworker:\n runAsUser: 0\n seLinuxType: spc_t\n firmwareHostPath: /var/lib/firmware"` | | | kmm.webhookServer.affinity | object | `{"nodeAffinity":{"preferredDuringSchedulingIgnoredDuringExecution":[{"preference":{"matchExpressions":[{"key":"node-role.kubernetes.io/control-plane","operator":"Exists"}]},"weight":1}]}}` | KMM webhook's deployment affinity configs | -| kmm.enabled | bool | `true` | Set to true/false to enable/disable the installation of kernel module management (KMM) operator | | kmm.webhookServer.nodeAffinity.nodeSelectorTerms | list | `[{"key":"node-role.kubernetes.io/control-plane","operator":"Exists"},{"key":"node-role.kubernetes.io/master","operator":"Exists"}]` | Node affinity selector terms config for the KMM webhook deployment, set it to [] if you want to make affinity config empty | | kmm.webhookServer.nodeSelector | object | `{}` | KMM webhook's deployment node selector | | kmm.webhookServer.replicas | int | `1` | | diff --git a/docs/releasenotes.md b/docs/releasenotes.md index 8f21c9088..a19d5d7d0 100644 --- a/docs/releasenotes.md +++ b/docs/releasenotes.md @@ -1,5 +1,17 @@ # Release Notes +## GPU Operator v1.5.0 Release Notes + +### Release Highlights + +- **Enhanced KMM (Kernel Module Management) Configuration Control** + - **Independent Control of KMM Installation and Usage** + - New helm parameters provide separate control over KMM installation and resource watching: + - `kmm.enabled`: Controls KMM subchart installation (default: `true`) + - `kmm.watch`: Controls GPU operator watching KMM resources (default: `true`) + - Supports multiple deployment scenarios: use existing KMM installations 
(`enabled=false, watch=true`), skip KMM entirely for alternative driver solutions (`enabled=false, watch=false`), or install KMM without asking for GPU Operator to use it (`enabled=true, watch=false`) + - Fully backward compatible: existing configurations with `kmm.enabled=false` continue to work without changes + ## GPU Operator v1.4.1 Release Notes The AMD GPU Operator v1.4.1 release extends platform support to OpenShift v4.20 and Debian 12, and introduces the ability to build `amdgpu` kernel modules directly within air-gapped OpenShift clusters. diff --git a/hack/k8s-patch/metadata-patch/values.yaml b/hack/k8s-patch/metadata-patch/values.yaml index 36c6d8da4..e050da3c0 100644 --- a/hack/k8s-patch/metadata-patch/values.yaml +++ b/hack/k8s-patch/metadata-patch/values.yaml @@ -17,8 +17,10 @@ node-feature-discovery: nodeSelector: {} # KMM related configs kmm: - # -- Set to true/false to enable/disable the installation of kernel module management (KMM) operator + # -- Set to true/false to enable/disable the installation of kernel module management (KMM) operator subchart enabled: true + # -- Set to true/false to enable/disable GPU operator watching and using KMM resources + watch: true # Remediation related configs remediation: # -- Set to true/false to enable/disable the installation of remediation workflow controller diff --git a/hack/k8s-patch/template-patch/deployment.yaml b/hack/k8s-patch/template-patch/deployment.yaml index aa79b0c7e..0bd56c848 100644 --- a/hack/k8s-patch/template-patch/deployment.yaml +++ b/hack/k8s-patch/template-patch/deployment.yaml @@ -41,6 +41,8 @@ spec: value: {{ quote .Values.kubernetesClusterDomain }} - name: SIM_ENABLE value: {{ quote .Values.controllerManager.env.simEnable }} + - name: KMM_WATCH_ENABLED + value: {{ quote .Values.kmm.watch }} image: {{ .Values.controllerManager.manager.image.repository }}:{{ .Values.controllerManager.manager.image.tag | default .Chart.AppVersion }} imagePullPolicy: {{ 
.Values.controllerManager.manager.imagePullPolicy }} diff --git a/helm-charts-k8s/README.md b/helm-charts-k8s/README.md index ea2c36681..5aebfd7b3 100644 --- a/helm-charts-k8s/README.md +++ b/helm-charts-k8s/README.md @@ -78,6 +78,7 @@ helm install amd-gpu-operator rocm/gpu-operator-charts \ Installation Options - Skip NFD installation: `--set node-feature-discovery.enabled=false` - Skip KMM installation: `--set kmm.enabled=false` + - Disable KMM watching/usage: `--set kmm.watch=false` - Skip Auto Node Remediation: `--set remediation.enabled=false` - Enable DRA driver (instead of device plugin): `--set deviceConfig.spec.draDriver.enable=true --set deviceConfig.spec.devicePlugin.enableDevicePlugin=false` - Disable DeviceClass creation: `--set draDriver.deviceClass.create=false` @@ -266,7 +267,8 @@ Kubernetes: `>= 1.29.0-0` | deviceConfig.spec.testRunner.upgradePolicy.upgradeStrategy | string | `"RollingUpdate"` | the type of daemonset upgrade, RollingUpdate or OnDelete | | draDriver.deviceClass.create | bool | `true` | Create the gpu.amd.com DeviceClass resource. Set to false if managing the DRA driver independently. 
| | installdefaultNFDRule | bool | `true` | Default NFD rule will detect amd gpu based on pci vendor ID | -| kmm.enabled | bool | `true` | Set to true/false to enable/disable the installation of kernel module management (KMM) operator | +| kmm.enabled | bool | `true` | Set to true/false to enable/disable the installation of kernel module management (KMM) operator subchart | +| kmm.watch | bool | `true` | Set to true/false to enable/disable GPU operator watching and using KMM resources | | node-feature-discovery.enabled | bool | `true` | Set to true/false to enable/disable the installation of node feature discovery (NFD) operator | | node-feature-discovery.worker.nodeSelector | object | `{}` | Set nodeSelector for NFD worker daemonset | | node-feature-discovery.worker.tolerations | list | `[{"effect":"NoExecute","key":"amd-dcm","operator":"Equal","value":"up"},{"effect":"NoSchedule","key":"amd-gpu-unhealthy","operator":"Exists"}]` | Set tolerations for NFD worker daemonset | diff --git a/helm-charts-k8s/templates/deployment.yaml b/helm-charts-k8s/templates/deployment.yaml index aa79b0c7e..0bd56c848 100644 --- a/helm-charts-k8s/templates/deployment.yaml +++ b/helm-charts-k8s/templates/deployment.yaml @@ -41,6 +41,8 @@ spec: value: {{ quote .Values.kubernetesClusterDomain }} - name: SIM_ENABLE value: {{ quote .Values.controllerManager.env.simEnable }} + - name: KMM_WATCH_ENABLED + value: {{ quote .Values.kmm.watch }} image: {{ .Values.controllerManager.manager.image.repository }}:{{ .Values.controllerManager.manager.image.tag | default .Chart.AppVersion }} imagePullPolicy: {{ .Values.controllerManager.manager.imagePullPolicy }} diff --git a/helm-charts-k8s/values.yaml b/helm-charts-k8s/values.yaml index 36c6d8da4..e050da3c0 100644 --- a/helm-charts-k8s/values.yaml +++ b/helm-charts-k8s/values.yaml @@ -17,8 +17,10 @@ node-feature-discovery: nodeSelector: {} # KMM related configs kmm: - # -- Set to true/false to enable/disable the installation of kernel module 
management (KMM) operator + # -- Set to true/false to enable/disable the installation of kernel module management (KMM) operator subchart enabled: true + # -- Set to true/false to enable/disable GPU operator watching and using KMM resources + watch: true # Remediation related configs remediation: # -- Set to true/false to enable/disable the installation of remediation workflow controller diff --git a/internal/controllers/device_config_reconciler.go b/internal/controllers/device_config_reconciler.go index 65332e277..8f9856e25 100644 --- a/internal/controllers/device_config_reconciler.go +++ b/internal/controllers/device_config_reconciler.go @@ -86,6 +86,7 @@ type DeviceConfigReconciler struct { client.Client once sync.Once initErr error + kmmWatchEnabled bool helper deviceConfigReconcilerHelperAPI podEventHandler watchers.PodEventHandlerAPI nodeEventHandler watchers.NodeEventHandlerAPI @@ -102,15 +103,17 @@ func NewDeviceConfigReconciler( testrunnerHandler testrunner.TestRunner, configmanagerHandler configmanager.ConfigManager, workerMgr workermgr.WorkerMgrAPI, - isOpenShift bool) *DeviceConfigReconciler { + isOpenShift bool, + kmmWatchEnabled bool) *DeviceConfigReconciler { upgradeMgrHandler := newUpgradeMgrHandler(client, k8sConfig, isOpenShift) remediationMgrHandler := newRemediationMgrHandler(client, k8sConfig, isOpenShift) - helper := newDeviceConfigReconcilerHelper(client, kmmHandler, dpHandler, nlHandler, upgradeMgrHandler, remediationMgrHandler, metricsHandler, testrunnerHandler, configmanagerHandler, workerMgr) + helper := newDeviceConfigReconcilerHelper(client, kmmHandler, dpHandler, nlHandler, upgradeMgrHandler, remediationMgrHandler, metricsHandler, testrunnerHandler, configmanagerHandler, workerMgr, kmmWatchEnabled) podEventHandler := watchers.NewPodEventHandler(client, workerMgr) nodeEventHandler := watchers.NewNodeEventHandler(client, workerMgr) daemonsetEventHandler := watchers.NewDaemonsetEventHandler(client) return &DeviceConfigReconciler{ Client: 
client, + kmmWatchEnabled: kmmWatchEnabled, helper: helper, podEventHandler: podEventHandler, nodeEventHandler: nodeEventHandler, @@ -120,12 +123,24 @@ func NewDeviceConfigReconciler( // SetupWithManager sets up the controller with the Manager. func (r *DeviceConfigReconciler) SetupWithManager(mgr ctrl.Manager) error { - return ctrl.NewControllerManagedBy(mgr). + controllerBuilder := ctrl.NewControllerManagedBy(mgr). Named(DeviceConfigReconcilerName). // just reconcile the spec change or deletion For(&amdv1alpha1.DeviceConfig{}, builder.WithPredicates(watchers.SpecChangedOrDeletionPredicate{})). - Owns(&v1.Service{}, builder.WithPredicates(watchers.SpecChangedOrDeletionPredicate{})). - Owns(&kmmv1beta1.Module{}). + Owns(&v1.Service{}, builder.WithPredicates(watchers.SpecChangedOrDeletionPredicate{})) + + // Conditionally add KMM watches only if KMM watch is enabled + if r.kmmWatchEnabled { + controllerBuilder = controllerBuilder. + Owns(&kmmv1beta1.Module{}). + Watches( // watch NMC for upgrademgr + &kmmv1beta1.NodeModulesConfig{}, + handler.EnqueueRequestsFromMapFunc(r.helper.findDeviceConfigsForNMC), + builder.WithPredicates(predicate.ResourceVersionChangedPredicate{}), + ) + } + + return controllerBuilder. Watches( // watch for owned daemonset, only update status &appsv1.DaemonSet{}, r.daemonsetEventHandler, @@ -139,11 +154,6 @@ func (r *DeviceConfigReconciler) SetupWithManager(mgr ctrl.Manager) error { r.nodeEventHandler, builder.WithPredicates(watchers.NodePredicate{}), ). - Watches( // watch NMC for upgrademgr - &kmmv1beta1.NodeModulesConfig{}, - handler.EnqueueRequestsFromMapFunc(r.helper.findDeviceConfigsForNMC), - builder.WithPredicates(predicate.ResourceVersionChangedPredicate{}), - ). 
Watches( // watch pod event to auto-clean unknown status builder pod and cleanup post-process worker pod &v1.Pod{}, r.podEventHandler, @@ -380,6 +390,7 @@ type deviceConfigReconcilerHelperAPI interface { type deviceConfigReconcilerHelper struct { client client.Client + kmmWatchEnabled bool kmmHandler kmmmodule.KMMModuleAPI devicePluginHandler plugin.DevicePluginAPI nlHandler nodelabeller.NodeLabeller @@ -405,11 +416,13 @@ func newDeviceConfigReconcilerHelper(client client.Client, metricsHandler metricsexporter.MetricsExporter, testrunnerHandler testrunner.TestRunner, configmanagerHandler configmanager.ConfigManager, - workerMgr workermgr.WorkerMgrAPI) deviceConfigReconcilerHelperAPI { + workerMgr workermgr.WorkerMgrAPI, + kmmWatchEnabled bool) deviceConfigReconcilerHelperAPI { conditionUpdater := conditions.NewDeviceConfigConditionMgr() validator := validator.NewValidator() return &deviceConfigReconcilerHelper{ client: client, + kmmWatchEnabled: kmmWatchEnabled, kmmHandler: kmmHandler, devicePluginHandler: dpHandler, nlHandler: nlHandler, @@ -426,6 +439,14 @@ func newDeviceConfigReconcilerHelper(client client.Client, } } +// shouldUseKMMOperatorLevel checks both operator-level and device-level KMM settings +func (dcrh *deviceConfigReconcilerHelper) shouldUseKMMOperatorLevel(devConfig *amdv1alpha1.DeviceConfig) bool { + if !dcrh.kmmWatchEnabled { + return false // Operator-level: KMM watch is disabled globally + } + return utils.ShouldUseKMM(devConfig) // Device-level: check driver type +} + func (dcrh *deviceConfigReconcilerHelper) shouldReconcile(ctx context.Context, ugpgradeRes, remediationRes ctrl.Result) ctrl.Result { var finalRes ctrl.Result switch { @@ -520,7 +541,7 @@ func (dcrh *deviceConfigReconcilerHelper) hasSecretReference(secretName string, func (dcrh *deviceConfigReconcilerHelper) buildDeviceConfigStatus(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig, nodes *v1.NodeList) error { // fetch DeviceConfig-owned custom resource // then retrieve 
its status and put it to DeviceConfig's status fields - if utils.ShouldUseKMM(devConfig) { + if dcrh.shouldUseKMMOperatorLevel(devConfig) { kmmModuleObj, err := dcrh.getDeviceConfigOwnedKMMModule(ctx, devConfig) if err != nil { return fmt.Errorf("failed to fetch owned kmm module for DeviceConfig %+v: %+v", @@ -566,9 +587,15 @@ func (dcrh *deviceConfigReconcilerHelper) updateDeviceConfigStatus(ctx context.C } func (dcrh *deviceConfigReconcilerHelper) getDeviceConfigOwnedKMMModule(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig) (*kmmv1beta1.Module, error) { + if !dcrh.kmmWatchEnabled { + return nil, nil + } module := kmmv1beta1.Module{} namespacedName := types.NamespacedName{Namespace: devConfig.Namespace, Name: devConfig.Name} if err := dcrh.client.Get(ctx, namespacedName, &module); err != nil { + if k8serrors.IsNotFound(err) || meta.IsNoMatchError(err) { + return nil, nil + } return nil, fmt.Errorf("failed to get KMM Module %s: %v", namespacedName, err) } return &module, nil @@ -603,10 +630,15 @@ func (dcrh *deviceConfigReconcilerHelper) buildDeviceConfigNodeStatus(ctx contex } devConfig.Status.NodeModuleStatus[node.Name] = amdv1alpha1.ModuleStatus{Status: dcrh.upgradeMgrHandler.GetNodeStatus(node.Name), UpgradeStartTime: upgradeStartTime, BootId: bootId} + if !dcrh.kmmWatchEnabled { + // Skip NMC lookup if KMM watch is disabled + continue + } + nmc := kmmv1beta1.NodeModulesConfig{} err := dcrh.client.Get(ctx, types.NamespacedName{Name: node.Name}, &nmc) if err != nil { - if !k8serrors.IsNotFound(err) { + if !k8serrors.IsNotFound(err) && !meta.IsNoMatchError(err) { logger.Error(err, fmt.Sprintf("failed to fetch NMC for node %+v", node.Name)) } continue @@ -621,7 +653,7 @@ func (dcrh *deviceConfigReconcilerHelper) buildDeviceConfigNodeStatus(ctx contex // need to remove this redundant info to unblock a known issue https://github.com/ROCm/gpu-operator/issues/403 // the driver management is disabled but there is probability that dirver status get stuck in 
Install-In-Progress nodeStatus := amdv1alpha1.UpgradeStateEmpty - if utils.ShouldUseKMM(devConfig) { + if dcrh.shouldUseKMMOperatorLevel(devConfig) { // only assign node driver status value when DeviceConfig is managing drivers nodeStatus = dcrh.upgradeMgrHandler.GetNodeStatus(node.Name) } @@ -864,15 +896,23 @@ func (dcrh *deviceConfigReconcilerHelper) finalizeDeviceConfig(ctx context.Conte if err := utils.UpdateDriverTypeNodeLabel(ctx, dcrh.client, devConfig, nodes, true); err != nil { return fmt.Errorf("failed to remove driver type node label: %+v", err) } + + if !dcrh.kmmWatchEnabled { + // When KMM watch is disabled, skip KMM Module deletion and leave any existing KMM resources orphaned + devConfigCopy := devConfig.DeepCopy() + controllerutil.RemoveFinalizer(devConfig, deviceConfigFinalizer) + return dcrh.client.Patch(ctx, devConfig, client.MergeFrom(devConfigCopy)) + } + mod := kmmv1beta1.Module{} namespacedName = types.NamespacedName{ Namespace: devConfig.Namespace, Name: devConfig.Name, } if err := dcrh.client.Get(ctx, namespacedName, &mod); err != nil { - if k8serrors.IsNotFound(err) { + if k8serrors.IsNotFound(err) || meta.IsNoMatchError(err) { // if KMM module CR is not found - if utils.ShouldUseKMM(devConfig) { + if dcrh.shouldUseKMMOperatorLevel(devConfig) { // when KMM was trigger switch devConfig.Spec.Driver.DriverType { case utils.DriverTypeVFPassthrough: @@ -964,6 +1004,11 @@ func (dcrh *deviceConfigReconcilerHelper) checkPostProcessFinalizeCondition(ctx // findDeviceConfigsForNMC when a NMC changed, only trigger reconcile for related DeviceConfig func (drch *deviceConfigReconcilerHelper) findDeviceConfigsForNMC(ctx context.Context, nmc client.Object) []reconcile.Request { reqs := []reconcile.Request{} + + if !drch.kmmWatchEnabled { + return reqs + } + logger := log.FromContext(ctx) nmcObj, ok := nmc.(*kmmv1beta1.NodeModulesConfig) if !ok { @@ -999,8 +1044,11 @@ func (drch *deviceConfigReconcilerHelper) findDeviceConfigsForNMC(ctx context.Co func 
(dcrh *deviceConfigReconcilerHelper) handleBuildConfigMap(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig, nodes *v1.NodeList) error { logger := log.FromContext(ctx) - if !utils.ShouldUseKMM(devConfig) { - logger.Info("skip handling build config map as KMM driver mode is disabled") + if !dcrh.shouldUseKMMOperatorLevel(devConfig) { + if !dcrh.kmmWatchEnabled && devConfig.Spec.Driver.Enable != nil && *devConfig.Spec.Driver.Enable { + logger.Info("KMM watch is disabled but DeviceConfig requests driver installation - drivers will not be installed", + "namespace", devConfig.Namespace, "name", devConfig.Name) + } return nil } if nodes == nil || len(nodes.Items) == 0 { @@ -1055,7 +1103,7 @@ func (dcrh *deviceConfigReconcilerHelper) handleKMMModule(ctx context.Context, d return err } - if utils.ShouldUseKMM(devConfig) { + if dcrh.shouldUseKMMOperatorLevel(devConfig) { // the newly created KMM Module will always has the same namespace and name as its parent DeviceConfig kmmMod := &kmmv1beta1.Module{ ObjectMeta: metav1.ObjectMeta{ @@ -1072,8 +1120,11 @@ func (dcrh *deviceConfigReconcilerHelper) handleKMMModule(ctx context.Context, d } return err } - logger.Info("skip handling KMM module as KMM driver mode is disabled") - // if driver mode switched from enable to disable + if !dcrh.kmmWatchEnabled && devConfig.Spec.Driver.Enable != nil && *devConfig.Spec.Driver.Enable { + logger.Info("KMM watch is disabled but DeviceConfig requests driver installation - drivers will not be installed", + "namespace", devConfig.Namespace, "name", devConfig.Name) + } + // if driver mode switched from enable to disable or KMM watch is disabled // we won't delete the existing KMM module return nil @@ -1130,7 +1181,7 @@ func (dcrh *deviceConfigReconcilerHelper) handleDRADriver(ctx context.Context, d func (dcrh *deviceConfigReconcilerHelper) handleKMMVersionLabel(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig, nodes *v1.NodeList) error { // label corresponding node with given kmod 
version // so that KMM could manage the upgrade by watching the node's version label change - if utils.ShouldUseKMM(devConfig) { + if dcrh.shouldUseKMMOperatorLevel(devConfig) { err := dcrh.kmmHandler.SetNodeVersionLabelAsDesired(ctx, devConfig, nodes) if err != nil { return fmt.Errorf("failed to update node version label for DeviceConfig %s/%s: %v", devConfig.Namespace, devConfig.Name, err) diff --git a/internal/controllers/device_config_reconciler_test.go b/internal/controllers/device_config_reconciler_test.go index 75d15b49e..9a85b8bae 100644 --- a/internal/controllers/device_config_reconciler_test.go +++ b/internal/controllers/device_config_reconciler_test.go @@ -194,7 +194,7 @@ var _ = Describe("getLabelsPerModules", func() { BeforeEach(func() { ctrl := gomock.NewController(GinkgoT()) kubeClient = mock_client.NewMockClient(ctrl) - dcrh = newDeviceConfigReconcilerHelper(kubeClient, nil, nil, nil, nil, nil, nil, nil, nil, nil) + dcrh = newDeviceConfigReconcilerHelper(kubeClient, nil, nil, nil, nil, nil, nil, nil, nil, nil, true) }) ctx := context.Background() @@ -230,6 +230,46 @@ var _ = Describe("getLabelsPerModules", func() { }) }) +var _ = Describe("deviceConfigReconcilerHelper with KMM watch disabled", func() { + var ( + kubeClient *mock_client.MockClient + dcrh deviceConfigReconcilerHelperAPI + ) + BeforeEach(func() { + ctrl := gomock.NewController(GinkgoT()) + kubeClient = mock_client.NewMockClient(ctrl) + dcrh = newDeviceConfigReconcilerHelper(kubeClient, nil, nil, nil, nil, nil, nil, nil, nil, nil, false) + }) + ctx := context.Background() + nn := types.NamespacedName{ + Name: devConfigName, + Namespace: devConfigNamespace, + } + It("good flow with KMM watch disabled", func() { + expectedDevConfig := amdv1alpha1.DeviceConfig{ + ObjectMeta: metav1.ObjectMeta{ + Name: nn.Name, + Namespace: nn.Namespace, + }, + } + kubeClient.EXPECT().Get(ctx, nn, gomock.Any()).Do( + func(_ interface{}, _ interface{}, devConfig *amdv1alpha1.DeviceConfig, _
...client.GetOption) { + devConfig.Name = nn.Name + devConfig.Namespace = nn.Namespace + }, + ) + res, err := dcrh.getRequestedDeviceConfig(ctx, nn) + Expect(err).ToNot(HaveOccurred()) + Expect(*res).To(Equal(expectedDevConfig)) + }) + It("error flow with KMM watch disabled", func() { + kubeClient.EXPECT().Get(ctx, nn, gomock.Any()).Return(fmt.Errorf("some error")) + res, err := dcrh.getRequestedDeviceConfig(ctx, nn) + Expect(err).To(HaveOccurred()) + Expect(res).To(BeNil()) + }) +}) + var _ = Describe("setFinalizer", func() { var ( kubeClient *mock_client.MockClient @@ -239,7 +279,7 @@ var _ = Describe("setFinalizer", func() { BeforeEach(func() { ctrl := gomock.NewController(GinkgoT()) kubeClient = mock_client.NewMockClient(ctrl) - dcrh = newDeviceConfigReconcilerHelper(kubeClient, nil, nil, nil, nil, nil, nil, nil, nil, nil) + dcrh = newDeviceConfigReconcilerHelper(kubeClient, nil, nil, nil, nil, nil, nil, nil, nil, nil, true) }) ctx := context.Background() @@ -275,7 +315,7 @@ var _ = Describe("finalizeDeviceConfig", func() { BeforeEach(func() { ctrl := gomock.NewController(GinkgoT()) kubeClient = mock_client.NewMockClient(ctrl) - dcrh = newDeviceConfigReconcilerHelper(kubeClient, nil, nil, nil, nil, nil, nil, nil, nil, nil) + dcrh = newDeviceConfigReconcilerHelper(kubeClient, nil, nil, nil, nil, nil, nil, nil, nil, nil, true) }) ctx := context.Background() @@ -486,7 +526,7 @@ var _ = Describe("handleKMMModule", func() { ctrl := gomock.NewController(GinkgoT()) kubeClient = mock_client.NewMockClient(ctrl) kmmHelper = kmmmodule.NewMockKMMModuleAPI(ctrl) - dcrh = newDeviceConfigReconcilerHelper(kubeClient, kmmHelper, nil, nil, nil, nil, nil, nil, nil, nil) + dcrh = newDeviceConfigReconcilerHelper(kubeClient, kmmHelper, nil, nil, nil, nil, nil, nil, nil, nil, true) }) ctx := context.Background() @@ -556,7 +596,7 @@ var _ = Describe("handleBuildConfigMap", func() { ctrl := gomock.NewController(GinkgoT()) kubeClient = mock_client.NewMockClient(ctrl) kmmHelper =
kmmmodule.NewMockKMMModuleAPI(ctrl) - dcrh = newDeviceConfigReconcilerHelper(kubeClient, kmmHelper, nil, nil, nil, nil, nil, nil, nil, nil) + dcrh = newDeviceConfigReconcilerHelper(kubeClient, kmmHelper, nil, nil, nil, nil, nil, nil, nil, nil, true) }) ctx := context.Background() @@ -623,7 +663,7 @@ var _ = Describe("handleNodeLabeller", func() { ctrl := gomock.NewController(GinkgoT()) kubeClient = mock_client.NewMockClient(ctrl) nodeLabellerHelper = nodelabeller.NewMockNodeLabeller(ctrl) - dcrh = newDeviceConfigReconcilerHelper(kubeClient, nil, nil, nodeLabellerHelper, nil, nil, nil, nil, nil, nil) + dcrh = newDeviceConfigReconcilerHelper(kubeClient, nil, nil, nodeLabellerHelper, nil, nil, nil, nil, nil, nil, true) }) ctx := context.Background() diff --git a/internal/kmmmodule/kmmmodule.go b/internal/kmmmodule/kmmmodule.go index eeb125212..63ffd35bd 100644 --- a/internal/kmmmodule/kmmmodule.go +++ b/internal/kmmmodule/kmmmodule.go @@ -114,6 +114,25 @@ func NewKMMModule(client client.Client, scheme *runtime.Scheme, isOpenShift bool } } +// noOpKMMModule is a no-op implementation of KMMModuleAPI used when KMM watch is disabled +type noOpKMMModule struct{} + +func NewNoOpKMMModule() KMMModuleAPI { + return &noOpKMMModule{} +} + +func (n *noOpKMMModule) SetNodeVersionLabelAsDesired(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig, nodes *v1.NodeList) error { + return nil +} + +func (n *noOpKMMModule) SetBuildConfigMapAsDesired(buildCM *v1.ConfigMap, devConfig *amdv1alpha1.DeviceConfig) error { + return nil +} + +func (n *noOpKMMModule) SetKMMModuleAsDesired(ctx context.Context, mod *kmmv1beta1.Module, devConfig *amdv1alpha1.DeviceConfig, nodes *v1.NodeList) error { + return nil +} + func (km *kmmModule) SetNodeVersionLabelAsDesired(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig, nodes *v1.NodeList) error { // for each selected node // put the KMM version label given by CR's driver version diff --git a/tests/e2e/yamls/charts/gpu-operator/values.yaml 
b/tests/e2e/yamls/charts/gpu-operator/values.yaml index 7d0eb7003..6415b7715 100644 --- a/tests/e2e/yamls/charts/gpu-operator/values.yaml +++ b/tests/e2e/yamls/charts/gpu-operator/values.yaml @@ -1,7 +1,8 @@ node-feature-discovery: enabled: true # Set to false to disable node-feature-discovery kmm: - enabled: true # Set to false to disable kmm + enabled: true # Set to false to skip KMM installation + watch: true # Set to false to disable KMM watching/usage remediation: enabled: true installCRDs: true diff --git a/tests/helm-e2e/helm_e2e_test.go b/tests/helm-e2e/helm_e2e_test.go index a41c38c03..86a0ece5d 100644 --- a/tests/helm-e2e/helm_e2e_test.go +++ b/tests/helm-e2e/helm_e2e_test.go @@ -18,6 +18,7 @@ package e2e import ( "bytes" + "context" "fmt" "os" "os/exec" @@ -1088,3 +1089,57 @@ deviceConfig: s.verifyDefaultDeviceConfig(c, tc.description, tc.expectDefaultCR, tc.expectSpec, tc.verifyFunc) } } + +func (s *E2ESuite) TestHelmInstallWithKMMDisabled(c *C) { + // Test case: install helm chart with kmm.enabled=false and kmm.watch=false + // Verify: + // 1. No KMM controller/webhook deployments are created + // 2. GPU Operator controller deployment exists with KMM_WATCH_ENABLED set to "false" + // 3.
TODO: KMM CRD absence (modules.kmm.sigs.x-k8s.io, nodemodulesconfigs.kmm.sigs.x-k8s.io) is not asserted yet + + logger.Info("Installing helm chart with KMM completely disabled") + s.installHelmChart(c, false, []string{ + "--set", "kmm.enabled=false", + "--set", "kmm.watch=false", + }) + + // Verify 1: No KMM controller/webhook deployments + logger.Info("Verifying KMM deployments are not created") + deploymentList, err := s.clientSet.AppsV1().Deployments(s.ns).List(context.TODO(), v1.ListOptions{ + LabelSelector: "app.kubernetes.io/component=kmm", + }) + assert.NoError(c, err, "Failed to list KMM deployments") + assert.True(c, len(deploymentList.Items) == 0, + "Expected no KMM deployments, but found %d", len(deploymentList.Items)) + + // Verify 2: GPU Operator controller deployment exists and has correct environment variables + logger.Info("Verifying GPU Operator controller deployment exists") + deploymentList, err = s.clientSet.AppsV1().Deployments(s.ns).List(context.TODO(), v1.ListOptions{ + LabelSelector: "control-plane=controller-manager", + }) + assert.NoError(c, err, "Failed to list GPU Operator controller deployments") + assert.True(c, len(deploymentList.Items) > 0, "No GPU Operator controller deployment found") + + deployment := deploymentList.Items[0] + + // Verify KMM_WATCH_ENABLED environment variable is set to "false" + logger.Info("Verifying KMM_WATCH_ENABLED environment variable is set correctly") + found := false + for _, container := range deployment.Spec.Template.Spec.Containers { + if container.Name == "manager" { + for _, env := range container.Env { + if env.Name == "KMM_WATCH_ENABLED" { + found = true + assert.Equal(c, "false", env.Value, + "Expected KMM_WATCH_ENABLED to be 'false', got '%s'", env.Value) + break + } + } + break + } + } + assert.True(c, found, "KMM_WATCH_ENABLED environment variable not found in manager container") + + logger.Info("KMM disabled test passed - cleaning up") + s.uninstallHelmChart(c, false, nil) +}