Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ helm install amd-gpu-operator rocm/gpu-operator-charts \
Installation Options
- Skip NFD installation: `--set node-feature-discovery.enabled=false`
- Skip KMM installation: `--set kmm.enabled=false`
- Disable GPU operator watching and using KMM resources: `--set kmm.watch=false`
- Skip Auto Node Remediation: `--set remediation.enabled=false`
- Enable DRA driver (instead of device plugin): `--set deviceConfig.spec.draDriver.enable=true --set deviceConfig.spec.devicePlugin.enableDevicePlugin=false`
- Disable DeviceClass creation: `--set draDriver.deviceClass.create=false`
Expand Down
25 changes: 22 additions & 3 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ package main

import (
"flag"
"os"
"strconv"

workflowv1alpha1 "github.com/argoproj/argo-workflows/v3/pkg/apis/workflow/v1alpha1"
kmmv1beta1 "github.com/rh-ecosystem-edge/kernel-module-management/api/v1beta1"
Expand Down Expand Up @@ -98,7 +100,18 @@ func main() {

setupLogger := logger.WithName("setup")

setupLogger.Info("Creating manager", "version", Version, "git commit", GitCommit, "build tag", BuildTag)
// Read KMM watch configuration from environment variable
kmmWatchEnabled := true // default to true for backward compatibility
if kmmWatchEnv := os.Getenv("KMM_WATCH_ENABLED"); kmmWatchEnv != "" {
var err error
kmmWatchEnabled, err = strconv.ParseBool(kmmWatchEnv)
if err != nil {
setupLogger.Error(err, "invalid KMM_WATCH_ENABLED value, defaulting to true", "value", kmmWatchEnv)
kmmWatchEnabled = true
}
}

setupLogger.Info("Creating manager", "version", Version, "git commit", GitCommit, "build tag", BuildTag, "kmmWatchEnabled", kmmWatchEnabled)

setupLogger.Info("Parsing configuration file", "path", configFile)

Expand All @@ -118,7 +131,12 @@ func main() {
// Use manager's client, it may read from a cache.
client := mgr.GetClient()
isOpenShift := utils.IsOpenShift(setupLogger)
kmmHandler := kmmmodule.NewKMMModule(client, scheme, isOpenShift)
var kmmHandler kmmmodule.KMMModuleAPI
if kmmWatchEnabled {
kmmHandler = kmmmodule.NewKMMModule(client, scheme, isOpenShift)
} else {
kmmHandler = kmmmodule.NewNoOpKMMModule()
}
dpHandler := plugin.NewDevicePlugin(client, scheme, isOpenShift)
nlHandler := nodelabeller.NewNodeLabeller(scheme, isOpenShift)
metricsHandler := metricsexporter.NewMetricsExporter(scheme)
Expand All @@ -135,7 +153,8 @@ func main() {
testrunnerHandler,
configmanagerHandler,
workerMgr,
isOpenShift)
isOpenShift,
kmmWatchEnabled)
if err = dcr.SetupWithManager(mgr); err != nil {
cmd.FatalError(setupLogger, err, "unable to create controller", "name", controllers.DeviceConfigReconcilerName)
}
Expand Down
8 changes: 7 additions & 1 deletion docs/installation/kubernetes-helm.md
Original file line number Diff line number Diff line change
Expand Up @@ -126,8 +126,14 @@ helm install amd-gpu-operator rocm/gpu-operator-charts \
Installation Options
- Skip NFD installation: `--set node-feature-discovery.enabled=false`
- Skip KMM installation: `--set kmm.enabled=false` <br> Although KMM is a [Kubernetes-SIGs](https://github.com/kubernetes-sigs) maintained project, it is strongly recommended to use AMD optimized and published KMM images included in each operator release.
- Disable GPU operator watching KMM resources: `--set kmm.watch=false`
- Skip Auto Node Remediation: `--set remediation.enabled=false`
- Disable default DeviceConfig installation: `--set crds.defaultCR.install=false`

**KMM Configuration Examples:**
- Use existing KMM installation: `--set kmm.enabled=false --set kmm.watch=true`
- Skip KMM for alternative driver management: `--set kmm.enabled=false --set kmm.watch=false`
- Install KMM without GPU operator using it: `--set kmm.enabled=true --set kmm.watch=false`
```

```{tip}
Expand Down Expand Up @@ -167,6 +173,7 @@ The following parameters can be configured when using the Helm Chart. In
| controllerManager.nodeSelector | object | `{}` | Node selector for AMD GPU operator controller manager deployment |
| installdefaultNFDRule | bool | `true` | Set to true to install default NFD rule for detecting AMD GPU hardware based on pci vendor ID and device ID |
| kmm.enabled | bool | `true` | Set to true/false to enable/disable the installation of kernel module management (KMM) operator |
| kmm.watch | bool | `true` | Set to true/false to enable/disable GPU operator watching and using KMM resources |
| node-feature-discovery.enabled | bool | `true` | Set to true/false to enable/disable the installation of node feature discovery (NFD) operator |
| upgradeCRD | bool | `true` | CRD will be patched as pre-upgrade/pre-rollback hook when doing helm upgrade/rollback to current helm chart |
| kmm.controller.affinity | object | `{"nodeAffinity":{"preferredDuringSchedulingIgnoredDuringExecution":[{"preference":{"matchExpressions":[{"key":"node-role.kubernetes.io/control-plane","operator":"Exists"}]},"weight":1}]}}` | Affinity for the KMM controller manager deployment |
Expand Down Expand Up @@ -206,7 +213,6 @@ The following parameters can be configured when using the Helm Chart. In
| kmm.kubernetesClusterDomain | string | `"cluster.local"` | |
| kmm.managerConfig.controllerConfigYaml | string | `"healthProbeBindAddress: :8081\nwebhookPort: 9443\nleaderElection:\n enabled: true\n resourceID: kmm.sigs.x-k8s.io\nmetrics:\n enableAuthnAuthz: true\n bindAddress: 0.0.0.0:8443\n secureServing: true\nworker:\n runAsUser: 0\n seLinuxType: spc_t\n firmwareHostPath: /var/lib/firmware"` | |
| kmm.webhookServer.affinity | object | `{"nodeAffinity":{"preferredDuringSchedulingIgnoredDuringExecution":[{"preference":{"matchExpressions":[{"key":"node-role.kubernetes.io/control-plane","operator":"Exists"}]},"weight":1}]}}` | KMM webhook's deployment affinity configs |
| kmm.enabled | bool | `true` | Set to true/false to enable/disable the installation of kernel module management (KMM) operator |
| kmm.webhookServer.nodeAffinity.nodeSelectorTerms | list | `[{"key":"node-role.kubernetes.io/control-plane","operator":"Exists"},{"key":"node-role.kubernetes.io/master","operator":"Exists"}]` | Node affinity selector terms config for the KMM webhook deployment, set it to [] if you want to make affinity config empty |
| kmm.webhookServer.nodeSelector | object | `{}` | KMM webhook's deployment node selector |
| kmm.webhookServer.replicas | int | `1` | |
Expand Down
12 changes: 12 additions & 0 deletions docs/releasenotes.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,17 @@
# Release Notes

## GPU Operator v1.5.0 Release Notes

### Release Highlights

- **Enhanced KMM (Kernel Module Management) Configuration Control**
- **Independent Control of KMM Installation and Usage**
- New helm parameters provide separate control over KMM installation and resource watching:
- `kmm.enabled`: Controls KMM subchart installation (default: `true`)
- `kmm.watch`: Controls GPU operator watching KMM resources (default: `true`)
- Supports multiple deployment scenarios: use an existing KMM installation (`enabled=false, watch=true`), skip KMM entirely for alternative driver solutions (`enabled=false, watch=false`), or install KMM without having the GPU Operator use it (`enabled=true, watch=false`)
- Fully backward compatible: existing configurations with `kmm.enabled=false` continue to work without changes

## GPU Operator v1.4.1 Release Notes

The AMD GPU Operator v1.4.1 release extends platform support to OpenShift v4.20 and Debian 12, and introduces the ability to build `amdgpu` kernel modules directly within air-gapped OpenShift clusters.
Expand Down
4 changes: 3 additions & 1 deletion hack/k8s-patch/metadata-patch/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,10 @@ node-feature-discovery:
nodeSelector: {}
# KMM related configs
kmm:
# -- Set to true/false to enable/disable the installation of kernel module management (KMM) operator
# -- Set to true/false to enable/disable the installation of kernel module management (KMM) operator subchart
enabled: true
# -- Set to true/false to enable/disable GPU operator watching and using KMM resources
watch: true
# Remediation related configs
remediation:
# -- Set to true/false to enable/disable the installation of remediation workflow controller
Expand Down
2 changes: 2 additions & 0 deletions hack/k8s-patch/template-patch/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ spec:
value: {{ quote .Values.kubernetesClusterDomain }}
- name: SIM_ENABLE
value: {{ quote .Values.controllerManager.env.simEnable }}
- name: KMM_WATCH_ENABLED
value: {{ quote .Values.kmm.watch }}
image: {{ .Values.controllerManager.manager.image.repository }}:{{ .Values.controllerManager.manager.image.tag
| default .Chart.AppVersion }}
imagePullPolicy: {{ .Values.controllerManager.manager.imagePullPolicy }}
Expand Down
4 changes: 3 additions & 1 deletion helm-charts-k8s/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ helm install amd-gpu-operator rocm/gpu-operator-charts \
Installation Options
- Skip NFD installation: `--set node-feature-discovery.enabled=false`
- Skip KMM installation: `--set kmm.enabled=false`
- Disable GPU operator watching and using KMM resources: `--set kmm.watch=false`
- Skip Auto Node Remediation: `--set remediation.enabled=false`
- Enable DRA driver (instead of device plugin): `--set deviceConfig.spec.draDriver.enable=true --set deviceConfig.spec.devicePlugin.enableDevicePlugin=false`
- Disable DeviceClass creation: `--set draDriver.deviceClass.create=false`
Expand Down Expand Up @@ -266,7 +267,8 @@ Kubernetes: `>= 1.29.0-0`
| deviceConfig.spec.testRunner.upgradePolicy.upgradeStrategy | string | `"RollingUpdate"` | the type of daemonset upgrade, RollingUpdate or OnDelete |
| draDriver.deviceClass.create | bool | `true` | Create the gpu.amd.com DeviceClass resource. Set to false if managing the DRA driver independently. |
| installdefaultNFDRule | bool | `true` | Default NFD rule will detect amd gpu based on pci vendor ID |
| kmm.enabled | bool | `true` | Set to true/false to enable/disable the installation of kernel module management (KMM) operator |
| kmm.enabled | bool | `true` | Set to true/false to enable/disable the installation of kernel module management (KMM) operator subchart |
| kmm.watch | bool | `true` | Set to true/false to enable/disable GPU operator watching and using KMM resources |
| node-feature-discovery.enabled | bool | `true` | Set to true/false to enable/disable the installation of node feature discovery (NFD) operator |
| node-feature-discovery.worker.nodeSelector | object | `{}` | Set nodeSelector for NFD worker daemonset |
| node-feature-discovery.worker.tolerations | list | `[{"effect":"NoExecute","key":"amd-dcm","operator":"Equal","value":"up"},{"effect":"NoSchedule","key":"amd-gpu-unhealthy","operator":"Exists"}]` | Set tolerations for NFD worker daemonset |
Expand Down
2 changes: 2 additions & 0 deletions helm-charts-k8s/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ spec:
value: {{ quote .Values.kubernetesClusterDomain }}
- name: SIM_ENABLE
value: {{ quote .Values.controllerManager.env.simEnable }}
- name: KMM_WATCH_ENABLED
value: {{ quote .Values.kmm.watch }}
image: {{ .Values.controllerManager.manager.image.repository }}:{{ .Values.controllerManager.manager.image.tag
| default .Chart.AppVersion }}
imagePullPolicy: {{ .Values.controllerManager.manager.imagePullPolicy }}
Expand Down
4 changes: 3 additions & 1 deletion helm-charts-k8s/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,10 @@ node-feature-discovery:
nodeSelector: {}
# KMM related configs
kmm:
# -- Set to true/false to enable/disable the installation of kernel module management (KMM) operator
# -- Set to true/false to enable/disable the installation of kernel module management (KMM) operator subchart
enabled: true
# -- Set to true/false to enable/disable GPU operator watching and using KMM resources
watch: true
# Remediation related configs
remediation:
# -- Set to true/false to enable/disable the installation of remediation workflow controller
Expand Down
Loading
Loading