diff --git a/Makefile b/Makefile index de5c2b406..61c424c00 100644 --- a/Makefile +++ b/Makefile @@ -227,6 +227,7 @@ update-version: sed -i 's|defaultConfigManagerImage.*=.*"docker.io/rocm/device-config-manager:[^"]*"|defaultConfigManagerImage = "docker.io/rocm/device-config-manager:${PROJECT_VERSION}"|' internal/configmanager/configmanager.go sed -i 's|defaultMetricsExporterImage.*=.*"docker.io/rocm/device-metrics-exporter:[^"]*"|defaultMetricsExporterImage = "docker.io/rocm/device-metrics-exporter:${PROJECT_VERSION}"|' internal/metricsexporter/metricsexporter.go sed -i 's|defaultTestRunnerImage.*=.*"docker.io/rocm/test-runner:[^"]*"|defaultTestRunnerImage = "docker.io/rocm/test-runner:${PROJECT_VERSION}"|' internal/testrunner/testrunner.go + ${MAKE} fmt .PHONY: manifests manifests: controller-gen update-registry update-version ## Generate ClusterRole and CustomResourceDefinition objects. diff --git a/hack/k8s-patch/metadata-patch/values.yaml b/hack/k8s-patch/metadata-patch/values.yaml index 975c73441..3e82c8fc5 100644 --- a/hack/k8s-patch/metadata-patch/values.yaml +++ b/hack/k8s-patch/metadata-patch/values.yaml @@ -28,6 +28,17 @@ remediation: # -- Set the controller image for remediation workflow controller deployment controller: image: "quay.io/argoproj/workflow-controller:v3.6.5" + # -- Node selector for remediation workflow controller deployment + nodeSelector: {} + # -- Deployment affinity configs for remediation workflow controller + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 1 + preference: + matchExpressions: + - key: node-role.kubernetes.io/control-plane + operator: Exists # -- Default NFD rule will detect amd gpu based on pci vendor ID installdefaultNFDRule: true # -- CRD will be patched as pre-upgrade/pre-rollback hook when doing helm upgrade/rollback to current helm chart diff --git a/hack/k8s-patch/template-patch/remediation-deployment.yaml b/hack/k8s-patch/template-patch/remediation-deployment.yaml index 6aaca100b..9aaf468cb 100644 --- a/hack/k8s-patch/template-patch/remediation-deployment.yaml +++ b/hack/k8s-patch/template-patch/remediation-deployment.yaml @@ -296,11 +296,11 @@ spec: labels: app: amd-gpu-operator-workflow-controller spec: - {{- with .Values.controllerManager.affinity }} + {{- with .Values.remediation.controller.affinity }} affinity: {{- toYaml . | nindent 8 }} {{- end }} - nodeSelector: {{- toYaml .Values.controllerManager.nodeSelector | nindent 8 }} + nodeSelector: {{- toYaml .Values.remediation.controller.nodeSelector | nindent 8 }} containers: - name: workflow-controller command: [ "workflow-controller" ] diff --git a/helm-charts-k8s/README.md b/helm-charts-k8s/README.md index 918947efa..0ae784a33 100644 --- a/helm-charts-k8s/README.md +++ b/helm-charts-k8s/README.md @@ -267,7 +267,9 @@ Kubernetes: `>= 1.29.0-0` | node-feature-discovery.enabled | bool | `true` | Set to true/false to enable/disable the installation of node feature discovery (NFD) operator | | node-feature-discovery.worker.nodeSelector | object | `{}` | Set nodeSelector for NFD worker daemonset | | node-feature-discovery.worker.tolerations | list | `[{"effect":"NoExecute","key":"amd-dcm","operator":"Equal","value":"up"},{"effect":"NoSchedule","key":"amd-gpu-unhealthy","operator":"Exists"}]` | Set tolerations for NFD worker daemonset | -| remediation.controller | object | `{"image":"quay.io/argoproj/workflow-controller:v3.6.5"}` | Set the controller image for remediation workflow controller deployment | +| remediation.controller | object | `{"affinity":{"nodeAffinity":{"preferredDuringSchedulingIgnoredDuringExecution":[{"preference":{"matchExpressions":[{"key":"node-role.kubernetes.io/control-plane","operator":"Exists"}]},"weight":1}]}},"image":"quay.io/argoproj/workflow-controller:v3.6.5","nodeSelector":{}}` | Set the controller image for remediation workflow controller deployment | +| remediation.controller.affinity | object | `{"nodeAffinity":{"preferredDuringSchedulingIgnoredDuringExecution":[{"preference":{"matchExpressions":[{"key":"node-role.kubernetes.io/control-plane","operator":"Exists"}]},"weight":1}]}}` | Deployment affinity configs for remediation workflow controller | +| remediation.controller.nodeSelector | object | `{}` | Node selector for remediation workflow controller deployment | | remediation.enabled | bool | `true` | Set to true/false to enable/disable the installation of remediation workflow controller | | remediation.installCRDs | bool | `true` | Set to true/false to enable/disable the installation of Argo CRDs used by the remediation workflow controller | | upgradeCRD | bool | `true` | CRD will be patched as pre-upgrade/pre-rollback hook when doing helm upgrade/rollback to current helm chart | diff --git a/helm-charts-k8s/templates/remediation-deployment.yaml b/helm-charts-k8s/templates/remediation-deployment.yaml index 6aaca100b..9aaf468cb 100644 --- a/helm-charts-k8s/templates/remediation-deployment.yaml +++ b/helm-charts-k8s/templates/remediation-deployment.yaml @@ -296,11 +296,11 @@ spec: labels: app: amd-gpu-operator-workflow-controller spec: - {{- with .Values.controllerManager.affinity }} + {{- with .Values.remediation.controller.affinity }} affinity: {{- toYaml . | nindent 8 }} {{- end }} - nodeSelector: {{- toYaml .Values.controllerManager.nodeSelector | nindent 8 }} + nodeSelector: {{- toYaml .Values.remediation.controller.nodeSelector | nindent 8 }} containers: - name: workflow-controller command: [ "workflow-controller" ] diff --git a/helm-charts-k8s/values.yaml b/helm-charts-k8s/values.yaml index 975c73441..3e82c8fc5 100644 --- a/helm-charts-k8s/values.yaml +++ b/helm-charts-k8s/values.yaml @@ -28,6 +28,17 @@ remediation: # -- Set the controller image for remediation workflow controller deployment controller: image: "quay.io/argoproj/workflow-controller:v3.6.5" + # -- Node selector for remediation workflow controller deployment + nodeSelector: {} + # -- Deployment affinity configs for remediation workflow controller + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 1 + preference: + matchExpressions: + - key: node-role.kubernetes.io/control-plane + operator: Exists # -- Default NFD rule will detect amd gpu based on pci vendor ID installdefaultNFDRule: true # -- CRD will be patched as pre-upgrade/pre-rollback hook when doing helm upgrade/rollback to current helm chart diff --git a/tests/e2e/yamls/charts/gpu-operator/values.yaml b/tests/e2e/yamls/charts/gpu-operator/values.yaml index 9fcd6207d..7d0eb7003 100644 --- a/tests/e2e/yamls/charts/gpu-operator/values.yaml +++ b/tests/e2e/yamls/charts/gpu-operator/values.yaml @@ -2,6 +2,20 @@ node-feature-discovery: enabled: true # Set to false to disable node-feature-discovery kmm: enabled: true # Set to false to disable kmm +remediation: + enabled: true + installCRDs: true + controller: + image: "quay.io/argoproj/workflow-controller:v3.6.5" + nodeSelector: {} + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 1 + preference: + matchExpressions: + - key: node-role.kubernetes.io/control-plane + operator: Exists installdefaultNFDRule: true # default NFD rule will detect amd gpu based on pci vendor ID controllerManager: manager: