Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,7 @@ update-version:
sed -i 's|defaultConfigManagerImage.*=.*"docker.io/rocm/device-config-manager:[^"]*"|defaultConfigManagerImage = "docker.io/rocm/device-config-manager:${PROJECT_VERSION}"|' internal/configmanager/configmanager.go
sed -i 's|defaultMetricsExporterImage.*=.*"docker.io/rocm/device-metrics-exporter:[^"]*"|defaultMetricsExporterImage = "docker.io/rocm/device-metrics-exporter:${PROJECT_VERSION}"|' internal/metricsexporter/metricsexporter.go
sed -i 's|defaultTestRunnerImage.*=.*"docker.io/rocm/test-runner:[^"]*"|defaultTestRunnerImage = "docker.io/rocm/test-runner:${PROJECT_VERSION}"|' internal/testrunner/testrunner.go
${MAKE} fmt

.PHONY: manifests
manifests: controller-gen update-registry update-version ## Generate ClusterRole and CustomResourceDefinition objects.
Expand Down
11 changes: 11 additions & 0 deletions hack/k8s-patch/metadata-patch/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,17 @@ remediation:
# -- Settings (image, nodeSelector, affinity) for the remediation workflow controller deployment
controller:
image: "quay.io/argoproj/workflow-controller:v3.6.5"
# -- Node selector for remediation workflow controller deployment
nodeSelector: {}
# -- Deployment affinity configs for remediation workflow controller
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 1
preference:
matchExpressions:
- key: node-role.kubernetes.io/control-plane
operator: Exists
# -- Default NFD rule will detect amd gpu based on pci vendor ID
installdefaultNFDRule: true
# -- CRD will be patched as pre-upgrade/pre-rollback hook when doing helm upgrade/rollback to current helm chart
Expand Down
4 changes: 2 additions & 2 deletions hack/k8s-patch/template-patch/remediation-deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -296,11 +296,11 @@ spec:
labels:
app: amd-gpu-operator-workflow-controller
spec:
{{- with .Values.controllerManager.affinity }}
{{- with .Values.remediation.controller.affinity }}
affinity:
{{- toYaml . | nindent 8 }}
{{- end }}
nodeSelector: {{- toYaml .Values.controllerManager.nodeSelector | nindent 8 }}
nodeSelector: {{- toYaml .Values.remediation.controller.nodeSelector | nindent 8 }}
containers:
- name: workflow-controller
command: [ "workflow-controller" ]
Expand Down
4 changes: 3 additions & 1 deletion helm-charts-k8s/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,9 @@ Kubernetes: `>= 1.29.0-0`
| node-feature-discovery.enabled | bool | `true` | Set to true/false to enable/disable the installation of node feature discovery (NFD) operator |
| node-feature-discovery.worker.nodeSelector | object | `{}` | Set nodeSelector for NFD worker daemonset |
| node-feature-discovery.worker.tolerations | list | `[{"effect":"NoExecute","key":"amd-dcm","operator":"Equal","value":"up"},{"effect":"NoSchedule","key":"amd-gpu-unhealthy","operator":"Exists"}]` | Set tolerations for NFD worker daemonset |
| remediation.controller | object | `{"image":"quay.io/argoproj/workflow-controller:v3.6.5"}` | Set the controller image for remediation workflow controller deployment |
| remediation.controller | object | `{"affinity":{"nodeAffinity":{"preferredDuringSchedulingIgnoredDuringExecution":[{"preference":{"matchExpressions":[{"key":"node-role.kubernetes.io/control-plane","operator":"Exists"}]},"weight":1}]}},"image":"quay.io/argoproj/workflow-controller:v3.6.5","nodeSelector":{}}` | Settings (image, nodeSelector, affinity) for the remediation workflow controller deployment |
| remediation.controller.affinity | object | `{"nodeAffinity":{"preferredDuringSchedulingIgnoredDuringExecution":[{"preference":{"matchExpressions":[{"key":"node-role.kubernetes.io/control-plane","operator":"Exists"}]},"weight":1}]}}` | Deployment affinity configs for remediation workflow controller |
| remediation.controller.nodeSelector | object | `{}` | Node selector for remediation workflow controller deployment |
| remediation.enabled | bool | `true` | Set to true/false to enable/disable the installation of remediation workflow controller |
| remediation.installCRDs | bool | `true` | Set to true/false to enable/disable the installation of Argo CRDs used by the remediation workflow controller |
| upgradeCRD | bool | `true` | CRD will be patched as pre-upgrade/pre-rollback hook when doing helm upgrade/rollback to current helm chart |
Expand Down
4 changes: 2 additions & 2 deletions helm-charts-k8s/templates/remediation-deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -296,11 +296,11 @@ spec:
labels:
app: amd-gpu-operator-workflow-controller
spec:
{{- with .Values.controllerManager.affinity }}
{{- with .Values.remediation.controller.affinity }}
affinity:
{{- toYaml . | nindent 8 }}
{{- end }}
nodeSelector: {{- toYaml .Values.controllerManager.nodeSelector | nindent 8 }}
nodeSelector: {{- toYaml .Values.remediation.controller.nodeSelector | nindent 8 }}
containers:
- name: workflow-controller
command: [ "workflow-controller" ]
Expand Down
11 changes: 11 additions & 0 deletions helm-charts-k8s/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,17 @@ remediation:
# -- Settings (image, nodeSelector, affinity) for the remediation workflow controller deployment
controller:
image: "quay.io/argoproj/workflow-controller:v3.6.5"
# -- Node selector for remediation workflow controller deployment
nodeSelector: {}
# -- Deployment affinity configs for remediation workflow controller
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 1
preference:
matchExpressions:
- key: node-role.kubernetes.io/control-plane
operator: Exists
# -- Default NFD rule will detect amd gpu based on pci vendor ID
installdefaultNFDRule: true
# -- CRD will be patched as pre-upgrade/pre-rollback hook when doing helm upgrade/rollback to current helm chart
Expand Down
14 changes: 14 additions & 0 deletions tests/e2e/yamls/charts/gpu-operator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,20 @@ node-feature-discovery:
enabled: true # Set to false to disable node-feature-discovery
kmm:
enabled: true # Set to false to disable kmm
remediation:
enabled: true
installCRDs: true
controller:
image: "quay.io/argoproj/workflow-controller:v3.6.5"
nodeSelector: {}
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 1
preference:
matchExpressions:
- key: node-role.kubernetes.io/control-plane
operator: Exists
installdefaultNFDRule: true # default NFD rule will detect amd gpu based on pci vendor ID
controllerManager:
manager:
Expand Down
Loading