From f62c18418d789d358f963ab31c9db3d9b3d56a67 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 12 May 2026 17:16:33 +0000 Subject: [PATCH] Sync kubex charts from automation-controller main @ d8bde98 --- charts/kubex-automation-engine/README.md | 5 +- .../docs/Automation-Strategies.md | 2 + .../docs/Cluster-Automation-Strategies.md | 2 + .../docs/Configuration-Reference.md | 1 + .../docs/GPU-Sharing-with-KAI.md | 175 ++++++ .../docs/Global-Configuration.md | 2 + .../docs/Safety-Controls.md | 4 + .../docs/gpu-consolidation-policy.md | 86 +++ .../docs/gpu-rebalancing-policies.md | 146 +++++ .../templates/clusterautomationstrategy.yaml | 38 ++ .../templates/deployment.yaml | 2 + .../templates/globalconfiguration.yaml | 8 +- .../templates/kai-queues.yaml | 47 ++ .../templates/policyevaluation.yaml | 4 + .../templates/role.yaml | 9 + .../templates/validatingwebhook.yaml | 70 +++ charts/kubex-automation-engine/values.yaml | 37 ++ ...tsizing.kubex.ai_automationstrategies.yaml | 108 ++++ ....kubex.ai_clusterautomationstrategies.yaml | 108 ++++ ....ai_clusterwidegpurebalancingpolicies.yaml | 501 ++++++++++++++++++ ...tsizing.kubex.ai_globalconfigurations.yaml | 44 ++ ...ing.kubex.ai_gpuconsolidationpolicies.yaml | 208 ++++++++ ...izing.kubex.ai_gpurebalancingpolicies.yaml | 476 +++++++++++++++++ 23 files changed, 2080 insertions(+), 3 deletions(-) create mode 100644 charts/kubex-automation-engine/docs/GPU-Sharing-with-KAI.md create mode 100644 charts/kubex-automation-engine/docs/gpu-consolidation-policy.md create mode 100644 charts/kubex-automation-engine/docs/gpu-rebalancing-policies.md create mode 100644 charts/kubex-automation-engine/templates/kai-queues.yaml create mode 100644 charts/kubex-crds/templates/rightsizing.kubex.ai_clusterwidegpurebalancingpolicies.yaml create mode 100644 charts/kubex-crds/templates/rightsizing.kubex.ai_gpuconsolidationpolicies.yaml create mode 100644 
charts/kubex-crds/templates/rightsizing.kubex.ai_gpurebalancingpolicies.yaml diff --git a/charts/kubex-automation-engine/README.md b/charts/kubex-automation-engine/README.md index becc601..05532df 100644 --- a/charts/kubex-automation-engine/README.md +++ b/charts/kubex-automation-engine/README.md @@ -36,8 +36,8 @@ The Helm chart supports both Helm-managed configuration and manually managed cus Important: -- The Helm-managed `scope` and `policy.policies` values preserve the existing values-driven flow from `values-edit.yaml` by generating `AutomationStrategy` and `ClusterProactivePolicy`, but those CRs can also be created and managed independently of Helm -- `ProactivePolicy`, `StaticPolicy`, `ClusterStaticPolicy`, and `ClusterAutomationStrategy` are supported by the controller but are managed as separate CR manifests today +- The Helm-managed `scope` and `policy.policies` values preserve the existing values-driven flow from `values-edit.yaml` by generating `ClusterAutomationStrategy` and `ClusterProactivePolicy`, but those CRs can also be created and managed independently of Helm +- `ProactivePolicy`, `StaticPolicy`, `ClusterStaticPolicy`, and namespaced `AutomationStrategy` are supported by the controller but are managed as separate CR manifests today ## Core Components @@ -101,6 +101,7 @@ This guide covers: | **[Global Configuration Reference](./docs/Global-Configuration.md)** | Field-by-field reference for the `GlobalConfiguration` custom resource | | **[Policy Configuration](./docs/Policy-Configuration.md)** | Configure strategies, policy scope, precedence, and Helm-managed policy generation | | **[Policy Evaluation Reference](./docs/Policy-Evaluation.md)** | Policy type precedence configuration via the `PolicyEvaluation` singleton | +| **[GPU Sharing with KAI](./docs/GPU-Sharing-with-KAI.md)** | Configure KAI-backed GPU sharing, rebalancing, and early consolidation | | **[Apply Updates](./docs/Getting-Started.md#apply-configuration-updates)** | Re-run `helm 
upgrade` after configuration changes | ## Advanced Topics diff --git a/charts/kubex-automation-engine/docs/Automation-Strategies.md b/charts/kubex-automation-engine/docs/Automation-Strategies.md index 50b96d5..22140f2 100644 --- a/charts/kubex-automation-engine/docs/Automation-Strategies.md +++ b/charts/kubex-automation-engine/docs/Automation-Strategies.md @@ -1,5 +1,7 @@ # Automation Strategies +> Experimental: GPU/KAI-related fields in this resource are subject to breaking changes. When using them, set `spec.experimental.gpuKaiContract: v1alpha1-2026-04`. + `AutomationStrategy` defines how resizing is allowed to happen within a namespace. Use it when a team owns its own namespace and should manage resize behavior locally. diff --git a/charts/kubex-automation-engine/docs/Cluster-Automation-Strategies.md b/charts/kubex-automation-engine/docs/Cluster-Automation-Strategies.md index 1c42178..84df302 100644 --- a/charts/kubex-automation-engine/docs/Cluster-Automation-Strategies.md +++ b/charts/kubex-automation-engine/docs/Cluster-Automation-Strategies.md @@ -1,5 +1,7 @@ # Cluster Automation Strategies +> Experimental: GPU/KAI-related fields in this resource are subject to breaking changes. When using them, set `spec.experimental.gpuKaiContract: v1alpha1-2026-04`. + `ClusterAutomationStrategy` defines how resizing is allowed to happen for cluster-scoped policy flows. Use it when a platform team wants one reusable resize behavior that can be referenced by `ClusterProactivePolicy` and `ClusterStaticPolicy` across multiple namespaces. 
diff --git a/charts/kubex-automation-engine/docs/Configuration-Reference.md b/charts/kubex-automation-engine/docs/Configuration-Reference.md index 95c355b..2a391be 100644 --- a/charts/kubex-automation-engine/docs/Configuration-Reference.md +++ b/charts/kubex-automation-engine/docs/Configuration-Reference.md @@ -151,6 +151,7 @@ Use [Global Configuration Reference](./Global-Configuration.md) for the CR field | `globalConfiguration.webhookProbe.resources` | `{}` | Resource requests and limits for the dry-run webhook probe container | | `globalConfiguration.webhookProbe.podSecurityContext` | `{}` | Pod security context for the dry-run webhook probe Pod | | `globalConfiguration.webhookProbe.securityContext` | `{}` | Container security context for the dry-run webhook probe container | +| `experimental.gpuKaiContract` | `v1alpha1-2026-04` | Required acknowledgement token for experimental GPU/KAI CR fields rendered by the chart | ## Helm-Managed Policy Values diff --git a/charts/kubex-automation-engine/docs/GPU-Sharing-with-KAI.md b/charts/kubex-automation-engine/docs/GPU-Sharing-with-KAI.md new file mode 100644 index 0000000..895204b --- /dev/null +++ b/charts/kubex-automation-engine/docs/GPU-Sharing-with-KAI.md @@ -0,0 +1,175 @@ +# GPU Sharing with KAI + +This guide shows how to configure GPU sharing with KAI and Kubex Automation Engine. + +Tested with KAI `v0.12.16`. + +> [!IMPORTANT] +> GPU/KAI fields and related custom resources are experimental and subject to breaking changes. Set `spec.experimental.gpuKaiContract: v1alpha1-2026-04` on GPU/KAI resources. 
+ +## Prerequisites + +- KAI is already installed in the cluster +- `kubex-crds` and `kubex-automation-engine` are already installed +- Prometheus is available for GPU utilization metrics if you want to use `GpuRebalancingPolicy` + +This guide works with either: + +- a new KAI installation +- an existing KAI installation + +For existing KAI-managed workloads, Kubex Automation Engine can update the `gpu-fraction` annotation without replacing the existing `kai.scheduler/queue` label. + +## Starter Example + +The following example creates: + +- an `AutomationStrategy` for KAI-enabled workloads in namespace `ml-team-a` +- a `StaticPolicy` that sets an initial shared GPU request for matching `Deployment` workloads +- a `GpuRebalancingPolicy` that adjusts that shared GPU request based on Prometheus GPU metrics + +Both policies target `Deployment` workloads in a specific namespace that carry `nvidia.com/gpu.present: "true"`. + +```yaml +apiVersion: rightsizing.kubex.ai/v1alpha1 +kind: AutomationStrategy +metadata: + name: kai-gpu-sharing + namespace: ml-team-a +spec: + experimental: + gpuKaiContract: v1alpha1-2026-04 + enablement: + gpu: + overrideScheduler: "kai" + requests: + downsize: true + upsize: true + setFromUnspecified: false + kai: + queue: kubex-unlimited-gpu-queue + setQueueWhenSpecified: false + inPlaceResize: + enabled: false + podEviction: + enabled: true +--- +apiVersion: rightsizing.kubex.ai/v1alpha1 +kind: StaticPolicy +metadata: + name: kai-gpu-sharing-baseline + namespace: ml-team-a +spec: + scope: + labelSelector: + matchLabels: + nvidia.com/gpu.present: "true" + workloadTypes: + - Deployment + resources: + containers: + "*": + requests: + gpu: "0.25" + automationStrategyRef: + name: kai-gpu-sharing +--- +apiVersion: rightsizing.kubex.ai/v1alpha1 +kind: GpuRebalancingPolicy +metadata: + name: kai-gpu-sharing-rebalancing + namespace: ml-team-a +spec: + experimental: + gpuKaiContract: v1alpha1-2026-04 + scope: + labelSelector: + matchLabels: + 
nvidia.com/gpu.present: "true" + workloadTypes: + - Deployment + minPodMetricsAge: 15m + metrics: + compute: + upsize: + thresholdPercent: 125 + metricsWindow: 10m + headroomPercent: 20 + maxPercent: 200 + scaleBack: + thresholdPercent: 60 + metricsWindow: 10m + headroomPercent: 20 + prometheus: + metric: kubex_gpu_container_compute_utilization_percent + namespaceLabel: namespace + podLabel: pod + containerLabel: container + memory: + upsize: + thresholdPercent: 125 + metricsWindow: 10m + headroomPercent: 20 + maxPercent: 200 + scaleBack: + thresholdPercent: 60 + metricsWindow: 10m + headroomPercent: 20 + prometheus: + metric: kubex_gpu_container_memory_utilization_percent + namespaceLabel: namespace + podLabel: pod + containerLabel: container + automationStrategyRef: + name: kai-gpu-sharing +``` + +## Automation Strategy Notes + +For KAI-enabled workloads, start with `spec.inPlaceResize.enabled: false`. + +- Eviction-based resize is the safer path today for KAI-enabled workloads. +- In-place resizing for KAI-enabled workloads can be experimented with, but it is currently unstable. + +## Existing KAI Installations + +For workloads that are already scheduled through KAI: + +- keep the existing `kai.scheduler/queue` label on the workload template +- let Kubex Automation Engine update `gpu-fraction` as policies are applied + +That allows Kubex Automation Engine to participate in GPU sharing without taking over queue assignment. + +If you want queue assignment to be done via Kubex, set `spec.kai.setQueueWhenSpecified: true` in your AutomationStrategy. + +## GPU Node Consolidation + +`GpuConsolidationPolicy` can be used to consolidate KAI GPU workloads onto fewer GPU nodes. 
+ +Example targeting a specific worker pool: + +```yaml +apiVersion: rightsizing.kubex.ai/v1alpha1 +kind: GpuConsolidationPolicy +metadata: + name: kai-gpu-workers-a +spec: + experimental: + gpuKaiContract: v1alpha1-2026-04 + nodeSelector: + matchLabels: + nodepool: gpu-workers-a + utilizationThresholdPercent: 70 + requeueAfter: 2m +``` + +## Consolidation Limitations + +GPU node consolidation is very early and has known limitations. + +- It assumes pods will be schedulable on other nodes if they fit by GPU fraction. +- It does not yet fully model all other scheduler constraints. +- That can lead to frequent evictions when the controller chooses a node that looks drainable from GPU capacity alone but cannot actually be rescheduled cleanly. +- It may behave unpredictably with nodes that have multiple GPUs. + +Use it carefully and start with a narrowly scoped worker pool. diff --git a/charts/kubex-automation-engine/docs/Global-Configuration.md b/charts/kubex-automation-engine/docs/Global-Configuration.md index 35108e1..d44ed90 100644 --- a/charts/kubex-automation-engine/docs/Global-Configuration.md +++ b/charts/kubex-automation-engine/docs/Global-Configuration.md @@ -1,5 +1,7 @@ # Global Configuration +> Experimental: GPU/KAI-related fields in this resource are subject to breaking changes. When using them, set `spec.experimental.gpuKaiContract: v1alpha1-2026-04`. + `GlobalConfiguration` defines cluster-wide controller behavior that applies across strategies and policies. Use it to control recommendation refresh timing, proactive rescans, heartbeat reporting, global automation switches, protected namespaces, and webhook health thresholds. 
diff --git a/charts/kubex-automation-engine/docs/Safety-Controls.md b/charts/kubex-automation-engine/docs/Safety-Controls.md index e08fc40..318a0a9 100644 --- a/charts/kubex-automation-engine/docs/Safety-Controls.md +++ b/charts/kubex-automation-engine/docs/Safety-Controls.md @@ -51,6 +51,8 @@ How to interpret `retry` in the context of this document: | Runtime name | Stage | Controlled by | Behavior | Targets / metadata | Typical message | | --- | --- | --- | --- | --- | --- | | `no-resize-needed` | plan build | always on | Marks resources where desired equals current and no action is needed | target list only | n/a (summary marker) | +| `floor-clamped` | plan build | `spec.enablement.*.(requests|limits).floor` | Clamps a desired value up to the configured floor before execution | filter metadata includes `value`, `originalDesired`, `clampedDesired`, and `rule` | n/a (summary marker) | +| `ceiling-clamped` | plan build | `spec.enablement.*.(requests|limits).ceiling` | Clamps a desired value down to the configured ceiling before execution | filter metadata includes `value`, `originalDesired`, `clampedDesired`, and `rule` | n/a (summary marker) | | `automation-strategy-disabled` | action filter | `spec.enablement.*` fields in the referenced strategy | Removes actions disallowed by direction such as `upsize`, `downsize`, or `setFromUnspecified` | filter metadata may include `direction` | `upsize disabled`, `downsize disabled`, or `setFromUnspecified disabled` | | `change-below-threshold` | pod action filter | `spec.safetyChecks.minCpuChangePercent`, `spec.safetyChecks.minMemoryChangePercent` | Removes actions whose percent delta is below threshold | target list only | `delta ... 
below minimum ...` | | `hpa-resource-managed` | pod action filter | `spec.safetyChecks.enableHpaFilter` | Removes actions for CPU or memory managed by a matching HPA, including KEDA-managed HPA handling | filter metadata may include `source=hpa` and `hpaMode` | `HPA targets ` | @@ -80,6 +82,7 @@ Notes: - `failedChecks` contains check failures with `name`, optional `message`, and optional `metadata`. - `appliedFilters` contains pruned actions with `name`, optional filter `metadata`, and `targets` with `container`, `usage`, and `resource`. +- Bounds enforcement also appears in `appliedFilters`: `floor-clamped` and `ceiling-clamped` indicate the controller adjusted the desired value before deciding whether any resize action still remained. - `pause-active` sets `failedChecks[].metadata.scope` to `pod` or `namespace`; namespace pauses also include `failedChecks[].metadata.namespace`. Example interpretation: @@ -87,6 +90,7 @@ Example interpretation: - `{"name":"min-ready-duration-not-met","message":"pod not ready"}` in `failedChecks` means execution is blocked for now and retried. - `{"name":"resource-quota-exceeded","message":"resource quota exceeded (ResourceQuota/team-quota)","metadata":{"name":"team-quota"}}` in `failedChecks` identifies the specific blocking quota. - `{"name":"hpa-resource-managed","targets":[{"container":"app","usage":"requests","resource":"cpu"}]}` in `appliedFilters` means that resize action was removed because HPA controls that resource. +- `{"name":"floor-clamped","metadata":{"value":"200Mi","originalDesired":"128Mi","clampedDesired":"200Mi","rule":"AutomationStrategy/example-rule"},"targets":[{"container":"app","usage":"requests","resource":"memory"}]}` in `appliedFilters` means the recommendation was raised to the strategy floor before execution. 
## Related Guides diff --git a/charts/kubex-automation-engine/docs/gpu-consolidation-policy.md b/charts/kubex-automation-engine/docs/gpu-consolidation-policy.md new file mode 100644 index 0000000..72aaa47 --- /dev/null +++ b/charts/kubex-automation-engine/docs/gpu-consolidation-policy.md @@ -0,0 +1,86 @@ +# GPU Consolidation Policy + +> Experimental: GPU/KAI fields and related custom resources are subject to breaking changes. Set `spec.experimental.gpuKaiContract: v1alpha1-2026-04`. + +`GpuConsolidationPolicy` is a cluster-scoped controller that looks at scheduled pods carrying the `gpu-fraction` annotation and tries to consolidate them off an underutilized node. + +## Behavior + +- The controller scans all scheduled, non-terminal pods with `metadata.annotations["gpu-fraction"]`. +- `spec.nodeSelector` is required and uses standard Kubernetes label selector semantics. +- Each policy defines one compatibility pool. Create multiple policies when you need multiple compatible node pools. +- Only nodes selected by `spec.nodeSelector` are considered compatible for candidate selection and destination placement. +- Selected nodes are expected to be mutually compatible for GPU workload movement. +- Node GPU capacity is taken from `status.allocatable["nvidia.com/gpu"]`. +- Nodes with utilization below `spec.utilizationThresholdPercent` are candidates, but nodes with no GPU-fraction pods are ignored. +- Candidates are evaluated from most underutilized to least underutilized. +- A node is consolidated only when every GPU-fraction pod on that node can fit onto other non-empty GPU nodes without exceeding their allocatable capacity. +- The controller evicts all pods from the first drainable candidate node it finds in a reconcile loop. +- Eviction is node-wide for a selected consolidation candidate: once a node is marked for consolidation, every evictable pod on that node is targeted, including pods without workload owners such as static pods. 
+- Reconciliation is policy-driven: the controller runs on `GpuConsolidationPolicy` changes and on the periodic timer from `spec.requeueAfter`. +- Pod and Node changes do not trigger immediate rescans. +- If no node can be fully drained, the controller records that outcome in status and waits for the next `spec.requeueAfter`. + +## Examples + +```yaml +apiVersion: rightsizing.kubex.ai/v1alpha1 +kind: GpuConsolidationPolicy +metadata: + name: gpu-consolidation-pool-a +spec: + experimental: + gpuKaiContract: v1alpha1-2026-04 + nodeSelector: + matchLabels: + kubex.ai/gpu-pool: pool-a + utilizationThresholdPercent: 75 + requeueAfter: 1m +``` + +Use one policy per compatibility pool: + +```yaml +apiVersion: rightsizing.kubex.ai/v1alpha1 +kind: GpuConsolidationPolicy +metadata: + name: gpu-consolidation-l40s +spec: + experimental: + gpuKaiContract: v1alpha1-2026-04 + nodeSelector: + matchExpressions: + - key: kubex.ai/gpu-pool + operator: In + values: + - batch-l40s + - key: accelerator.nvidia.com/class + operator: In + values: + - l40s + utilizationThresholdPercent: 70 + requeueAfter: 2m +--- +apiVersion: rightsizing.kubex.ai/v1alpha1 +kind: GpuConsolidationPolicy +metadata: + name: gpu-consolidation-h100 +spec: + experimental: + gpuKaiContract: v1alpha1-2026-04 + nodeSelector: + matchLabels: + kubex.ai/gpu-pool: training-h100 + utilizationThresholdPercent: 80 + requeueAfter: 1m +``` + +## Notes + +- This policy is cluster-scoped only. +- `spec.nodeSelector` is the compatibility boundary for consolidation. +- It is self-contained and does not reference `AutomationStrategy`. +- Consolidation is based on GPU-fraction capacity only; it does not model CPU, memory, or scheduler affinity constraints. +- Consolidation drain behavior is not limited to GPU-fraction pods. After a node is selected, the node is drained by evicting all evictable pods on it, even when some of those pods do not have owners. 
+- If `spec.nodeSelector` matches no nodes, the policy reports `NoMatchingNodeSelector` and performs no evictions. +- If you need faster reaction to workload churn, lower `spec.requeueAfter`. diff --git a/charts/kubex-automation-engine/docs/gpu-rebalancing-policies.md b/charts/kubex-automation-engine/docs/gpu-rebalancing-policies.md new file mode 100644 index 0000000..0baf6ce --- /dev/null +++ b/charts/kubex-automation-engine/docs/gpu-rebalancing-policies.md @@ -0,0 +1,146 @@ +# GPU Rebalancing Policies + +> Experimental: GPU/KAI fields and related custom resources are subject to breaking changes. Set `spec.experimental.gpuKaiContract: v1alpha1-2026-04`. + +`GpuRebalancingPolicy` and `ClusterWideGpuRebalancingPolicy` emit GPU rebalancing recommendations (upsize and downsize) from Prometheus utilization. + +## Behavior + +- Baseline is the live GPU allocation before first upsize and is persisted per container. +- Pod metrics are considered only after the pod age reaches `spec.minPodMetricsAge` (default `15m`). +- The policy evaluates two required GPU signals: `spec.metrics.compute` and `spec.metrics.memory`. +- Each metric is evaluated from per-pod aggregate GPU usage: the controller sums inferred GPU usage across all GPU containers in the pod, and compares that pod total against the summed current GPU allocation for those same containers. +- Upsize evaluation is pod-level per metric: if any eligible pod total exceeds that metric's threshold, that metric can request an upsize. +- Scale-back evaluation is owner-wide per metric: all eligible pod totals must stay below `currentAllocation * (spec.metrics.<metric>.scaleBack.thresholdPercent/100)` over `spec.metrics.<metric>.scaleBack.metricsWindow`, and every included container in those pods must have samples. +- `spec.metrics.<metric>.upsize.metricsWindow` and `spec.metrics.<metric>.scaleBack.metricsWindow` must be at least `1m`. +- Pods without `status.startTime` are treated as ineligible for metric checks. 
+- Containers missing a current GPU allocation are excluded from aggregate math and omitted from the emitted recommendation for that reconcile. +- Recommendations are emitted as GPU **requests** (`gpu`) and consumed by normal policy evaluation. +- Upsize target is the hottest eligible pod total plus `spec.metrics.<metric>.upsize.headroomPercent` (default `20`), capped by `spec.metrics.<metric>.upsize.maxPercent` relative to the included containers' summed baseline. +- Scale-back target is the hottest eligible pod total plus `spec.metrics.<metric>.scaleBack.headroomPercent` (default `20`), floored to the included containers' summed baseline. +- The controller compares the compute and memory recommendations and keeps the higher desired total. +- A lower recommendation is accepted when the existing recommendation was driven by the same metric. +- A lower cross-metric recommendation is accepted only if the previously driving metric also emits in that reconcile and its candidate is at or below the accepted lower total. Equal lower totals keep the existing driving metric to avoid ownership churn. +- Existing recommendations that predate driving-metric metadata do not lower until a non-decreasing recommendation establishes an owner metric. +- After a workload total is chosen, it is redistributed back to the selected pod's containers in proportion to their current GPU allocations. +- If neither upsize nor scale-back produces an accepted recommendation and current allocation still differs from baseline, the controller reuses the previous recommendation when present; otherwise it emits nothing. +- Workloads matched by GPU rebalancing policies are reevaluated periodically using `GlobalConfiguration.spec.gpuRebalancingCheckInterval` (default `1m`). 
+ +## Namespaced Spec + +```yaml +apiVersion: rightsizing.kubex.ai/v1alpha1 +kind: GpuRebalancingPolicy +metadata: + name: gpu-rebalance + namespace: default +spec: + experimental: + gpuKaiContract: v1alpha1-2026-04 + scope: + labelSelector: + matchLabels: + app: my-gpu-app + minPodMetricsAge: 15m + metrics: + compute: + upsize: + thresholdPercent: 125 + metricsWindow: 10m + headroomPercent: 20 + maxPercent: 200 + scaleBack: + thresholdPercent: 60 + metricsWindow: 10m + headroomPercent: 20 + prometheus: + metric: kubex_gpu_container_compute_utilization_percent + namespaceLabel: namespace + podLabel: pod + containerLabel: container + memory: + upsize: + thresholdPercent: 125 + metricsWindow: 10m + headroomPercent: 20 + maxPercent: 200 + scaleBack: + thresholdPercent: 60 + metricsWindow: 10m + headroomPercent: 20 + prometheus: + metric: kubex_gpu_container_memory_utilization_percent + namespaceLabel: namespace + podLabel: pod + containerLabel: container + automationStrategyRef: + name: sample-automation-strategy +``` + +## Cluster-Wide Spec + +```yaml +apiVersion: rightsizing.kubex.ai/v1alpha1 +kind: ClusterWideGpuRebalancingPolicy +metadata: + name: gpu-rebalance-cluster +spec: + experimental: + gpuKaiContract: v1alpha1-2026-04 + scope: + namespaceSelector: + operator: In + values: ["*"] + labelSelector: + matchLabels: + app: my-gpu-app + minPodMetricsAge: 15m + metrics: + compute: + upsize: + thresholdPercent: 125 + metricsWindow: 10m + headroomPercent: 20 + maxPercent: 200 + scaleBack: + thresholdPercent: 60 + metricsWindow: 10m + headroomPercent: 20 + prometheus: + metric: kubex_gpu_container_compute_utilization_percent + namespaceLabel: namespace + podLabel: pod + containerLabel: container + memory: + upsize: + thresholdPercent: 125 + metricsWindow: 10m + headroomPercent: 20 + maxPercent: 200 + scaleBack: + thresholdPercent: 60 + metricsWindow: 10m + headroomPercent: 20 + prometheus: + metric: kubex_gpu_container_memory_utilization_percent + namespaceLabel: 
namespace + podLabel: pod + containerLabel: container + automationStrategyRef: + name: sample-clusterwide-automation-strategy +``` + +## Global Prometheus Settings + +Configure controller-wide Prometheus endpoint/timeouts via `GlobalConfiguration`: + +```yaml +apiVersion: rightsizing.kubex.ai/v1alpha1 +kind: GlobalConfiguration +metadata: + name: global-config +spec: + prometheus: + url: http://prometheus.monitoring.svc:9090 + requestTimeout: 30s +``` diff --git a/charts/kubex-automation-engine/templates/clusterautomationstrategy.yaml b/charts/kubex-automation-engine/templates/clusterautomationstrategy.yaml index 31b750b..2a63e6b 100644 --- a/charts/kubex-automation-engine/templates/clusterautomationstrategy.yaml +++ b/charts/kubex-automation-engine/templates/clusterautomationstrategy.yaml @@ -8,6 +8,10 @@ metadata: labels: {{- include "kubex-automation-engine.labels" $ | nindent 4 }} spec: + {{- if and $policyConfig.enablement $policyConfig.enablement.gpu }} + experimental: + gpuKaiContract: {{ $.Values.experimental.gpuKaiContract | quote }} + {{- end }} {{- if $policyConfig.enablement }} enablement: {{- if $policyConfig.enablement.cpu }} @@ -140,6 +144,40 @@ spec: {{- end }} {{- end }} {{- end }} + {{- if $policyConfig.enablement.gpu }} + {{- $gpuReq := default $policyConfig.enablement.gpu.request $policyConfig.enablement.gpu.requests }} + gpu: + {{- if $gpuReq }} + requests: + {{- if hasKey $gpuReq "downsize" }} + downsize: {{ $gpuReq.downsize }} + {{- end }} + {{- if hasKey $gpuReq "upsize" }} + upsize: {{ $gpuReq.upsize }} + {{- end }} + {{- if hasKey $gpuReq "setFromUnspecified" }} + setFromUnspecified: {{ $gpuReq.setFromUnspecified }} + {{- end }} + {{- if $gpuReq.floor }} + floor: {{ $gpuReq.floor | quote }} + {{- end }} + {{- if $gpuReq.ceiling }} + ceiling: {{ $gpuReq.ceiling | quote }} + {{- end }} + {{- if $gpuReq.containers }} + containers: + {{- range $containerName, $bounds := $gpuReq.containers }} + {{ $containerName }}: + {{- if $bounds.floor }} + 
floor: {{ $bounds.floor | quote }} + {{- end }} + {{- if $bounds.ceiling }} + ceiling: {{ $bounds.ceiling | quote }} + {{- end }} + {{- end }} + {{- end }} + {{- end }} + {{- end }} {{- end }} {{- if $policyConfig.inPlaceResize }} inPlaceResize: diff --git a/charts/kubex-automation-engine/templates/deployment.yaml b/charts/kubex-automation-engine/templates/deployment.yaml index 66ef3cf..7ffe4a6 100644 --- a/charts/kubex-automation-engine/templates/deployment.yaml +++ b/charts/kubex-automation-engine/templates/deployment.yaml @@ -116,6 +116,7 @@ spec: mountPath: {{ $localRecommendationsMountPath | quote }} readOnly: true {{- end }} + {{- if .Values.gateway.enabled }} - name: automation-gateway image: {{ .Values.gateway.image.repository }}:{{ required "gateway.image.tag must be set to an immutable gateway image tag" .Values.gateway.image.tag }} imagePullPolicy: {{ .Values.gateway.image.pullPolicy | default "IfNotPresent" }} @@ -159,6 +160,7 @@ spec: mountPath: /densify/config/api - name: densify-data mountPath: /densify/data + {{- end }} volumes: - name: cert secret: diff --git a/charts/kubex-automation-engine/templates/globalconfiguration.yaml b/charts/kubex-automation-engine/templates/globalconfiguration.yaml index 62011f1..68ac235 100644 --- a/charts/kubex-automation-engine/templates/globalconfiguration.yaml +++ b/charts/kubex-automation-engine/templates/globalconfiguration.yaml @@ -5,7 +5,7 @@ apiVersion: rightsizing.kubex.ai/v1alpha1 kind: GlobalConfiguration metadata: - name: global-config + name: {{ .Values.globalConfiguration.name }} labels: {{- include "kubex-automation-engine.labels" . 
| nindent 4 }} spec: @@ -14,6 +14,7 @@ spec: rescanInterval: {{ $rescanInterval }} mutationLogInterval: {{ .Values.globalConfiguration.mutationLogInterval }} snapshotInterval: {{ .Values.globalConfiguration.snapshotInterval }} + gpuRebalancingCheckInterval: {{ .Values.globalConfiguration.gpuRebalancingCheckInterval }} heartbeatInterval: {{ .Values.globalConfiguration.heartbeatInterval }} kubexAPIRequestTimeout: {{ $kubexAPIRequestTimeout }} @@ -31,6 +32,11 @@ spec: failureThreshold: {{ .Values.globalConfiguration.webhookHealth.failureThreshold }} successThreshold: {{ .Values.globalConfiguration.webhookHealth.successThreshold }} transitionCheckInterval: {{ .Values.globalConfiguration.webhookHealth.transitionCheckInterval | quote }} + kai: + schedulerName: {{ .Values.globalConfiguration.kai.schedulerName | quote }} + prometheus: + url: {{ .Values.globalConfiguration.prometheus.url | quote }} + requestTimeout: {{ .Values.globalConfiguration.prometheus.requestTimeout | quote }} webhookProbe: image: {{ .Values.globalConfiguration.webhookProbe.image | default (include "kubex-automation-engine.image" .) | quote }} {{- with .Values.globalConfiguration.webhookProbe.labels }} diff --git a/charts/kubex-automation-engine/templates/kai-queues.yaml b/charts/kubex-automation-engine/templates/kai-queues.yaml new file mode 100644 index 0000000..87a76ef --- /dev/null +++ b/charts/kubex-automation-engine/templates/kai-queues.yaml @@ -0,0 +1,47 @@ +{{- if .Values.kaiQueues.enabled -}} +{{- if not (.Capabilities.APIVersions.Has "scheduling.run.ai/v2/Queue") -}} +{{- fail (printf "\n\nERROR: kaiQueues.enabled=true requires the Run:ai Queue CRD (apiVersion: scheduling.run.ai/v2, kind: Queue).\nInstall the Run:ai scheduling CRDs first, or disable queue creation with --set kaiQueues.enabled=false.") -}} +{{- end -}} +apiVersion: scheduling.run.ai/v2 +kind: Queue +metadata: + name: kubex-parent-queue + labels: + {{- include "kubex-automation-engine.labels" . 
| nindent 4 }} +spec: + resources: + cpu: + quota: -1 + limit: -1 + overQuotaWeight: 1 + gpu: + quota: -1 + limit: -1 + overQuotaWeight: 1 + memory: + quota: -1 + limit: -1 + overQuotaWeight: 1 +--- +apiVersion: scheduling.run.ai/v2 +kind: Queue +metadata: + name: kubex-unlimited-gpu-queue + labels: + {{- include "kubex-automation-engine.labels" . | nindent 4 }} +spec: + parentQueue: kubex-parent-queue + resources: + cpu: + quota: -1 + limit: -1 + overQuotaWeight: 1 + gpu: + quota: -1 + limit: -1 + overQuotaWeight: 1 + memory: + quota: -1 + limit: -1 + overQuotaWeight: 1 +{{- end -}} diff --git a/charts/kubex-automation-engine/templates/policyevaluation.yaml b/charts/kubex-automation-engine/templates/policyevaluation.yaml index 6e4357b..748d5b7 100644 --- a/charts/kubex-automation-engine/templates/policyevaluation.yaml +++ b/charts/kubex-automation-engine/templates/policyevaluation.yaml @@ -8,6 +8,10 @@ metadata: {{- include "kubex-automation-engine.labels" . | nindent 4 }} spec: precedence: + - type: GpuRebalancingPolicy + priority: 120 + - type: ClusterWideGpuRebalancingPolicy + priority: 110 - type: StaticPolicy priority: 90 - type: ClusterStaticPolicy diff --git a/charts/kubex-automation-engine/templates/role.yaml b/charts/kubex-automation-engine/templates/role.yaml index 1a6ce8a..410776c 100644 --- a/charts/kubex-automation-engine/templates/role.yaml +++ b/charts/kubex-automation-engine/templates/role.yaml @@ -126,7 +126,10 @@ rules: resources: - clusterproactivepolicies - clusterstaticpolicies + - clusterwidegpurebalancingpolicies - globalconfigurations + - gpuconsolidationpolicies + - gpurebalancingpolicies - policyevaluations - proactivepolicies - staticpolicies @@ -143,7 +146,10 @@ rules: resources: - clusterproactivepolicies/finalizers - clusterstaticpolicies/finalizers + - clusterwidegpurebalancingpolicies/finalizers - globalconfigurations/finalizers + - gpuconsolidationpolicies/finalizers + - gpurebalancingpolicies/finalizers - 
policyevaluations/finalizers - proactivepolicies/finalizers - staticpolicies/finalizers @@ -154,7 +160,10 @@ rules: resources: - clusterproactivepolicies/status - clusterstaticpolicies/status + - clusterwidegpurebalancingpolicies/status - globalconfigurations/status + - gpuconsolidationpolicies/status + - gpurebalancingpolicies/status - policyevaluations/status - proactivepolicies/status - staticpolicies/status diff --git a/charts/kubex-automation-engine/templates/validatingwebhook.yaml b/charts/kubex-automation-engine/templates/validatingwebhook.yaml index a058b6a..90a8925 100644 --- a/charts/kubex-automation-engine/templates/validatingwebhook.yaml +++ b/charts/kubex-automation-engine/templates/validatingwebhook.yaml @@ -107,6 +107,29 @@ webhooks: resources: - clusterstaticpolicies sideEffects: None +- admissionReviewVersions: + - v1 + clientConfig: + {{- if not .Values.webhook.certManager.enabled }} + caBundle: {{ .caCert }} + {{- end }} + service: + name: {{ include "kubex-automation-engine.fullname" . }}-webhook-service + namespace: {{ include "kubex-automation-engine.namespace" . }} + path: /validate-rightsizing-kubex-ai-v1alpha1-clusterwidegpurebalancingpolicy + failurePolicy: {{ .Values.webhook.failurePolicy }} + name: vclusterwidegpurebalancingpolicy-v1alpha1.kb.io + rules: + - apiGroups: + - rightsizing.kubex.ai + apiVersions: + - v1alpha1 + operations: + - CREATE + - UPDATE + resources: + - clusterwidegpurebalancingpolicies + sideEffects: None - admissionReviewVersions: - v1 clientConfig: @@ -126,9 +149,56 @@ webhooks: - v1alpha1 operations: - CREATE + - UPDATE resources: - globalconfigurations sideEffects: None +- admissionReviewVersions: + - v1 + clientConfig: + {{- if not .Values.webhook.certManager.enabled }} + caBundle: {{ .caCert }} + {{- end }} + service: + name: {{ include "kubex-automation-engine.fullname" . }}-webhook-service + namespace: {{ include "kubex-automation-engine.namespace" . 
}} + path: /validate-rightsizing-kubex-ai-v1alpha1-gpuconsolidationpolicy + failurePolicy: {{ .Values.webhook.failurePolicy }} + name: vgpuconsolidationpolicy-v1alpha1.kb.io + rules: + - apiGroups: + - rightsizing.kubex.ai + apiVersions: + - v1alpha1 + operations: + - CREATE + - UPDATE + resources: + - gpuconsolidationpolicies + sideEffects: None +- admissionReviewVersions: + - v1 + clientConfig: + {{- if not .Values.webhook.certManager.enabled }} + caBundle: {{ .caCert }} + {{- end }} + service: + name: {{ include "kubex-automation-engine.fullname" . }}-webhook-service + namespace: {{ include "kubex-automation-engine.namespace" . }} + path: /validate-rightsizing-kubex-ai-v1alpha1-gpurebalancingpolicy + failurePolicy: {{ .Values.webhook.failurePolicy }} + name: vgpurebalancingpolicy-v1alpha1.kb.io + rules: + - apiGroups: + - rightsizing.kubex.ai + apiVersions: + - v1alpha1 + operations: + - CREATE + - UPDATE + resources: + - gpurebalancingpolicies + sideEffects: None - admissionReviewVersions: - v1 clientConfig: diff --git a/charts/kubex-automation-engine/values.yaml b/charts/kubex-automation-engine/values.yaml index 2838495..0555c3b 100644 --- a/charts/kubex-automation-engine/values.yaml +++ b/charts/kubex-automation-engine/values.yaml @@ -66,6 +66,12 @@ cleanup: drop: - "ALL" +# -- Image pull secrets for private registries +# Example: +# imagePullSecrets: +# - name: regcred +imagePullSecrets: [] + # -- Override the name of the chart nameOverride: "" @@ -151,6 +157,7 @@ localRecommendations: # Gateway sidecar configuration gateway: + enabled: true image: repository: "densify/automation-gateway" tag: "1.3" @@ -278,11 +285,18 @@ rbac: # CRD installation crds: {} +# Experimental feature acknowledgements for unstable APIs. +experimental: + # -- Required acknowledgement token for GPU/KAI experimental fields. 
+ gpuKaiContract: "v1alpha1-2026-04" + # Global configuration defaults # These will be used to create the default GlobalConfiguration CR globalConfiguration: # -- Enable creation of default GlobalConfiguration enabled: true + # -- Name of the GlobalConfiguration resource + name: global-config # -- Recommendation reload interval (how often to fetch from Kubex) # Can also use deployment.controllerEnv.recommendationsFetchInterval for backward compatibility recommendationReloadInterval: "1h" @@ -294,6 +308,8 @@ globalConfiguration: kubexAPIRequestTimeout: "30s" # -- Mutation log send interval mutationLogInterval: "5m" + # -- GPU rebalancing policy workload check interval + gpuRebalancingCheckInterval: "1m" # -- Policy snapshot upload interval snapshotInterval: "30m" # -- Controller heartbeat report interval @@ -331,6 +347,19 @@ globalConfiguration: podSecurityContext: {} # -- Container security context for the webhook probe container. securityContext: {} + kai: + # -- Scheduler name set on pods when Kai GPU mutation is applied by the webhook + schedulerName: "kai-scheduler" + prometheus: + # -- Prometheus base URL used by GPU rebalancing policies + url: "http://prometheus.monitoring.svc:9090" + # -- Timeout for individual Prometheus query requests + requestTimeout: "30s" + +# Optional Run:ai Queue resources for Kai scheduler integrations +kaiQueues: + # -- Create built-in Run:ai Queue resources (`kubex-parent-queue` and `kubex-unlimited-gpu-queue`) + enabled: false # ================================================================ # BACKWARD COMPATIBILITY: Legacy environment variable configuration @@ -464,6 +493,14 @@ policy: # setFromUnspecified: true # # floor: "32Mi" # Optional minimum memory limit # # ceiling: "32Gi" # Optional maximum memory limit + # gpu: + # # GPU is requests-only; limits are not supported. 
+ # request: + # downsize: true + # upsize: true + # setFromUnspecified: true + # # floor: "1" # Optional minimum GPU request + # # ceiling: "8" # Optional maximum GPU request # # # In-place resize configuration # inPlaceResize: diff --git a/charts/kubex-crds/templates/rightsizing.kubex.ai_automationstrategies.yaml b/charts/kubex-crds/templates/rightsizing.kubex.ai_automationstrategies.yaml index bf8ef6f..c00ecf4 100644 --- a/charts/kubex-crds/templates/rightsizing.kubex.ai_automationstrategies.yaml +++ b/charts/kubex-crds/templates/rightsizing.kubex.ai_automationstrategies.yaml @@ -147,6 +147,76 @@ spec: rule: '!has(self.floor) || !has(self.ceiling) || quantity(self.floor).compareTo(quantity(self.ceiling)) <= 0' type: object + gpu: + default: {} + description: |- + gpu controls enablement rules for GPU request resources. + EXPERIMENTAL: this field is subject to breaking changes or removal without notice. + properties: + overrideScheduler: + allOf: + - enum: + - none + - kai + - enum: + - none + - kai + default: none + description: |- + overrideScheduler selects the GPU mutation mode. + EXPERIMENTAL: this field is subject to breaking changes or removal without notice. + type: string + requests: + default: {} + description: |- + requests controls enablement for GPU requests. + EXPERIMENTAL: this field is subject to breaking changes or removal without notice. + properties: + ceiling: + description: ceiling is the maximum allowed resource quantity + for this usage. + type: string + containers: + additionalProperties: + properties: + ceiling: + description: ceiling is the maximum allowed resource + quantity for this usage. + type: string + floor: + description: floor is the minimum allowed resource + quantity for this usage. + type: string + type: object + description: |- + containers maps exact container names to floor/ceiling overrides for this usage. + Any bound omitted for a specific container falls back to the usage-level floor/ceiling. 
+ type: object + downsize: + default: true + description: downsize controls whether resource downsizing + actions are permitted. + type: boolean + floor: + description: floor is the minimum allowed resource quantity + for this usage. + type: string + setFromUnspecified: + default: true + description: setFromUnspecified controls whether an unspecified + resource value may be set. + type: boolean + upsize: + default: true + description: upsize controls whether resource upsizing + actions are permitted. + type: boolean + type: object + x-kubernetes-validations: + - message: floor must be less than or equal to ceiling + rule: '!has(self.floor) || !has(self.ceiling) || quantity(self.floor).compareTo(quantity(self.ceiling)) + <= 0' + type: object memory: default: {} description: memory controls enablement rules for memory resources. @@ -249,6 +319,22 @@ spec: <= 0' type: object type: object + experimental: + description: |- + experimental must be set when using GPU/KAI fields. + EXPERIMENTAL: GPU/KAI strategy behavior is subject to breaking changes or removal without notice. + properties: + gpuKaiContract: + description: |- + gpuKaiContract explicitly acknowledges that GPU/KAI fields are experimental and may change. + EXPERIMENTAL: this acknowledgement value is required for unstable GPU/KAI behavior. + enum: + - none + - v1alpha1-2026-04 + type: string + required: + - gpuKaiContract + type: object inPlaceResize: default: {} description: inPlaceResize configures in-place resize behavior. @@ -263,6 +349,28 @@ spec: description: enabled controls whether in-place resize is allowed. type: boolean type: object + kai: + default: {} + description: |- + kai configures Kai scheduler-specific admission behavior for GPU actions. + EXPERIMENTAL: this field is subject to breaking changes or removal without notice. 
+ properties: + queue: + default: kubex-unlimited-gpu-queue + description: |- + queue sets the Kai scheduler queue label value for GPU admission mutation when + the pod does not already define kai.scheduler/queue. + EXPERIMENTAL: this field is subject to breaking changes or removal without notice. + Keep this default aligned with DefaultKaiQueueName; kubebuilder markers cannot reference Go constants. + type: string + setQueueWhenSpecified: + default: false + description: |- + setQueueWhenSpecified controls whether Kai GPU admission mutation overwrites + an existing kai.scheduler/queue label when queue is specified. + EXPERIMENTAL: this field is subject to breaking changes or removal without notice. + type: boolean + type: object podEviction: default: {} description: podEviction configures eviction based resizing behavior. diff --git a/charts/kubex-crds/templates/rightsizing.kubex.ai_clusterautomationstrategies.yaml b/charts/kubex-crds/templates/rightsizing.kubex.ai_clusterautomationstrategies.yaml index 9522461..dbcd6c3 100644 --- a/charts/kubex-crds/templates/rightsizing.kubex.ai_clusterautomationstrategies.yaml +++ b/charts/kubex-crds/templates/rightsizing.kubex.ai_clusterautomationstrategies.yaml @@ -147,6 +147,76 @@ spec: rule: '!has(self.floor) || !has(self.ceiling) || quantity(self.floor).compareTo(quantity(self.ceiling)) <= 0' type: object + gpu: + default: {} + description: |- + gpu controls enablement rules for GPU request resources. + EXPERIMENTAL: this field is subject to breaking changes or removal without notice. + properties: + overrideScheduler: + allOf: + - enum: + - none + - kai + - enum: + - none + - kai + default: none + description: |- + overrideScheduler selects the GPU mutation mode. + EXPERIMENTAL: this field is subject to breaking changes or removal without notice. + type: string + requests: + default: {} + description: |- + requests controls enablement for GPU requests. 
+ EXPERIMENTAL: this field is subject to breaking changes or removal without notice. + properties: + ceiling: + description: ceiling is the maximum allowed resource quantity + for this usage. + type: string + containers: + additionalProperties: + properties: + ceiling: + description: ceiling is the maximum allowed resource + quantity for this usage. + type: string + floor: + description: floor is the minimum allowed resource + quantity for this usage. + type: string + type: object + description: |- + containers maps exact container names to floor/ceiling overrides for this usage. + Any bound omitted for a specific container falls back to the usage-level floor/ceiling. + type: object + downsize: + default: true + description: downsize controls whether resource downsizing + actions are permitted. + type: boolean + floor: + description: floor is the minimum allowed resource quantity + for this usage. + type: string + setFromUnspecified: + default: true + description: setFromUnspecified controls whether an unspecified + resource value may be set. + type: boolean + upsize: + default: true + description: upsize controls whether resource upsizing + actions are permitted. + type: boolean + type: object + x-kubernetes-validations: + - message: floor must be less than or equal to ceiling + rule: '!has(self.floor) || !has(self.ceiling) || quantity(self.floor).compareTo(quantity(self.ceiling)) + <= 0' + type: object memory: default: {} description: memory controls enablement rules for memory resources. @@ -249,6 +319,22 @@ spec: <= 0' type: object type: object + experimental: + description: |- + experimental must be set when using GPU/KAI fields. + EXPERIMENTAL: GPU/KAI strategy behavior is subject to breaking changes or removal without notice. + properties: + gpuKaiContract: + description: |- + gpuKaiContract explicitly acknowledges that GPU/KAI fields are experimental and may change. + EXPERIMENTAL: this acknowledgement value is required for unstable GPU/KAI behavior. 
+ enum: + - none + - v1alpha1-2026-04 + type: string + required: + - gpuKaiContract + type: object inPlaceResize: default: {} description: inPlaceResize configures in-place resize behavior. @@ -263,6 +349,28 @@ spec: description: enabled controls whether in-place resize is allowed. type: boolean type: object + kai: + default: {} + description: |- + kai configures Kai scheduler-specific admission behavior for GPU actions. + EXPERIMENTAL: this field is subject to breaking changes or removal without notice. + properties: + queue: + default: kubex-unlimited-gpu-queue + description: |- + queue sets the Kai scheduler queue label value for GPU admission mutation when + the pod does not already define kai.scheduler/queue. + EXPERIMENTAL: this field is subject to breaking changes or removal without notice. + Keep this default aligned with DefaultKaiQueueName; kubebuilder markers cannot reference Go constants. + type: string + setQueueWhenSpecified: + default: false + description: |- + setQueueWhenSpecified controls whether Kai GPU admission mutation overwrites + an existing kai.scheduler/queue label when queue is specified. + EXPERIMENTAL: this field is subject to breaking changes or removal without notice. + type: boolean + type: object podEviction: default: {} description: podEviction configures eviction based resizing behavior. 
diff --git a/charts/kubex-crds/templates/rightsizing.kubex.ai_clusterwidegpurebalancingpolicies.yaml b/charts/kubex-crds/templates/rightsizing.kubex.ai_clusterwidegpurebalancingpolicies.yaml new file mode 100644 index 0000000..aa2c813 --- /dev/null +++ b/charts/kubex-crds/templates/rightsizing.kubex.ai_clusterwidegpurebalancingpolicies.yaml @@ -0,0 +1,501 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.19.0 + name: clusterwidegpurebalancingpolicies.rightsizing.kubex.ai +spec: + group: rightsizing.kubex.ai + names: + kind: ClusterWideGpuRebalancingPolicy + listKind: ClusterWideGpuRebalancingPolicyList + plural: clusterwidegpurebalancingpolicies + singular: clusterwidegpurebalancingpolicy + scope: Cluster + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: ClusterWideGpuRebalancingPolicy is the Schema for the clusterwidegpurebalancingpolicies + API. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: spec defines the desired state of ClusterWideGpuRebalancingPolicy + properties: + automationStrategyRef: + description: automationStrategyRef references the ClusterAutomationStrategy + to use when applying this policy. 
+ properties: + name: + description: name is the name of the referenced strategy. + minLength: 1 + type: string + required: + - name + type: object + experimental: + description: |- + experimental must be set to acknowledge unstable GPU/KAI behavior. + EXPERIMENTAL: this API is subject to breaking changes or removal without notice. + properties: + gpuKaiContract: + description: |- + gpuKaiContract explicitly acknowledges that GPU/KAI fields are experimental and may change. + EXPERIMENTAL: this acknowledgement value is required for unstable GPU/KAI behavior. + enum: + - none + - v1alpha1-2026-04 + type: string + required: + - gpuKaiContract + type: object + metrics: + description: metrics configures compute and memory utilization signals + used for rebalancing. + properties: + compute: + description: compute configures the GPU compute utilization signal. + properties: + prometheus: + description: prometheus configures metric and label mapping + used for utilization checks. + properties: + containerLabel: + description: containerLabel is the metric label key containing + the container name. + minLength: 1 + pattern: ^[A-Za-z_][A-Za-z0-9_]*$ + type: string + metric: + description: metric is the Prometheus metric name used + to evaluate GPU utilization. + minLength: 1 + pattern: ^[A-Za-z_:][A-Za-z0-9_:]*$ + type: string + namespaceLabel: + description: namespaceLabel is the metric label key containing + the namespace name. + minLength: 1 + pattern: ^[A-Za-z_][A-Za-z0-9_]*$ + type: string + podLabel: + description: podLabel is the metric label key containing + the pod name. + minLength: 1 + pattern: ^[A-Za-z_][A-Za-z0-9_]*$ + type: string + required: + - containerLabel + - metric + - namespaceLabel + - podLabel + type: object + scaleBack: + default: + headroomPercent: 20 + metricsWindow: 10m + thresholdPercent: 80 + description: scaleBack configures GPU scale-back behavior + and thresholds for this metric. 
+ properties: + headroomPercent: + default: 20 + description: headroomPercent adds extra GPU headroom above + inferred usage in the scale-back target. + format: int32 + minimum: 0 + type: integer + metricsWindow: + default: 10m + description: metricsWindow is the Prometheus lookback + window for sustained threshold checks. + type: string + x-kubernetes-validations: + - message: must be at least 1 minute + rule: duration(self) >= duration('1m') + thresholdPercent: + default: 80 + description: thresholdPercent is the absolute utilization + percentage threshold that triggers scale-back. + format: int32 + minimum: 1 + type: integer + type: object + upsize: + default: + headroomPercent: 20 + maxPercent: 200 + metricsWindow: 10m + thresholdPercent: 120 + description: upsize configures GPU upsize behavior and thresholds + for this metric. + properties: + headroomPercent: + default: 20 + description: headroomPercent adds extra GPU headroom above + inferred usage in the upsize target. + format: int32 + minimum: 0 + type: integer + maxPercent: + default: 200 + description: maxPercent caps growth relative to the stored + baseline allocation. + format: int32 + minimum: 1 + type: integer + metricsWindow: + default: 10m + description: metricsWindow is the Prometheus lookback + window for sustained threshold checks. + type: string + x-kubernetes-validations: + - message: must be at least 1 minute + rule: duration(self) >= duration('1m') + thresholdPercent: + default: 120 + description: thresholdPercent is the utilization percentage + threshold that triggers resizing. + format: int32 + minimum: 1 + type: integer + type: object + required: + - prometheus + type: object + memory: + description: memory configures the GPU memory utilization signal. + properties: + prometheus: + description: prometheus configures metric and label mapping + used for utilization checks. + properties: + containerLabel: + description: containerLabel is the metric label key containing + the container name. 
+ minLength: 1 + pattern: ^[A-Za-z_][A-Za-z0-9_]*$ + type: string + metric: + description: metric is the Prometheus metric name used + to evaluate GPU utilization. + minLength: 1 + pattern: ^[A-Za-z_:][A-Za-z0-9_:]*$ + type: string + namespaceLabel: + description: namespaceLabel is the metric label key containing + the namespace name. + minLength: 1 + pattern: ^[A-Za-z_][A-Za-z0-9_]*$ + type: string + podLabel: + description: podLabel is the metric label key containing + the pod name. + minLength: 1 + pattern: ^[A-Za-z_][A-Za-z0-9_]*$ + type: string + required: + - containerLabel + - metric + - namespaceLabel + - podLabel + type: object + scaleBack: + default: + headroomPercent: 20 + metricsWindow: 10m + thresholdPercent: 80 + description: scaleBack configures GPU scale-back behavior + and thresholds for this metric. + properties: + headroomPercent: + default: 20 + description: headroomPercent adds extra GPU headroom above + inferred usage in the scale-back target. + format: int32 + minimum: 0 + type: integer + metricsWindow: + default: 10m + description: metricsWindow is the Prometheus lookback + window for sustained threshold checks. + type: string + x-kubernetes-validations: + - message: must be at least 1 minute + rule: duration(self) >= duration('1m') + thresholdPercent: + default: 80 + description: thresholdPercent is the absolute utilization + percentage threshold that triggers scale-back. + format: int32 + minimum: 1 + type: integer + type: object + upsize: + default: + headroomPercent: 20 + maxPercent: 200 + metricsWindow: 10m + thresholdPercent: 120 + description: upsize configures GPU upsize behavior and thresholds + for this metric. + properties: + headroomPercent: + default: 20 + description: headroomPercent adds extra GPU headroom above + inferred usage in the upsize target. + format: int32 + minimum: 0 + type: integer + maxPercent: + default: 200 + description: maxPercent caps growth relative to the stored + baseline allocation. 
+ format: int32 + minimum: 1 + type: integer + metricsWindow: + default: 10m + description: metricsWindow is the Prometheus lookback + window for sustained threshold checks. + type: string + x-kubernetes-validations: + - message: must be at least 1 minute + rule: duration(self) >= duration('1m') + thresholdPercent: + default: 120 + description: thresholdPercent is the utilization percentage + threshold that triggers resizing. + format: int32 + minimum: 1 + type: integer + type: object + required: + - prometheus + type: object + required: + - compute + - memory + type: object + minPodMetricsAge: + default: 15m + description: |- + minPodMetricsAge is the minimum pod age required before pod metrics are considered. + Pods younger than this duration are skipped for metric-based rebalancing checks. + type: string + x-kubernetes-validations: + - message: must be zero or at least 1 minute + rule: self == '' || duration(self) == duration('0s') || duration(self) + >= duration('1m') + scope: + description: scope narrows the workloads and namespaces this policy + applies to. + properties: + labelSelector: + description: labelSelector limits the workload objects (e.g., + Deployments, CronJobs) this policy applies to. + properties: + matchExpressions: + description: matchExpressions is a list of label selector + requirements. The requirements are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector + applies to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. 
This array is replaced during a strategic + merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + namespaceSelector: + description: namespaceSelector restricts the namespaces this policy + applies to. + properties: + operator: + description: operator determines how the listed values are + evaluated. + enum: + - In + - NotIn + type: string + values: + description: values contains the namespace name patterns to + match. + items: + type: string + minItems: 1 + type: array + required: + - operator + - values + type: object + workloadTypes: + default: + - Deployment + - StatefulSet + - CronJob + - Rollout + - Job + - AnalysisRun + - DaemonSet + description: workloadTypes limits the workload kinds this policy + applies to. When omitted, all supported workload types are targeted. + items: + description: WorkloadType enumerates the workload kinds a policy + can target. + enum: + - Deployment + - StatefulSet + - DaemonSet + - CronJob + - Rollout + - Job + - AnalysisRun + type: string + type: array + required: + - namespaceSelector + type: object + weight: + default: 0 + description: |- + weight determines which policy wins when multiple policies of the same kind match. + Higher weights take precedence. When weights are equal, older policies win. 
+ format: int32 + minimum: 0 + type: integer + required: + - automationStrategyRef + - experimental + - metrics + - scope + type: object + status: + description: status defines the observed state of ClusterWideGpuRebalancingPolicy + properties: + conditions: + description: |- + conditions represent the current state of the ClusterWideGpuRebalancingPolicy resource. + Each condition has a unique type and reflects the status of a specific aspect of the resource. + + Standard condition types include: + - "Available": the resource is fully functional + - "Progressing": the resource is being created or updated + - "Degraded": the resource failed to reach or maintain its desired state + + The status of each condition is one of True, False, or Unknown. + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition.
+ Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + type: object + required: + - spec + type: object + served: true + storage: true + subresources: + status: {} diff --git a/charts/kubex-crds/templates/rightsizing.kubex.ai_globalconfigurations.yaml b/charts/kubex-crds/templates/rightsizing.kubex.ai_globalconfigurations.yaml index a97357c..2c7b471 100644 --- a/charts/kubex-crds/templates/rightsizing.kubex.ai_globalconfigurations.yaml +++ b/charts/kubex-crds/templates/rightsizing.kubex.ai_globalconfigurations.yaml @@ -45,6 +45,15 @@ spec: description: automationEnabled controls whether automation is globally enabled. type: boolean + gpuRebalancingCheckInterval: + default: 1m + description: |- + gpuRebalancingCheckInterval controls how often GPU rebalancing policies reevaluate workloads. + EXPERIMENTAL: this field is subject to breaking changes or removal without notice. 
+ type: string + x-kubernetes-validations: + - message: must be at least 1 minute + rule: duration(self) >= duration('1m') heartbeatInterval: default: 5m description: heartbeatInterval controls how often controller heartbeat @@ -53,11 +62,27 @@ spec: x-kubernetes-validations: - message: must be at least 1 minute rule: duration(self) >= duration('1m') + kai: + default: {} + description: |- + kai configures Kai scheduler admission behavior. + EXPERIMENTAL: this field is subject to breaking changes or removal without notice. + properties: + schedulerName: + default: kai-scheduler + description: |- + schedulerName sets pod.spec.schedulerName when Kai GPU mutation is applied. + EXPERIMENTAL: this field is subject to breaking changes or removal without notice. + type: string + type: object kubexAPIRequestTimeout: default: 30s description: kubexAPIRequestTimeout controls timeout for outbound requests to the Kubex API. type: string + x-kubernetes-validations: + - message: must be greater than 0 seconds + rule: duration(self) > duration('0s') mutationLogInterval: default: 5m description: mutationLogInterval controls how often mutation logs @@ -66,6 +91,25 @@ spec: x-kubernetes-validations: - message: must be at least 1 minute rule: duration(self) >= duration('1m') + prometheus: + default: {} + description: |- + prometheus configures Prometheus query behavior for GPU rebalancing policies. + EXPERIMENTAL: this field is subject to breaking changes or removal without notice. + properties: + requestTimeout: + default: 30s + description: |- + requestTimeout controls timeout for each Prometheus query request. + EXPERIMENTAL: this field is subject to breaking changes or removal without notice. + type: string + url: + default: http://prometheus.monitoring.svc:9090 + description: |- + url sets the Prometheus base URL used for policy queries. + EXPERIMENTAL: this field is subject to breaking changes or removal without notice. 
+ type: string + type: object protectedNamespacePatterns: default: - kube-* diff --git a/charts/kubex-crds/templates/rightsizing.kubex.ai_gpuconsolidationpolicies.yaml b/charts/kubex-crds/templates/rightsizing.kubex.ai_gpuconsolidationpolicies.yaml new file mode 100644 index 0000000..d35fc96 --- /dev/null +++ b/charts/kubex-crds/templates/rightsizing.kubex.ai_gpuconsolidationpolicies.yaml @@ -0,0 +1,208 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.19.0 + name: gpuconsolidationpolicies.rightsizing.kubex.ai +spec: + group: rightsizing.kubex.ai + names: + kind: GpuConsolidationPolicy + listKind: GpuConsolidationPolicyList + plural: gpuconsolidationpolicies + singular: gpuconsolidationpolicy + scope: Cluster + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: GpuConsolidationPolicy is the Schema for the gpuconsolidationpolicies + API. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: spec defines the desired state of GpuConsolidationPolicy + properties: + experimental: + description: |- + experimental must be set to acknowledge unstable GPU/KAI behavior. + EXPERIMENTAL: this API is subject to breaking changes or removal without notice. 
+ properties: + gpuKaiContract: + description: |- + gpuKaiContract explicitly acknowledges that GPU/KAI fields are experimental and may change. + EXPERIMENTAL: this acknowledgement value is required for unstable GPU/KAI behavior. + enum: + - none + - v1alpha1-2026-04 + type: string + required: + - gpuKaiContract + type: object + nodeSelector: + description: |- + nodeSelector selects the single compatibility pool this policy may evaluate. + Selector semantics match Kubernetes/Cilium label selector behavior. + Only selected nodes are considered compatible for consolidation, so use multiple + policies to represent multiple node pools. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. + The requirements are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector applies + to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. 
+ type: object + type: object + x-kubernetes-map-type: atomic + requeueAfter: + default: 1m + description: requeueAfter is the periodic reevaluation interval for + consolidation checks. + type: string + x-kubernetes-validations: + - message: must be at least 1 second + rule: duration(self) >= duration('1s') + utilizationThresholdPercent: + default: 75 + description: utilizationThresholdPercent marks nodes below this GPU + utilization percentage as consolidation candidates. + format: int32 + maximum: 100 + minimum: 1 + type: integer + required: + - experimental + - nodeSelector + type: object + status: + description: status defines the observed state of GpuConsolidationPolicy + properties: + conditions: + description: |- + conditions represent the current state of the GpuConsolidationPolicy resource. + Each condition has a unique type and reflects the status of a specific aspect of the resource. + + Standard condition types include: + - "Available": the resource is fully functional + - "Progressing": the resource is being created or updated + - "Degraded": the resource failed to reach or maintain its desired state + + The status of each condition is one of True, False, or Unknown. + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon.
+ For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + type: object + required: + - spec + type: object + served: true + storage: true + subresources: + status: {} diff --git a/charts/kubex-crds/templates/rightsizing.kubex.ai_gpurebalancingpolicies.yaml b/charts/kubex-crds/templates/rightsizing.kubex.ai_gpurebalancingpolicies.yaml new file mode 100644 index 0000000..6c7d828 --- /dev/null +++ b/charts/kubex-crds/templates/rightsizing.kubex.ai_gpurebalancingpolicies.yaml @@ -0,0 +1,476 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.19.0 + name: gpurebalancingpolicies.rightsizing.kubex.ai +spec: + group: rightsizing.kubex.ai + names: + kind: GpuRebalancingPolicy + listKind: 
GpuRebalancingPolicyList + plural: gpurebalancingpolicies + singular: gpurebalancingpolicy + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: GpuRebalancingPolicy is the Schema for the gpurebalancingpolicies + API. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: spec defines the desired state of GpuRebalancingPolicy + properties: + automationStrategyRef: + description: automationStrategyRef references the AutomationStrategy + to use when applying this policy. + properties: + name: + description: name is the name of the referenced strategy. + minLength: 1 + type: string + required: + - name + type: object + experimental: + description: |- + experimental must be set to acknowledge unstable GPU/KAI behavior. + EXPERIMENTAL: this API is subject to breaking changes or removal without notice. + properties: + gpuKaiContract: + description: |- + gpuKaiContract explicitly acknowledges that GPU/KAI fields are experimental and may change. + EXPERIMENTAL: this acknowledgement value is required for unstable GPU/KAI behavior. + enum: + - none + - v1alpha1-2026-04 + type: string + required: + - gpuKaiContract + type: object + metrics: + description: metrics configures compute and memory utilization signals + used for rebalancing. 
+ properties: + compute: + description: compute configures the GPU compute utilization signal. + properties: + prometheus: + description: prometheus configures metric and label mapping + used for utilization checks. + properties: + containerLabel: + description: containerLabel is the metric label key containing + the container name. + minLength: 1 + pattern: ^[A-Za-z_][A-Za-z0-9_]*$ + type: string + metric: + description: metric is the Prometheus metric name used + to evaluate GPU utilization. + minLength: 1 + pattern: ^[A-Za-z_:][A-Za-z0-9_:]*$ + type: string + namespaceLabel: + description: namespaceLabel is the metric label key containing + the namespace name. + minLength: 1 + pattern: ^[A-Za-z_][A-Za-z0-9_]*$ + type: string + podLabel: + description: podLabel is the metric label key containing + the pod name. + minLength: 1 + pattern: ^[A-Za-z_][A-Za-z0-9_]*$ + type: string + required: + - containerLabel + - metric + - namespaceLabel + - podLabel + type: object + scaleBack: + default: + headroomPercent: 20 + metricsWindow: 10m + thresholdPercent: 80 + description: scaleBack configures GPU scale-back behavior + and thresholds for this metric. + properties: + headroomPercent: + default: 20 + description: headroomPercent adds extra GPU headroom above + inferred usage in the scale-back target. + format: int32 + minimum: 0 + type: integer + metricsWindow: + default: 10m + description: metricsWindow is the Prometheus lookback + window for sustained threshold checks. + type: string + x-kubernetes-validations: + - message: must be at least 1 minute + rule: duration(self) >= duration('1m') + thresholdPercent: + default: 80 + description: thresholdPercent is the absolute utilization + percentage threshold that triggers scale-back. 
+ format: int32 + minimum: 1 + type: integer + type: object + upsize: + default: + headroomPercent: 20 + maxPercent: 200 + metricsWindow: 10m + thresholdPercent: 120 + description: upsize configures GPU upsize behavior and thresholds + for this metric. + properties: + headroomPercent: + default: 20 + description: headroomPercent adds extra GPU headroom above + inferred usage in the upsize target. + format: int32 + minimum: 0 + type: integer + maxPercent: + default: 200 + description: maxPercent caps growth relative to the stored + baseline allocation. + format: int32 + minimum: 1 + type: integer + metricsWindow: + default: 10m + description: metricsWindow is the Prometheus lookback + window for sustained threshold checks. + type: string + x-kubernetes-validations: + - message: must be at least 1 minute + rule: duration(self) >= duration('1m') + thresholdPercent: + default: 120 + description: thresholdPercent is the utilization percentage + threshold that triggers resizing. + format: int32 + minimum: 1 + type: integer + type: object + required: + - prometheus + type: object + memory: + description: memory configures the GPU memory utilization signal. + properties: + prometheus: + description: prometheus configures metric and label mapping + used for utilization checks. + properties: + containerLabel: + description: containerLabel is the metric label key containing + the container name. + minLength: 1 + pattern: ^[A-Za-z_][A-Za-z0-9_]*$ + type: string + metric: + description: metric is the Prometheus metric name used + to evaluate GPU utilization. + minLength: 1 + pattern: ^[A-Za-z_:][A-Za-z0-9_:]*$ + type: string + namespaceLabel: + description: namespaceLabel is the metric label key containing + the namespace name. + minLength: 1 + pattern: ^[A-Za-z_][A-Za-z0-9_]*$ + type: string + podLabel: + description: podLabel is the metric label key containing + the pod name. 
+ minLength: 1 + pattern: ^[A-Za-z_][A-Za-z0-9_]*$ + type: string + required: + - containerLabel + - metric + - namespaceLabel + - podLabel + type: object + scaleBack: + default: + headroomPercent: 20 + metricsWindow: 10m + thresholdPercent: 80 + description: scaleBack configures GPU scale-back behavior + and thresholds for this metric. + properties: + headroomPercent: + default: 20 + description: headroomPercent adds extra GPU headroom above + inferred usage in the scale-back target. + format: int32 + minimum: 0 + type: integer + metricsWindow: + default: 10m + description: metricsWindow is the Prometheus lookback + window for sustained threshold checks. + type: string + x-kubernetes-validations: + - message: must be at least 1 minute + rule: duration(self) >= duration('1m') + thresholdPercent: + default: 80 + description: thresholdPercent is the absolute utilization + percentage threshold that triggers scale-back. + format: int32 + minimum: 1 + type: integer + type: object + upsize: + default: + headroomPercent: 20 + maxPercent: 200 + metricsWindow: 10m + thresholdPercent: 120 + description: upsize configures GPU upsize behavior and thresholds + for this metric. + properties: + headroomPercent: + default: 20 + description: headroomPercent adds extra GPU headroom above + inferred usage in the upsize target. + format: int32 + minimum: 0 + type: integer + maxPercent: + default: 200 + description: maxPercent caps growth relative to the stored + baseline allocation. + format: int32 + minimum: 1 + type: integer + metricsWindow: + default: 10m + description: metricsWindow is the Prometheus lookback + window for sustained threshold checks. + type: string + x-kubernetes-validations: + - message: must be at least 1 minute + rule: duration(self) >= duration('1m') + thresholdPercent: + default: 120 + description: thresholdPercent is the utilization percentage + threshold that triggers resizing. 
+ format: int32 + minimum: 1 + type: integer + type: object + required: + - prometheus + type: object + required: + - compute + - memory + type: object + minPodMetricsAge: + default: 15m + description: |- + minPodMetricsAge is the minimum pod age required before pod metrics are considered. + Pods younger than this duration are skipped for metric-based rebalancing checks. + type: string + x-kubernetes-validations: + - message: must be zero or at least 1 minute + rule: self == '' || duration(self) == duration('0s') || duration(self) + >= duration('1m') + scope: + description: scope narrows the workloads and namespaces this policy + applies to. + properties: + labelSelector: + description: labelSelector limits the workload objects (e.g., + Deployments, CronJobs) this policy applies to. + properties: + matchExpressions: + description: matchExpressions is a list of label selector + requirements. The requirements are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector + applies to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. 
A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + workloadTypes: + default: + - Deployment + - StatefulSet + - CronJob + - Rollout + - Job + - AnalysisRun + - DaemonSet + description: workloadTypes limits the workload kinds this policy + applies to. When omitted, all supported workload types are targeted. + items: + description: WorkloadType enumerates the workload kinds a policy + can target. + enum: + - Deployment + - StatefulSet + - DaemonSet + - CronJob + - Rollout + - Job + - AnalysisRun + type: string + type: array + type: object + weight: + default: 0 + description: |- + weight determines which policy wins when multiple policies of the same kind match. + Higher weights take precedence. When weights are equal, older policies win. + format: int32 + minimum: 0 + type: integer + required: + - automationStrategyRef + - experimental + - metrics + type: object + status: + description: status defines the observed state of GpuRebalancingPolicy + properties: + conditions: + description: |- + conditions represent the current state of the GpuRebalancingPolicy resource. + Each condition has a unique type and reflects the status of a specific aspect of the resource. + + Standard condition types include: + - "Available": the resource is fully functional + - "Progressing": the resource is being created or updated + - "Degraded": the resource failed to reach or maintain its desired state + + The status of each condition is one of True, False, or Unknown. + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another.
+ This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + type: object + required: + - spec + type: object + served: true + storage: true + subresources: + status: {}